In [17]:
import os
import numpy as np
import pandas as pd

## Naive Bayes

In [38]:
def read_csv_data(filename):
    """Load a header-less CSV file and give its columns canonical names.

    The file is read with ``header=None``. Every column except the last is
    renamed to ``A<label>`` (``<label>`` being the column label pandas
    assigned on read) and the last column is renamed to ``C``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        pandas.DataFrame whose columns are the ``A...`` attribute columns
        followed by the class column ``C``.
    """
    df = pd.read_csv(filename, header=None)
    predictive_attributes = [f'A{i}' for i in df.columns[:-1]]
    df.columns = predictive_attributes + ['C']
    return df
    

In [107]:
# Load the Naive Bayes train/test files; read_csv_data renames the columns of
# each frame to A<i> for the attributes and C for the last (class) column.
train_df, test_df = read_csv_data('ex2_train.csv'), read_csv_data('ex2_test.csv')

In [108]:
train_df

Unnamed: 0,A0,A1,A2,C
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
...,...,...,...,...
79,2,2,2,2
80,2,2,2,2
81,2,2,2,2
82,2,2,2,2


In [109]:
test_df

Unnamed: 0,A0,A1,A2,C
0,1,1,1,1
1,1,1,2,2
2,1,2,1,3
3,1,2,2,1
4,2,1,1,2
5,2,1,2,3
6,2,2,1,1
7,2,2,2,2


In [110]:
# Settings read by the Naive Bayes classification loop.
# delta: amount added to each conditional count (and q*delta to its
#        denominator) when smoothing is switched on.
delta = 1
# C_flag: switches that additive (Laplace-style) smoothing on or off.
C_flag = False
# Total number of training rows; denominator of the class prior P(C=v).
num_train_examples = len(train_df.index)

# Distinct class values in the training data, in sorted order.
labels = sorted(train_df.C.unique().tolist())
print(delta, C_flag, labels)

1 False [1, 2, 3]


In [113]:
# Naive Bayes: score every class for each test row, print each probability
# that goes into the score, and report whether the arg-max matches the row's
# true label. Reads train_df, test_df, labels, delta, C_flag and
# num_train_examples from earlier cells.

# Everything below depends only on the training data, so it is computed once
# rather than once per test row.
class_masks = {label: train_df['C'] == label for label in labels}
class_counts = {label: len(train_df[mask].index) for label, mask in class_masks.items()}
attribute_columns = [f'A{i}' for i in range(len(test_df.columns) - 1)]
#  q=#(Dom(Ai)), the number of different values of Ai
domain_sizes = {column: len(train_df[column].unique()) for column in attribute_columns}

for _, row in test_df.iterrows():
    prob_a_label_list = []
    pred_attribute_values = row.values[:-1]
    for label in labels:
        denominator = class_counts[label]
        print(f"P(C={label}) = [{denominator} / {num_train_examples}]")
        prob_c = denominator/num_train_examples
        prob_a_list = []
        # Attribute columns are walked by name so the row index from
        # iterrows() is never overwritten by an inner loop variable.
        for column, attr_value in zip(attribute_columns, pred_attribute_values):
            num = len(train_df[(train_df[column] == attr_value) & class_masks[label]].index)
            den = denominator
            if C_flag:
                # Additive smoothing: every value of the attribute gets
                # `delta` pseudo-counts.
                num += delta
                den += domain_sizes[column]*delta
            prob_a = float(num/den)
            print(f"P({column} | C={label}) = {num} / {den}")

            prob_a_list.append(prob_a)
        prob_a_label_list.append(np.prod(prob_a_list)*prob_c)

    # Scores are stored in `labels` order, so pair them with `labels` itself;
    # re-sorting here would mislabel the printout if `labels` were unsorted.
    for label, score in zip(labels, prob_a_label_list):
        print(f"NB(C={label}) = {score:06f}")

    predicted_test_label = labels[np.argmax(prob_a_label_list)]
    true_test_label = row.values[-1]

    if predicted_test_label == true_test_label:
        print(f'match: "{predicted_test_label}"')
    else:
        print(f'fail: got "{predicted_test_label}" != want "{true_test_label}"')

P(C=1) = [20 / 84]
P(A0 | C=1) = 15 / 20
P(A1 | C=1) = 15 / 20
P(A2 | C=1) = 10 / 20
P(C=2) = [34 / 84]
P(A0 | C=2) = 8 / 34
P(A1 | C=2) = 20 / 34
P(A2 | C=2) = 16 / 34
P(C=3) = [30 / 84]
P(A0 | C=3) = 15 / 30
P(A1 | C=3) = 20 / 30
P(A2 | C=3) = 10 / 30
NB(C=1) = 0.066964
NB(C=2) = 0.026363
NB(C=3) = 0.039683
match: "1"
P(C=1) = [20 / 84]
P(A0 | C=1) = 15 / 20
P(A1 | C=1) = 15 / 20
P(A2 | C=1) = 10 / 20
P(C=2) = [34 / 84]
P(A0 | C=2) = 8 / 34
P(A1 | C=2) = 20 / 34
P(A2 | C=2) = 18 / 34
P(C=3) = [30 / 84]
P(A0 | C=3) = 15 / 30
P(A1 | C=3) = 20 / 30
P(A2 | C=3) = 20 / 30
NB(C=1) = 0.066964
NB(C=2) = 0.029659
NB(C=3) = 0.079365
fail: got "3" != want "2"
P(C=1) = [20 / 84]
P(A0 | C=1) = 15 / 20
P(A1 | C=1) = 5 / 20
P(A2 | C=1) = 10 / 20
P(C=2) = [34 / 84]
P(A0 | C=2) = 8 / 34
P(A1 | C=2) = 14 / 34
P(A2 | C=2) = 16 / 34
P(C=3) = [30 / 84]
P(A0 | C=3) = 15 / 30
P(A1 | C=3) = 10 / 30
P(A2 | C=3) = 10 / 30
NB(C=1) = 0.022321
NB(C=2) = 0.018454
NB(C=3) = 0.019841
fail: got "1" != want "3"
P(C=1

## KNN

In [199]:
def read_data(filename):
    """Load a header-less ``.txt`` or ``.csv`` data file with canonical column names.

    ``.txt`` files are read as a single text column and then split on commas,
    so every value in the resulting frame is a string. ``.csv`` files are
    parsed directly by pandas. In both cases every column except the last is
    renamed to ``A<label>`` and the last column is renamed to ``C``.

    Args:
        filename: Path of the file to read; the text after the last ``.``
            selects the parser.

    Returns:
        pandas.DataFrame whose columns are the ``A...`` attribute columns
        followed by ``C``.

    Raises:
        ValueError: If the extension is neither ``txt`` nor ``csv``
            (previously this surfaced as an UnboundLocalError on ``df``).
    """
    extension = filename.rsplit('.', 1)[-1]
    if extension == 'txt':
        df = pd.read_csv(filename, sep=" ", header=None)[0].str.split(",", expand=True)
    elif extension == 'csv':
        df = pd.read_csv(filename, header=None)
    else:
        raise ValueError(
            f"unsupported file extension {extension!r} for {filename!r}; expected 'txt' or 'csv'"
        )
    predictive_attributes = [f'A{i}' for i in df.columns[:-1]]
    df.columns = predictive_attributes + ['C']
    return df
    
# Load the KNN train/test files. This rebinds train_df/test_df, replacing the
# Naive Bayes frames loaded earlier in the notebook.
train_df, test_df = read_data('knn2.train.txt'), read_data('knn2.test.txt')   
# Number of nearest training points kept when classifying a test point.
kNN_k = 3

In [200]:
train_df

Unnamed: 0,A0,A1,A2,C
0,1,1,1,A
1,1,2,10,A
2,2,1,13,A
3,5,5,30,C
4,6,6,40,C
5,10,11,77,B
6,12,14,88,B
7,19,17,99,B


In [201]:
test_df

Unnamed: 0,A0,A1,A2,C
0,3,3,6,A
1,4,5,20,B
2,2,2,33,A
3,10,9,19,A
4,14,12,100,B
5,17,19,101,B


In [202]:
# When True, the KNN loop prints the wanted/predicted label for each test row.
verbose=True
# Distinct class values of the KNN training data, in sorted order; passed to
# decide_label as the candidate labels.
labels = sorted(train_df.C.unique().tolist())
print(labels)

['A', 'B', 'C']


In [203]:
def decide_label(top_k, labels):
    """Choose a label by inverse-distance weighted voting.

    Each neighbour votes for its label with weight ``1 / distance``. A
    neighbour at distance 0 (an exact match) gets infinite weight instead of
    triggering a division by zero, so exact matches always dominate the vote.

    Args:
        top_k: Iterable of ``(distance, label)`` pairs for the nearest
            neighbours.
        labels: Candidate labels. Every label occurring in ``top_k`` must be
            present here.

    Returns:
        The label with the largest total weight. Ties go to the label that
        appears first in ``labels`` (i.e. sorted order when the caller passes
        a sorted list).
    """
    decision = {label: 0.0 for label in labels}
    for neighbour in top_k:
        distance, label = neighbour[0], neighbour[1]
        weight = 1.0 / distance if distance != 0 else float('inf')
        decision[label] += weight
    # max() returns the first maximal key in dict (= `labels`) order.
    return max(decision, key=decision.get)

def euclidean_distance(pt1, pt2):
    """Return the Euclidean (L2) distance between two coordinate sequences.

    Both arguments are converted to numpy arrays, so they must have
    compatible shapes for element-wise subtraction.
    """
    delta = np.array(pt1) - np.array(pt2)
    squared_sum = np.sum(np.square(delta))
    return np.sqrt(squared_sum)

In [204]:
# KNN: classify every test row from its kNN_k nearest training points using
# inverse-distance weighted voting (decide_label). Reads train_df, test_df,
# labels, kNN_k and verbose from earlier cells.

# The training points and labels are the same for every test row, so convert
# them once up front. Positional access uses .iloc because integer keys passed
# to Series[...] on a label-indexed Series are deprecated as positions.
train_points = [tuple(int(p) for p in values) for values in train_df.iloc[:, :-1].values]
train_label_list = train_df.iloc[:, -1].tolist()

for _, row in test_df.iterrows():
    test_point = tuple(int(p) for p in row.iloc[:-1].values)
    true_test_label = row.iloc[-1]
    distance_list = [euclidean_distance(train_point, test_point) for train_point in train_points]
    # Sorting (distance, label) pairs puts the closest first; equal distances
    # fall back to label order.
    top_k = sorted(zip(distance_list, train_label_list))[:kNN_k]
    predicted_test_label = decide_label(top_k=top_k, labels=labels)
    if verbose:
        print(f"want={true_test_label} got={predicted_test_label}")
        
    
    

want=A got=A
want=B got=A
want=A got=C
want=A got=A
want=B got=B
want=B got=B


## KMeans

In [261]:
# Load the k-means input. This rebinds train_df, replacing the KNN training
# frame loaded earlier in the notebook.
train_df = read_data('km2.txt')

In [263]:
train_df.shape

(100, 4)

In [265]:
# create points dict
# Map each row's name (last column, 'C') to a tuple of its attribute values
# converted to int. Attribute columns keep their order in the frame: a plain
# string sort would put 'A10' before 'A2', and set('C') only equals {'C'}
# because the name is a single character.
pred_cols = [col for col in train_df.columns if col != 'C']
print(pred_cols)
points = {
    name: tuple(int(value) for value in coords)
    for name, coords in zip(
        train_df['C'].tolist(),
        train_df[pred_cols].itertuples(index=False, name=None),
    )
}
points

['A0', 'A1', 'A2']


{'A1': (81, 1887, 1847),
 'A2': (59, 81, 1318),
 'A3': (425, 540, 456),
 'A4': (1300, 694, 511),
 'A5': (162, 1089, 728),
 'A6': (1274, 1211, 1445),
 'A7': (1237, 1106, 495),
 'A8': (1466, 1528, 258),
 'A9': (47, 1947, 287),
 'A10': (888, 790, 1015),
 'A11': (1541, 408, 1387),
 'A12': (831, 1429, 1356),
 'A13': (1737, 631, 1485),
 'A14': (1026, 413, 1090),
 'A15': (1194, 563, 433),
 'A16': (147, 78, 324),
 'A17': (159, 1353, 1957),
 'A18': (1721, 1189, 199),
 'A19': (1000, 705, 888),
 'A20': (538, 1703, 1355),
 'A21': (451, 510, 605),
 'A22': (156, 266, 1828),
 'A23': (1561, 1202, 783),
 'A24': (1746, 1563, 376),
 'A25': (1002, 1718, 1447),
 'A26': (1094, 1577, 1463),
 'A27': (1996, 420, 623),
 'A28': (953, 1137, 1133),
 'A29': (1241, 59, 1033),
 'A30': (643, 1891, 2),
 'A31': (878, 1336, 546),
 'A32': (1107, 1940, 503),
 'A33': (552, 1843, 205),
 'A34': (1598, 1425, 1351),
 'A35': (1515, 1757, 1687),
 'A36': (10, 1410, 1285),
 'A37': (590, 1632, 1098),
 'A38': (553, 591, 582),
 'A39':

In [236]:
# Literal mapping of cluster name -> list of (point name, 2-tuple of numbers).
# NOTE(review): presumably a pasted cluster assignment from an earlier k-means
# run on 2-D data — confirm provenance.
d = {'C1': [('A1', (81, 1887)), ('A3', (81, 1318)), ('A5', (456, 1300)), ('A7', (162, 1089)), ('A8', (728, 1274)), ('A11', (495, 1466)), ('A13', (47, 1947)), ('A14', (287, 888)), ('A21', (413, 1090)), ('A25', (159, 1353)), ('A33', (266, 1828)), ('A35', (783, 1746)), ('A37', (1002, 1718)), ('A41', (623, 953)), ('A46', (878, 1336)), ('A47', (546, 1107)), ('A49', (552, 1843)), ('A50', (205, 1598))], 'C2': [('A4', (425, 540)), ('A6', (694, 511)), ('A23', (433, 147)), ('A24', (78, 324)), ('A27', (1189, 199)), ('A29', (888, 538)), ('A31', (451, 510)), ('A32', (605, 156)), ('A43', (1241, 59))], 'C3': [('A2', (1847, 59)), ('A9', (1211, 1445)), ('A10', (1237, 1106)), ('A12', (1528, 258)), ('A15', (790, 1015)), ('A16', (1541, 408)), ('A17', (1387, 831)), ('A18', (1429, 1356)), ('A19', (1737, 631)), ('A20', (1485, 1026)), ('A22', (1194, 563)), ('A26', (1957, 1721)), ('A28', (1000, 705)), ('A30', (1703, 1355)), ('A34', (1561, 1202)), ('A36', (1563, 376)), ('A38', (1447, 1094)), ('A39', (1577, 1463)), ('A40', (1996, 420)), ('A42', (1137, 1133)), ('A44', (1033, 643)), ('A45', (1891, 2)), ('A48', (1940, 503))]}

In [238]:
# Collapse each cluster to a one-element set holding its comma-joined point names.
{cluster: {','.join(member[0] for member in members)} for cluster, members in d.items()}

{'C1': {'A1,A3,A5,A7,A8,A11,A13,A14,A21,A25,A33,A35,A37,A41,A46,A47,A49,A50'},
 'C2': {'A4,A6,A23,A24,A27,A29,A31,A32,A43'},
 'C3': {'A2,A9,A10,A12,A15,A16,A17,A18,A19,A20,A22,A26,A28,A30,A34,A36,A38,A39,A40,A42,A44,A45,A48'}}

In [260]:
# K-means (Lloyd's iteration): assign every point to its nearest centroid,
# move each centroid to the mean of its members, and repeat until the
# centroids stop changing. Reads `points` (name -> coordinate tuple) and
# euclidean_distance from earlier cells.
centroids = [(0,500), (200,200), (1000,1000)]
# Number of clusters. It is no longer decremented inside the loop: the old
# decrement was never read and only corrupted this value.
kMeans_k = 3
# Safety cap so a non-converging run cannot spin forever.
MAX_ITERATIONS = 100

# Fail early with a clear message instead of a numpy broadcasting error when
# the points and the initial centroids do not share a dimensionality.
dimensions = {len(p) for p in points.values()} | {len(c) for c in centroids}
if len(dimensions) > 1:
    raise ValueError(
        f"points and centroids must all have the same number of coordinates, got sizes {sorted(dimensions)}"
    )

for _ in range(MAX_ITERATIONS):
    old_centroids = centroids
    new_clusters = {f'C{i+1}':[] for i in range(len(centroids))}
    for name, point in points.items():
        distances = [
            (euclidean_distance(point, centre), idx + 1)
            for idx, centre in enumerate(centroids)
        ]
        # min() on (distance, cluster number) breaks distance ties in favour
        # of the lower-numbered cluster.
        new_centre = min(distances)[1]
        # print(f"Point: {name}, min distance: {min(distances)[0]}, assigned cluster: {new_centre}")
        new_clusters[f'C{new_centre}'].append((name, point))

    centroids = []
    for old_centre, members in zip(old_centroids, new_clusters.values()):
        if not members:
            # An empty cluster has no mean; averaging it would give NaN, and
            # NaN never compares equal, so the loop would never terminate.
            # Keep the previous centroid instead.
            centroids.append(old_centre)
            continue
        coords = [member[1] for member in members]
        # Average each coordinate axis separately; works for any dimension.
        centroids.append(tuple(np.average(axis) for axis in zip(*coords)))

    new_centroids = centroids

    if old_centroids == new_centroids:
        break
else:
    print(f"k-means did not converge within {MAX_ITERATIONS} iterations")

for k,v in new_clusters.items():
    v_ = ','.join([j[0] for j in v])
    out = f"{k} = " + "{" + f"{v_}" + "}"
    print(out)
for c in new_centroids:
    c_ = " ".join([str(i) for i in c])
    # NOTE(review): the "([" ... ")]" bracket order looks mismatched; left
    # unchanged in case it is a required output format — confirm.
    out = "([" + c_ + ")]"
    print(out)

C1 = {A1,A3,A5,A7,A8,A11,A13,A14,A15,A21,A25,A33,A35,A37,A41,A46,A47,A49,A50}
C2 = {A4,A6,A23,A24,A27,A28,A29,A31,A32,A43,A44}
C3 = {A2,A9,A10,A12,A16,A17,A18,A19,A20,A22,A26,A30,A34,A36,A38,A39,A40,A42,A45,A48}
([450.2105263157895 1408.2105263157894)]
([730.6363636363636 393.8181818181818)]
([1568.4 847.6)]


In [251]:
def print_metrics(metrics):
    """Print precision and recall per label as unevaluated ``tp/total`` fractions.

    Args:
        metrics: Mapping of label -> dict holding the integer counts under the
            keys ``'tp'``, ``'fp'`` and ``'fn'``.
    """
    for label, counts in metrics.items():
        tp = counts['tp']
        predicted_positive = tp + counts['fp']  # precision denominator
        actual_positive = tp + counts['fn']     # recall denominator
        print(f"Label={label} Precision={tp}/{predicted_positive} Recall={tp}/{actual_positive}")

In [252]:
# Literal per-label confusion counts (tp/fp/tn/fn) passed to print_metrics.
# NOTE(review): these are typed in, not computed from the KNN predictions
# above — confirm they match the run they are meant to describe.
metrics = {'A': {'tp': 2, 'fp': 1, 'tn': 0, 'fn': 1}, 'B': {'tp': 2, 'fp': 1, 'tn': 0, 'fn': 1}}
print_metrics(metrics=metrics)

Label=A Precision=2/3 Recall=2/3
Label=B Precision=2/3 Recall=2/3
