# Assignment 2

## Naive Bayes

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('students.csv')

In [3]:
df

Unnamed: 0,Name,GPA,Effort,Hirable
0,Sarah,poor,lots,Yes
1,Dana,average,some,No
2,Alex,average,some,No
3,Annie,average,lots,Yes
4,Emily,excellent,lots,Yes
5,Pete,excellent,lots,No
6,John,excellent,lots,No
7,Kathy,poor,some,No


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
df['Hirable'] = LabelEncoder().fit_transform(df['Hirable'])

In [6]:
features = ['GPA', 'Effort']
encoders = dict.fromkeys(features)

for key in encoders.keys():
    encoders[key] = LabelEncoder()
    df[key] = encoders[key].fit_transform(df[key])

In [7]:
df

Unnamed: 0,Name,GPA,Effort,Hirable
0,Sarah,2,0,1
1,Dana,0,1,0
2,Alex,0,1,0
3,Annie,0,0,1
4,Emily,1,0,1
5,Pete,1,0,0
6,John,1,0,0
7,Kathy,2,1,0


In [8]:
from sklearn.naive_bayes import MultinomialNB

In [9]:
mnb = MultinomialNB()
mnb.fit(df[features], df['Hirable'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
X_raw = {'GPA': 'poor', 'Effort': 'lots'}
X = {}
for key, encoder in encoders.items():
    X[key] = encoder.transform([ X_raw[key] ])[0]

X

{'GPA': 2, 'Effort': 0}

In [11]:
mnb.predict([list(X.values())])[0]

1

## Association Rule Learning

In [2]:
from apyori import apriori

In [1]:
records = [
    [1,2,3,4,5], [1,2,5], [1,2,3], [1,2,4], [2,3,4], [2,4,5], [1,3], [3,4]
]

In [3]:
min_support = 4/len(records)
min_support

0.5

In [4]:
rules = apriori(records, min_support=min_support)
results = list(rules)  

In [5]:
results[0]

RelationRecord(items=frozenset({1}), support=0.625, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({1}), confidence=0.625, lift=1.0)])

In [6]:
for item in results:

    print("Rule: " + str(item[2][0][0]) + " -> " + str(item[2][0][1]))

    print("Support: " + str(item[1]))

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule: frozenset() -> frozenset({1})
Support: 0.625
Confidence: 0.625
Lift: 1.0
Rule: frozenset() -> frozenset({2})
Support: 0.75
Confidence: 0.75
Lift: 1.0
Rule: frozenset() -> frozenset({3})
Support: 0.625
Confidence: 0.625
Lift: 1.0
Rule: frozenset() -> frozenset({4})
Support: 0.625
Confidence: 0.625
Lift: 1.0
Rule: frozenset() -> frozenset({1, 2})
Support: 0.5
Confidence: 0.5
Lift: 1.0
Rule: frozenset() -> frozenset({2, 4})
Support: 0.5
Confidence: 0.5
Lift: 1.0


## K-Means Clustering

In [2]:
from sklearn.cluster import KMeans
import numpy as np

In [3]:
data = [(2,10), (2,5), (8,4), (5,8), (7,5), (6,4), (1,2), (4,9)]

In [4]:
num_clusters = 3
seeds = np.array([[2,10], [5,8], [1,2]])

In [5]:
model = KMeans(n_clusters=num_clusters, init=seeds, max_iter=1)
model.fit(data)

KMeans(algorithm='auto', copy_x=True,
       init=array([[ 2, 10],
       [ 5,  8],
       [ 1,  2]]), max_iter=1,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [6]:
model.cluster_centers_

array([[ 2. , 10. ],
       [ 6. ,  6. ],
       [ 1.5,  3.5]])

In [7]:
model.labels_

array([0, 2, 1, 1, 1, 1, 2, 0])

In [8]:
df = pd.DataFrame({'coord': data})
df.index += 1
df.rename('A{}'.format, inplace=True)

In [9]:
for i in range(num_clusters):
    df['dist' + str(i+1)] = df['coord'].apply(lambda x: np.linalg.norm(x-seeds[i]))

In [10]:
df['y'] = df.iloc[:, 1:].idxmin(axis=1).str[-1].astype(int)

In [11]:
df

Unnamed: 0,coord,dist1,dist2,dist3,y
A1,"(2, 10)",0.0,3.605551,8.062258,1
A2,"(2, 5)",5.0,4.242641,3.162278,3
A3,"(8, 4)",8.485281,5.0,7.28011,2
A4,"(5, 8)",3.605551,0.0,7.211103,2
A5,"(7, 5)",7.071068,3.605551,6.708204,2
A6,"(6, 4)",7.211103,4.123106,5.385165,2
A7,"(1, 2)",8.062258,7.211103,0.0,3
A8,"(4, 9)",2.236068,1.414214,7.615773,2


In [12]:
np.mean(df[df['y'] == 1]['coord'].tolist(), axis=0)

array([ 2., 10.])

In [13]:
np.mean(df[df['y'] == 2]['coord'].tolist(), axis=0)

array([6., 6.])

In [14]:
np.mean(df[df['y'] == 3]['coord'].tolist(), axis=0)

array([1.5, 3.5])

In [15]:
print(df.round(4).to_markdown())

|    | coord   |   dist1 |   dist2 |   dist3 |   y |
|:---|:--------|--------:|--------:|--------:|----:|
| A1 | (2, 10) |  0      |  3.6056 |  8.0623 |   1 |
| A2 | (2, 5)  |  5      |  4.2426 |  3.1623 |   3 |
| A3 | (8, 4)  |  8.4853 |  5      |  7.2801 |   2 |
| A4 | (5, 8)  |  3.6056 |  0      |  7.2111 |   2 |
| A5 | (7, 5)  |  7.0711 |  3.6056 |  6.7082 |   2 |
| A6 | (6, 4)  |  7.2111 |  4.1231 |  5.3852 |   2 |
| A7 | (1, 2)  |  8.0623 |  7.2111 |  0      |   3 |
| A8 | (4, 9)  |  2.2361 |  1.4142 |  7.6158 |   2 |
