In [66]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [30]:
#Load the train and test datasets to create two DataFrames
# Location of the training CSV (the test CSV URL is defined in a later cell)
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"

In [31]:
# Read the training CSV from train_url into a DataFrame
train = pd.read_csv(train_url)

In [32]:
# Preview the first rows of the training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
# Location of the test CSV
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

In [34]:
# Read the test CSV from test_url into a DataFrame
test = pd.read_csv(test_url)

In [35]:
# Display the test DataFrame (it has no Survived column)
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


In [36]:
#Missing values in train
# Count of NaN entries per column of the training frame
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [38]:
#Missing values in test
# Count of NaN entries per column of the test frame
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [39]:
# Fill missing values in both frames with the column means of the train set.
# numeric_only=True is required: the frames still hold text columns here
# (Name, Sex, Ticket, ...) and DataFrame.mean() raises TypeError on them in
# pandas >= 2.0. The means are computed once so that train and test are
# imputed with exactly the same statistics.
train_means = train.mean(numeric_only=True)
train = train.fillna(train_means)
test = test.fillna(train_means)

In [40]:
# Remove the same four columns from both frames
unused_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [41]:
# Inspect the training frame after imputation and column removal
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.000000,1,0,7.2500
1,2,1,1,female,38.000000,1,0,71.2833
2,3,1,3,female,26.000000,0,0,7.9250
3,4,1,1,female,35.000000,1,0,53.1000
4,5,0,3,male,35.000000,0,0,8.0500
5,6,0,3,male,29.699118,0,0,8.4583
6,7,0,1,male,54.000000,0,0,51.8625
7,8,0,3,male,2.000000,3,1,21.0750
8,9,1,3,female,27.000000,0,2,11.1333
9,10,1,2,female,14.000000,1,0,30.0708


In [43]:
# Encode the Sex column as integers.
# The encoder is fitted once, on the training data only, and the same mapping
# is then applied to the test data. (Fitting a second time on test would
# discard the mapping learned from train.)
labelEncoder = LabelEncoder()
train['Sex'] = labelEncoder.fit_transform(train['Sex'])
test['Sex'] = labelEncoder.transform(test['Sex'])

In [44]:
# Target labels: the Survived column as a NumPy array
y_train = np.array(train['Survived'])

In [45]:
# Remove the target column so that only feature columns remain in train
train=train.drop(['Survived'],axis=1)

In [46]:
# Feature matrix built from every remaining column of train.
# NOTE(review): PassengerId has not been dropped, so the row identifier is
# used as a clustering feature — confirm this is intended.
X_train = np.array(train)

In [60]:
# Standardise the features and overwrite X_train with the scaled array.
# NOTE(review): kmeans() below applies its own StandardScaler to its input as well.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

In [61]:
def kmeans(X, n_clusters, random_state=None):
    """Standardise ``X`` and fit a K-means model on the scaled data.

    Parameters
    ----------
    X : array-like
        Feature matrix; it is passed through a fresh ``StandardScaler``
        before clustering.
    n_clusters : int
        Number of clusters to fit.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so that a run can be made reproducible.
        The default (``None``) keeps the previous, unseeded behaviour.

    Returns
    -------
    KMeans
        The fitted estimator. It was trained on the standardised copy of
        ``X``, so data passed to its ``predict`` should be on that scale.
    """
    # Local import: the notebook only imports KMeans in a later cell, so a
    # fresh "Restart & Run All" would otherwise fail here with NameError.
    from sklearn.cluster import KMeans

    X_scaled = StandardScaler().fit_transform(X)
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    km.fit(X_scaled)
    return km
#The best number of clusters is 3
# NOTE(review): the comment above says 3, yet the call below requests
# 2 clusters — confirm which is intended.
km = kmeans(X_train,2)

In [62]:
#How many are predicted correctly based on y
def correction(X, y, kmeans):
    """Print the fraction of rows whose predicted cluster id equals the label.

    Parameters
    ----------
    X : array-like, 2-D
        Samples to score, one row per sample.
    y : array-like
        Expected label for each row of ``X``.
    kmeans : object
        Any object exposing ``predict(X)`` that returns one id per row.

    Raises
    ------
    ValueError
        If ``X`` is empty or ``X`` and ``y`` differ in length.

    Notes
    -----
    NOTE(review): predicted ids are compared to ``y`` verbatim. If ``kmeans``
    is an unsupervised clusterer, its cluster numbering need not line up with
    the label values — confirm that the printed share is meaningful.
    """
    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("correction() needs at least one sample")
    labels = np.asarray(y)
    if len(labels) != n_samples:
        raise ValueError("X and y must contain the same number of samples")

    # One vectorised predict call instead of one call per row.
    predictions = np.asarray(kmeans.predict(np.asarray(X, dtype=float)))
    correct = int(np.count_nonzero(predictions == labels))
    print(correct/n_samples)
# Share of training rows whose predicted cluster id equals the Survived label
correction(X_train, y_train, km)

0.6734006734006734


In [63]:
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering

In [64]:
# Clustering models to compare, in the order used for the results table
algorithms = [
    KMeans(n_clusters=2, random_state=1),
    AffinityPropagation(),
    SpectralClustering(n_clusters=2, random_state=1,
                       affinity='nearest_neighbors'),
    AgglomerativeClustering(n_clusters=2),
]

In [67]:
def _clustering_scores(labels):
    """Score one set of cluster labels against y_train and X_train."""
    return {
        'ARI': metrics.adjusted_rand_score(y_train, labels),
        'AMI': metrics.adjusted_mutual_info_score(y_train, labels),
        'Homogenity': metrics.homogeneity_score(y_train, labels),
        'Completeness': metrics.completeness_score(y_train, labels),
        'V-measure': metrics.v_measure_score(y_train, labels),
        'Silhouette': metrics.silhouette_score(X_train, labels),
    }


# Fit every candidate model on X_train and collect one row of scores per model
data = []
for algo in algorithms:
    algo.fit(X_train)
    data.append(_clustering_scores(algo.labels_))

# Row labels follow the order in which the models were appended to `algorithms`
results = pd.DataFrame(
    data=data,
    columns=['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure',
             'Silhouette'],
    index=['K-means', 'Affinity', 'Spectral', 'Agglomerative'],
)

results



Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.115029,0.060426,0.06122,0.068689,0.06474,0.250356
Affinity,0.026803,0.066218,0.420412,0.07518,0.127551,0.279594
Spectral,0.267541,0.186122,0.18822,0.186804,0.187509,0.247634
Agglomerative,0.273299,0.198154,0.20445,0.198812,0.201592,0.229023


In [71]:
def k_mean_distance(data, cx, cy, i_centroid, cluster_labels):
    """Distances from the members of one cluster to the point (cx, cy).

    Rows of ``data`` whose entry in ``cluster_labels`` equals ``i_centroid``
    are selected; each selected row must unpack into exactly two values
    (x, y). Returns a list with the Euclidean distance of every selected
    row to (cx, cy), in row order.
    """
    members = data[cluster_labels == i_centroid]
    distances = []
    for px, py in members:
        dx = px - cx
        dy = py - cy
        distances.append(np.sqrt(dx**2 + dy**2))
    return distances

In [72]:
# Cluster centres of the fitted K-means model, one row per cluster
centroids = km.cluster_centers_
print("centroids:", centroids)

centroids: [[-0.03458646  0.54305808  0.06810919 -0.26811081  0.04111616  0.01398133
  -0.34046375]
 [ 0.09017685 -1.41590853 -0.17758024  0.69904194 -0.10720165 -0.03645335
   0.88768685]]


In [73]:
from scipy.spatial import distance

In [None]:
# Euclidean distance from every training sample assigned to cluster 0 to that
# cluster's centroid.
# NOTE(review): the original cell was unfinished — it called
# distance.euclidean() with a single argument (a TypeError) and rebound the
# name of the imported `distance` module. The completion below is the
# presumed intent; confirm.
distances_0 = [
    distance.euclidean(centroids[0], sample)
    for sample in X_train[km.labels_ == 0]
]