In [56]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import KMeans, MeanShift
from sklearn import preprocessing, model_selection
import pandas as pd

In [57]:
df = pd.read_excel('titanic.xls')
original_df = pd.DataFrame.copy(df)

df.drop(['body','name','sibsp','parch','cabin'], axis =1 , inplace = True)
df.apply(pd.to_numeric, errors='ignore')
df.fillna(0, inplace = True)
df.head()

Unnamed: 0,pclass,survived,sex,age,ticket,fare,embarked,boat,home.dest
0,1,1,female,29.0,24160,211.3375,S,2,"St Louis, MO"
1,1,1,male,0.9167,113781,151.55,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,113781,151.55,S,0,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,113781,151.55,S,0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,113781,151.55,S,0,"Montreal, PQ / Chesterville, ON"


## handle non numerical data, transform to int64 and float64

In [None]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df

df = handle_non_numerical_data(df)
df.head()

In [59]:
X = np.array(df.drop('survived', axis = 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = KMeans(n_clusters=2)
clf.fit(X)

correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1

print(correct/len(X))

0.7349121466768526


### mean shift aproach

In [167]:
X = np.array(df.drop('survived', axis = 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift()

In [168]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df['cluster_group']=np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]
    
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]
    survival_rate = len(survival_cluster) / len(temp_df)
    survival_rates[i] = survival_rate
    
print(survival_rates)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


{0: 0.37750385208012327, 1: 0.9090909090909091}


In [64]:
print(original_df[(original_df['cluster_group'] == 1)])

     pclass  survived                                               name  \
35        1         1                           Bowen, Miss. Grace Scott   
49        1         1                 Cardeza, Mr. Thomas Drake Martinez   
50        1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   
66        1         1                        Chaudanson, Miss. Victorine   
116       1         1                Fortune, Mrs. Mark (Mary McDougald)   
183       1         1                             Lesurer, Mr. Gustave J   
250       1         1                         Ryerson, Miss. Emily Borie   
251       1         1              Ryerson, Miss. Susan Parker "Suzette"   
252       1         0                         Ryerson, Mr. Arthur Larned   
253       1         1    Ryerson, Mrs. Arthur Larned (Emily Maria Borie)   
302       1         1                                   Ward, Miss. Anna   

        sex   age  sibsp  parch    ticket      fare            cabin embarked  \
35   f

In [195]:
df_train = pd.read_csv('titanic/train.csv')
df_test = pd.read_csv('titanic/test.csv')
df_train.drop(['Name','Parch','Cabin','PassengerId','Ticket','Embarked','SibSp','Age','Fare'], axis =1 , inplace = True)
df_test.drop(['Name','Parch','Cabin','PassengerId','Ticket','Embarked','SibSp','Age','Fare'], axis =1 , inplace = True)
df_train

Unnamed: 0,Survived,Pclass,Sex
0,0,3,male
1,1,1,female
2,1,3,female
3,1,1,female
4,0,3,male
...,...,...,...
886,0,2,male
887,1,1,female
888,0,3,female
889,1,1,male


In [207]:
df_train.fillna(0, inplace = True)
df_test.fillna(0, inplace = True)

df_train = handle_non_numerical_data(df_train)
df_test = handle_non_numerical_data(df_test)
Xs = np.array(df_train.drop('Survived', axis = 1)).astype(float)
Xs = preprocessing.scale(Xs)
ys = np.array(df_train['Survived'])

clf = KMeans(n_clusters=2)
clf.fit(Xs)

correct = 0
for i in range(len(Xs)):
    predict_me = np.array(Xs[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == ys[i]:
        correct += 1

print(correct/len(Xs))

0.7867564534231201


In [208]:
example_measures = np.array(df_test)
example_measures = example_measures.reshape(len(example_measures), -1)

X_test = preprocessing.scale(example_measures)

In [209]:
prediction = clf.predict(X_test)
print(prediction)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [210]:
X_test[data]

array([ 0.87348191, -0.75592895])

In [211]:
import csv

with open('responses_titanic.csv', 'w', newline = '') as f:
    start = 892
    write = csv.writer(f)
    write.writerow(['PassengerId','Survived'])
    for data in range(len(X_test)):
        write.writerow([str(int(start)), str(prediction[data])])
        start +=1