In [157]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import MeanShift, KMeans

In [158]:
df = pd.read_excel('titanic.xls')
original_df = pd.DataFrame.copy(df)

df.drop(['body', 'name'], 1, inplace=True)
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"


In [159]:
def handle_non_numerical_data(df):
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            
            df[column] = list(map(convert_to_int, df[column]))
    return df
df = handle_non_numerical_data(df)
#df.drop(['ticket'], 1, inplace=True)
df.drop(['ticket','home.dest'], 1, inplace=True)


In [160]:
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_

original_df['cluster_group'] = np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]
    
n_clusters_ = len(np.unique(labels))

survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group'] == float(i)) ]
    survival_cluster = temp_df[ (temp_df['survived']==1) ]
    survival_rate = len(survival_rates) / len(temp_df)
    survival_rates[i] = survival_rate
print(survival_rates)


{0: 0.0, 1: 0.025, 2: 0.2, 3: 1.0, 4: 2.0, 5: 1.25, 6: 0.4}


In [161]:

original_df[original_df['cluster_group'] == 2].describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,10.0,10.0,8.0,10.0,10.0,10.0,2.0,10.0
mean,3.0,0.1,39.875,0.8,6.0,42.70375,234.5,2.0
std,0.0,0.316228,1.552648,0.421637,1.632993,15.590194,130.814755,0.0
min,3.0,0.0,38.0,0.0,5.0,29.125,142.0,2.0
25%,3.0,0.0,39.0,1.0,5.0,31.303125,188.25,2.0
50%,3.0,0.0,39.5,1.0,5.0,35.5375,234.5,2.0
75%,3.0,0.0,40.25,1.0,6.0,46.9,280.75,2.0
max,3.0,1.0,43.0,1.0,9.0,69.55,327.0,2.0


In [162]:
cluster_0 = original_df[original_df['cluster_group'] == 0]
cluster_0_fc = cluster_0[cluster_0['pclass'] == 1]
cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,268.0,268.0,230.0,268.0,268.0,268.0,30.0,268.0
mean,1.0,0.604478,40.213043,0.369403,0.190299,60.106669,172.566667,0.0
std,0.0,0.489877,13.825023,0.520836,0.455079,37.428113,84.511449,0.0
min,1.0,0.0,4.0,0.0,0.0,0.0,16.0,0.0
25%,1.0,0.0,30.0,0.0,0.0,29.7,114.0,0.0
50%,1.0,1.0,39.5,0.0,0.0,52.8271,173.5,0.0
75%,1.0,1.0,50.0,1.0,0.0,79.2,242.25,0.0
max,1.0,1.0,80.0,2.0,2.0,227.525,307.0,0.0
