In [12]:
# https://pythonprogramming.net/mean-shift-titanic-dataset-machine-learning-tutorial/
%matplotlib inline

import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation
import pandas as pd
import matplotlib.pyplot as plt


'''
pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survived Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''


# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
# Read the passenger sheet from the working directory.
df = pd.read_excel('titanic.xls')

# Keep an untouched copy (names, missing values and all) so that cluster
# members can be inspected in their original form once labels exist.
original_df = df.copy()

# Build the feature frame: drop 'body' and 'name', then replace every missing
# value with 0. `axis` is passed by keyword because pandas 2.0 no longer
# accepts it positionally in DataFrame.drop.
df = df.drop(['body', 'name'], axis=1).fillna(0)

def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer ids, in place.

    A column is left untouched when its dtype is ``int64`` or ``float64``.
    Any other column is re-encoded: each distinct value receives an id
    (0, 1, 2, ...) in order of first appearance, and the column is
    overwritten with those ids.

    Ids are assigned in first-appearance order rather than by iterating a
    ``set``: set order for strings varies between interpreter runs, which
    would make the encoding (and everything fitted on it) irreproducible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in df.columns.values:
        dtype = df[column].dtype
        if dtype == np.int64 or dtype == np.float64:
            # Already numeric: nothing to convert.
            continue

        column_contents = df[column].values.tolist()

        # Map each distinct value to the next free id the first time it is
        # seen. len() is evaluated before the insert, so ids run 0, 1, 2, ...
        text_digit_vals = {}
        for value in column_contents:
            if value not in text_digit_vals:
                text_digit_vals[value] = len(text_digit_vals)

        # Overwrite the column with the id of each original value.
        df[column] = [text_digit_vals[value] for value in column_contents]

    return df

# Turn the remaining text columns into integer ids, then discard 'ticket' and
# 'home.dest' from the feature frame.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df = handle_non_numerical_data(df)
df = df.drop(['ticket', 'home.dest'], axis=1)

# Feature matrix: every column except the target, as floats, then passed
# through preprocessing.scale (standardises each feature column).
X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
# Target vector; not used for fitting, since the clustering is unsupervised.
y = np.array(df['survived'])

# MeanShift with default settings; no bandwidth is given, so scikit-learn
# estimates one from the data.
clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [13]:
# Results of the fit, read off the estimator. Per scikit-learn's MeanShift
# documentation, labels_ holds one cluster index per sample and
# cluster_centers_ holds the coordinates of each cluster centre.
labels = clf.labels_
cluster_centers = clf.cluster_centers_

In [14]:
# Add the column as all-NaN (float) first; the next cell fills in the cluster
# label of every passenger.
original_df['cluster_group']=np.nan

In [15]:
# Attach every passenger's cluster label in one positional assignment.
# X was built from the same rows, in the same order, as original_df, so
# labels[i] belongs to row i.
# Writing through `original_df['cluster_group'].iloc[i] = ...` is chained
# assignment: pandas warns about it, and with Copy-on-Write enabled the write
# never reaches original_df. The float cast keeps the column dtype the loop
# produced, so later `== float(i)` comparisons behave as before.
original_df['cluster_group'] = np.asarray(labels, dtype=float)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
# Fraction of survivors inside each cluster, keyed by cluster id.
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for cluster_id in range(n_clusters_):
    # Rows that were assigned to this cluster.
    in_cluster = original_df['cluster_group'] == float(cluster_id)
    members = original_df[in_cluster]

    # Of those, the rows marked as survived.
    survivors = members[members['survived'] == 1]

    survival_rates[cluster_id] = len(survivors) / len(members)

survival_rates

{0: 0.37022292993630573, 1: 0.7567567567567568, 2: 1.0, 3: 0.1}

In [17]:
# Show the passengers that were assigned to cluster 1.
print(original_df.loc[original_df['cluster_group'] == 1])

     pclass  survived                                               name  \
10        1         0                             Astor, Col. John Jacob   
16        1         0                           Baxter, Mr. Quigg Edmond   
17        1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
54        1         1                Carter, Master. William Thornton II   
55        1         1                          Carter, Miss. Lucile Polk   
56        1         1                         Carter, Mr. William Ernest   
57        1         1          Carter, Mrs. William Ernest (Lucile Polk)   
78        1         1  Compton, Mrs. Alexander Taylor (Mary Eliza Ing...   
97        1         1  Douglas, Mrs. Frederick Charles (Mary Helene B...   
98        1         1        Douglas, Mrs. Walter Donald (Mahala Dutton)   
103       1         1                      Endres, Miss. Caroline Louise   
111       1         1                     Fortune, Miss. Alice Elizabeth   
112       1 

In [18]:
# Summary statistics for the passengers that were assigned to cluster 0.
print(original_df.loc[original_df['cluster_group'] == 0].describe())

            pclass     survived         age        sibsp       parch  \
count  1256.000000  1256.000000  995.000000  1256.000000  1256.00000   
mean      2.332803     0.370223   29.493635     0.487261     0.30414   
std       0.818554     0.483057   14.323514     1.047021     0.65463   
min       1.000000     0.000000    0.166700     0.000000     0.00000   
25%       2.000000     0.000000         NaN     0.000000     0.00000   
50%       3.000000     0.000000         NaN     0.000000     0.00000   
75%       3.000000     1.000000         NaN     1.000000     0.00000   
max       3.000000     1.000000   80.000000     8.000000     4.00000   

              fare        body  cluster_group  
count  1255.000000  116.000000         1256.0  
mean     26.593366  161.189655            0.0  
std      32.161736   97.850810            0.0  
min       0.000000    1.000000            0.0  
25%            NaN         NaN            0.0  
50%            NaN         NaN            0.0  
75%            



In [19]:
# Summary statistics for the passengers that were assigned to cluster 2.
print(original_df.loc[original_df['cluster_group'] == 2].describe())

       pclass  survived        age  sibsp     parch        fare  body  \
count     6.0       6.0   6.000000    6.0  6.000000    6.000000   0.0   
mean      1.0       1.0  40.833333    0.0  0.333333  429.011133   NaN   
std       0.0       0.0   9.239408    0.0  0.516398  129.075794   NaN   
min       1.0       1.0  35.000000    0.0  0.000000  262.375000   NaN   
25%       1.0       1.0  35.250000    0.0  0.000000  324.863550   NaN   
50%       1.0       1.0  36.000000    0.0  0.000000  512.329200   NaN   
75%       1.0       1.0  42.750000    0.0  0.750000  512.329200   NaN   
max       1.0       1.0  58.000000    0.0  1.000000  512.329200   NaN   

       cluster_group  
count            6.0  
mean             2.0  
std              0.0  
min              2.0  
25%              2.0  
50%              2.0  
75%              2.0  
max              2.0  




In [20]:
# Narrow cluster 0 down to its first-class passengers (pclass == 1) and
# summarise that subset.
cluster_0 = original_df.loc[original_df['cluster_group'] == 0]
cluster_0_fc = cluster_0.loc[cluster_0['pclass'] == 1]
cluster_0_fc.describe()



Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,281.0,281.0,242.0,281.0,281.0,281.0,32.0,281.0
mean,1.0,0.594306,39.532714,0.384342,0.209964,66.034268,169.0,0.0
std,0.0,0.491902,14.352001,0.522669,0.495018,45.795996,83.083712,0.0
min,1.0,0.0,0.9167,0.0,0.0,0.0,16.0,0.0
25%,1.0,0.0,,0.0,0.0,30.0,,0.0
50%,1.0,1.0,,0.0,0.0,55.0,,0.0
75%,1.0,1.0,,1.0,0.0,82.1708,,0.0
max,1.0,1.0,80.0,2.0,2.0,227.525,307.0,0.0
