In [8]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing, cross_validation
import pandas as pd

In [9]:
df = pd.read_excel('titanic.xls')
original_df = pd.DataFrame.copy(df)                  #we just made a copy of the original dataframe & saved it otherwise
print (df.head())

   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St 

In [10]:
df.drop(['body','name'], 1, inplace=True)           #removing unnecessary columns
df.convert_objects(convert_numeric=True)
df.fillna(0, inplace=True)
print (df.head())

   pclass  survived     sex      age  sibsp  parch  ticket      fare    cabin  \
0       1         1  female  29.0000      0      0   24160  211.3375       B5   
1       1         1    male   0.9167      1      2  113781  151.5500  C22 C26   
2       1         0  female   2.0000      1      2  113781  151.5500  C22 C26   
3       1         0    male  30.0000      1      2  113781  151.5500  C22 C26   
4       1         0  female  25.0000      1      2  113781  151.5500  C22 C26   

  embarked boat                        home.dest  
0        S    2                     St Louis, MO  
1        S   11  Montreal, PQ / Chesterville, ON  
2        S    0  Montreal, PQ / Chesterville, ON  
3        S    0  Montreal, PQ / Chesterville, ON  
4        S    0  Montreal, PQ / Chesterville, ON  


  from ipykernel import kernelapp as app


In [11]:
# a function to handle all non-numeric data from the dataset 
def handle_non_numerical_data(df):
    columns = df.columns.values                    #all the columns' name in 'columns variable'
    for column in columns:
        text_digit_vals = {}                       #a dictionary to store the column-values as numerical
        
        def convert_to_int(val):                   #a function to store the unique values in the dictionary
            return text_digit_vals[val]
        
                                                   #storing the uniques values inside each column and 
                                                   #giving a value to be stored in the dictionary
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist() #taking whole column as a list
            unique_elements = set(column_contents)       #finding how many unique values in each column
            x = 0
            for unique in unique_elements:               #giving each unique value a number and storing in the dict
                if unique not in text_digit_vals: 
                    text_digit_vals[unique] = x
                    x += 1
            
            df[column] = list(map(convert_to_int, df[column]))  #mapping using the funtion convert_to_int() 
    return df

In [12]:
df = handle_non_numerical_data(df)
print(df.head())

   pclass  survived  sex      age  sibsp  parch  ticket      fare  cabin  \
0       1         1    1  29.0000      0      0     771  211.3375     79   
1       1         1    0   0.9167      1      2     531  151.5500    137   
2       1         0    1   2.0000      1      2     531  151.5500    137   
3       1         0    0  30.0000      1      2     531  151.5500    137   
4       1         0    1  25.0000      1      2     531  151.5500    137   

   embarked  boat  home.dest  
0         1     1         98  
1         1     3         63  
2         1     0         63  
3         1     0         63  
4         1     0         63  


In [13]:
#try kore kore dekha jay, which column has how much effect on clustering accuracy
df.drop(['boat','sex'], 1, inplace=True)
print (df.head())

   pclass  survived      age  sibsp  parch  ticket      fare  cabin  embarked  \
0       1         1  29.0000      0      0     771  211.3375     79         1   
1       1         1   0.9167      1      2     531  151.5500    137         1   
2       1         0   2.0000      1      2     531  151.5500    137         1   
3       1         0  30.0000      1      2     531  151.5500    137         1   
4       1         0  25.0000      1      2     531  151.5500    137         1   

   home.dest  
0         98  
1         63  
2         63  
3         63  
4         63  


In [50]:
X = np.array(df.drop(['survived'], 1).astype(float))   
X = preprocessing.scale(X)                             
y = np.array(df['survived'])

clf = MeanShift()                 
clf.fit(X)                                 

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [51]:
labels = clf.labels_                     #giving the labels' their names
cluster_centers = clf.cluster_centers_
#print (labels)

In [52]:
original_df['cluster_group'] = np.nan    #making a new column in the copy

In [53]:
#now we iterate thru the labels and populate the values of this new created-column
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]                   #column'er jei row, tar label'tai cluster-group
    #print(labels[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [54]:
n_clusters_ = len(np.unique(labels))     #finding the number of clusters, i.e. no. of unique labels in df
print (n_clusters_)

7


In [55]:
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    survival_cluster = temp_df[ (temp_df['survived']==1) ]
    survival_rate = len(survival_cluster) / len(temp_df) 
                                        #survival rate pawar jnno ei 2ta df use korlum, can b diffrnt ways to find this rate
    survival_rates[i] = survival_rate   #lastly storing that rate for that cluster_grp

print (survival_rates)

{0: 0.3826860841423948, 1: 0.0, 2: 0.6, 3: 1.0, 4: 0.5, 5: 0.0, 6: 1.0}


In [61]:
print (original_df[(original_df['cluster_group']==0)].describe())

            pclass     survived         age        sibsp        parch  \
count  1236.000000  1236.000000  983.000000  1236.000000  1236.000000   
mean      2.317961     0.382686   29.895982     0.385113     0.283172   
std       0.821789     0.486239   14.132032     0.716879     0.633025   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000   21.000000     0.000000     0.000000   
50%       3.000000     0.000000   28.000000     0.000000     0.000000   
75%       3.000000     1.000000   38.000000     1.000000     0.000000   
max       3.000000     1.000000   80.000000     4.000000     4.000000   

              fare        body  cluster_group  
count  1235.000000  114.000000         1236.0  
mean     25.774817  162.473684            0.0  
std      30.340357   98.152871            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   72.750000            0.0  
50%      13.000000  165.500000            0.0  
75%   

### amader eikhne 6ta cluster create hoise based on the survival and also some other parameters(unknown to the programmer) .....so no. of clusters is diff than in video

### jodi prosno hoy j What is the survival_rate of male passengers in cluster 0? then :

In [63]:
cluster_0 = original_df[ (original_df['cluster_group']==0) ]
cluster_0_fc = cluster_0[ (cluster_0['sex']=='male') ]
cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,811.0,811.0,631.0,811.0,811.0,810.0,107.0,811.0
mean,2.38471,0.193588,30.711833,0.31566,0.172626,21.336634,163.700935,0.0
std,0.801211,0.395354,13.936842,0.696722,0.486652,25.037497,96.244992,0.0
min,1.0,0.0,0.3333,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8542,79.5,0.0
50%,3.0,0.0,28.0,0.0,0.0,10.5,166.0,0.0
75%,3.0,0.0,39.0,0.0,0.0,26.0,257.0,0.0
max,3.0,1.0,80.0,4.0,4.0,227.525,322.0,0.0
