## Meanshift Example
* source; sentdex ML - Pt. 39-40

In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import MeanShift

In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_excel("./data/titanic.xls")
original_df = pd.DataFrame.copy(df)

In [4]:
df.drop(['body','name'], 1, inplace = True)
#df.convert_objects(convert_numeric = True)
df.fillna(0, inplace = True)
print(df.describe())
print(df.columns.values.tolist())

            pclass     survived          age        sibsp        parch  \
count  1309.000000  1309.000000  1309.000000  1309.000000  1309.000000   
mean      2.294882     0.381971    23.877515     0.498854     0.385027   
std       0.837836     0.486055    17.590848     1.041658     0.865560   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%       2.000000     0.000000     7.000000     0.000000     0.000000   
50%       3.000000     0.000000    24.000000     0.000000     0.000000   
75%       3.000000     1.000000    35.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     9.000000   

              fare  
count  1309.000000  
mean     33.270043  
std      51.747063  
min       0.000000  
25%       7.895800  
50%      14.454200  
75%      31.275000  
max     512.329200  
['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'home.dest']


In [5]:
# check pbpython.com/categorical-encoding.html for how to encode the data
# boat is an extreme example for following encoding, int and char mixed
# following is just one way of encoding
def handle_non_numerical_data(df):
    encode_dictionary = {}
    columns = df.columns.values
    new_df = df.copy() #not change original dataframe
    
    #print("columns: ",columns)
    for column in columns: # go through every columns
        text_digit_vals = {} # store the encoding info eg. {'male': 0, 'female': 1}
        def convert_to_int(val): # map function process input
            return text_digit_vals[val]
        if df[column].dtype != np.int64 and df[column].dtype != np.float64: # not number
            column_contents = df[column].values.tolist() # conver to list ['male','female','male'....]
            #print("df[colunm]: ", df[column])
            #print("column_contents: ", column_contents)
            unique_elements = set(column_contents) #unique value eg. {'male', 'female'}
            #print(column)
            #print("unique_elements: ", unique_elements)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    #print("text_digit_vals[unique]", unique, x)
                    
                    x += 1
            new_df[column] = list( map(convert_to_int, df[column])) # male...->text_digit_vals[male...]->0,1,1,0
        #print(text_digit_vals)
        #store the encode,{column:{unique:x,....'male':0}....}
        encode_dictionary[column] = text_digit_vals
    #print(encode_dictionary)
    return new_df, encode_dictionary
    
    

In [6]:
df2, encode_dictionary = handle_non_numerical_data(df) #drop 'body' 'day' replace string to numeric value
#df2
#encode_dictionary
X = np.array(df2.drop(['boat'], 1))
X = np.array(df2.drop(['survived'], 1).astype(float))
#print(X)
X = preprocessing.scale(X)
#print(X)
y = np.array(df2['survived'].astype(float))

In [7]:
clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=None, seeds=None)

In [8]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
print("labels.shape: ", labels.shape)

original_df['cluster_group'] = pd.Series(labels) # DF is a dict of Series objects

print(original_df.describe())

labels.shape:  (1309,)
            pclass     survived          age        sibsp        parch  \
count  1309.000000  1309.000000  1046.000000  1309.000000  1309.000000   
mean      2.294882     0.381971    29.881135     0.498854     0.385027   
std       0.837836     0.486055    14.413500     1.041658     0.865560   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    39.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     9.000000   

              fare        body  cluster_group  
count  1308.000000  121.000000    1309.000000  
mean     33.295479  160.809917       0.102368  
std      51.758668   97.696922       0.505537  
min       0.000000    1.000000       0.000000  
25%       7.895800   72.000000       0.000000  
50%      14.454200  15

In [9]:
n_clusters_ = len(np.unique(labels)) # 4
survival_rates = {}
age_rates = {}
clf_df = {}

for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i))]
    clf_df[i] = temp_df
    survival_cluster = temp_df[(temp_df['survived']==1)]
    age_cluster = temp_df[(temp_df['age']<=10)]
    #print(survival_cluster)
    survival_rate = len(survival_cluster)/len(temp_df)
    age_rate = len(age_cluster)/len(temp_df)
    survival_rates[i] = survival_rate
    age_rates[i] = age_rate
print("survival rate in deifferent cluster: ", survival_rates)
print("child under 10 in diffrent cluster: ", age_rates)

survival rate in deifferent cluster:  {0: 0.38072289156626504, 1: 0.0, 2: 0.6923076923076923, 3: 1.0, 4: 0.1}
child under 10 in diffrent cluster:  {0: 0.06265060240963856, 1: 0.38095238095238093, 2: 0.0, 3: 0.0, 4: 0.0}


In [10]:
print(clf_df[0].describe())
print("survival rate in 0_cluster: ", clf_df[0]['survived'].mean())

            pclass     survived         age        sibsp        parch  \
count  1245.000000  1245.000000  992.000000  1245.000000  1245.000000   
mean      2.311647     0.380723   29.824765     0.391165     0.288353   
std       0.825187     0.485760   14.199059     0.729766     0.637444   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000   21.000000     0.000000     0.000000   
50%       3.000000     0.000000   28.000000     0.000000     0.000000   
75%       3.000000     1.000000   38.000000     1.000000     0.000000   
max       3.000000     1.000000   80.000000     4.000000     4.000000   

              fare        body  cluster_group  
count  1244.000000  115.000000         1245.0  
mean     26.908768  161.347826            0.0  
std      33.399699   98.355339            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   71.000000            0.0  
50%      13.416700  165.000000            0.0  
75%   

In [11]:
# Test boolean series in a dataframe/series
y = pd.Series(np.random.randn(5))
yx = pd.Series([True,False,True,False,True])
y[yx]

0   -0.706731
2   -0.149434
4    1.443069
dtype: float64

In [1]:
import tensorflow as tf