# Librerias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
pd.options.display.float_format = '{:.2f}'.format

# Datos de entrenamiento

In [336]:
train = pd.read_csv('train.csv')
print(train.shape)
train.describe()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


# Datos que se uasaran para predecir y enviar el submit

In [337]:
test = pd.read_csv('test.csv')
print(test.shape)
test.describe()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.27,30.27,0.45,0.39,35.63
std,120.81,0.84,14.18,0.9,0.98,55.91
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.9
50%,1100.5,3.0,27.0,0.0,0.0,14.45
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.33


# Construimos la variable inicial de la persona

In [338]:
## get the title from the name
train["title"] = [i.split('.')[0] for i in train.Name]
train["title"] = [i.split(',')[1] for i in train.title]
## Whenever we split like that, there is a good change that we will end up with while space around our string values. Let's check that. 

In [339]:
print(train.title.unique())

[' Mr' ' Mrs' ' Miss' ' Master' ' Don' ' Rev' ' Dr' ' Mme' ' Ms' ' Major'
 ' Lady' ' Sir' ' Mlle' ' Col' ' Capt' ' the Countess' ' Jonkheer']


In [340]:
## Let's fix that
train.title = train.title.apply(lambda x: x.strip())

In [341]:
## We can also combile all three lines above for test set here
test['title'] = [i.split('.')[0].split(',')[1].strip() for i in test.Name]

## However it is important to be able to write readable code, and the line above is not so readable. 

In [342]:
## Let's replace some of the rare values with the keyword 'rare' and other word choice of our own. 
## train Data
train["title"] = [i.replace('Ms', 'Miss') for i in train.title]
train["title"] = [i.replace('Mlle', 'Miss') for i in train.title]
train["title"] = [i.replace('Mme', 'Mrs') for i in train.title]
train["title"] = [i.replace('Dr', 'rare') for i in train.title]
train["title"] = [i.replace('Col', 'rare') for i in train.title]
train["title"] = [i.replace('Major', 'rare') for i in train.title]
train["title"] = [i.replace('Don', 'rare') for i in train.title]
train["title"] = [i.replace('Jonkheer', 'rare') for i in train.title]
train["title"] = [i.replace('Sir', 'rare') for i in train.title]
train["title"] = [i.replace('Lady', 'rare') for i in train.title]
train["title"] = [i.replace('Capt', 'rare') for i in train.title]
train["title"] = [i.replace('the Countess', 'rare') for i in train.title]
train["title"] = [i.replace('Rev', 'rare') for i in train.title]

In [343]:
## we are writing a function that can help us modify title column
def name_converted(feature):
    """
    This function helps modifying the title column
    """
    
    result = ''
    if feature in ['the Countess','Capt','Lady','Sir','Jonkheer','Don','Major','Col', 'Rev', 'Dona', 'Dr']:
        result = 'rare'
    elif feature in ['Ms', 'Mlle']:
        result = 'Miss'
    elif feature == 'Mme':
        result = 'Mrs'
    else:
        result = feature
    return result

test.title = test.title.map(name_converted)
train.title = train.title.map(name_converted)

In [344]:
print(train.title.unique())
print(test.title.unique())

['Mr' 'Mrs' 'Miss' 'Master' 'rare']
['Mr' 'Mrs' 'Miss' 'Master' 'rare']


# Construyendo variable tamano de familia

In [345]:
## Family_size seems like a good feature to create
train['family_size'] = train.SibSp + train.Parch+1
test['family_size'] = test.SibSp + test.Parch+1

In [346]:
## bin the family size. 
def family_group(size):
    """
    This funciton groups(loner, small, large) family based on family size
    """
    
    a = ''
    if (size <= 1):
        a = 'loner'
    elif (size <= 4):
        a = 'small'
    else:
        a = 'large'
    return a

In [347]:
## apply the family_group function in family_size
train['family_group'] = train['family_size'].map(family_group)
test['family_group'] = test['family_size'].map(family_group)

In [348]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,family_size,family_group
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,2,small
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,Mrs,2,small


# Eliminando variable tickets

In [349]:
train.drop(['Ticket'], axis=1, inplace=True)

test.drop(['Ticket'], axis=1, inplace=True)

In [350]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.83,,Q,Mr,1,loner
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S,Mrs,2,small


# Calculating fare based on family size.

In [351]:
## Calculating fare based on family size. 
train['calculated_fare'] = train.Fare/train.family_size
test['calculated_fare'] = test.Fare/test.family_size

# Eliminando pasengerId y Name

In [352]:
train.drop(['PassengerId'], axis=1, inplace=True)

test.drop(['PassengerId'], axis=1, inplace=True)

In [353]:
train.drop(['Name'], axis=1, inplace=True)

test.drop(['Name'], axis=1, inplace=True)

In [354]:
train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
0,0,3,male,22.0,1,0,7.25,,S,Mr,2,small,3.62
1,1,1,female,38.0,1,0,71.28,C85,C,Mrs,2,small,35.64


In [355]:
test.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
0,3,male,34.5,0,0,7.83,,Q,Mr,1,loner,7.83
1,3,female,47.0,1,0,7.0,,S,Mrs,2,small,3.5


# Separando Supervivientes de la data train

In [356]:
train.index.name = "index"
survivers = train.Survived

train.drop(["Survived"],axis=1, inplace=True)


train.head(2)


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,male,22.0,1,0,7.25,,S,Mr,2,small,3.62
1,1,female,38.0,1,0,71.28,C85,C,Mrs,2,small,35.64


In [357]:
survivers.head(2)

index
0    0
1    1
Name: Survived, dtype: int64

# Uniremos las dos datas

In [358]:
test["index"] = list(range(892,1310))
test = test.set_index('index')
test.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,male,34.5,0,0,7.83,,Q,Mr,1,loner,7.83
893,3,female,47.0,1,0,7.0,,S,Mrs,2,small,3.5


In [359]:
all_data = pd.concat([train,test], ignore_index = False)
all_data.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,male,22.0,1,0,7.25,,S,Mr,2,small,3.62
1,1,female,38.0,1,0,71.28,C85,C,Mrs,2,small,35.64


# Imputando Fare

In [360]:
all_data[all_data.Fare.isnull()]

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1044,3,male,60.5,0,0,,,S,Mr,1,loner,


In [361]:
missing_value = all_data[(all_data.Pclass == 3) & 
                     (all_data.Embarked == "S") & 
                     (all_data.Sex == "male")].Fare.mean()
## replace the test.fare null values with test.fare mean
all_data.Fare.fillna(missing_value, inplace=True)

# Assign all the null values to N

In [362]:
## Assign all the null values to N
all_data.Cabin.fillna("N", inplace=True)
all_data.Cabin = [i[0] for i in all_data.Cabin]

# Viendo los valores medio de las cabinas

In [363]:
all_data.groupby("Cabin")['Fare'].mean().sort_values()

Cabin
G    14.21
F    18.08
N    19.13
T    35.50
A    41.24
D    53.01
E    54.56
C   107.93
B   122.38
Name: Fare, dtype: float64

# Funcion para estimar la cabina a traves de fare

In [364]:
def cabin_estimator(i):
    """Grouping cabin feature by the first letter"""
    a = 0
    if i<16:
        a = "G"
    elif i>=16 and i<27:
        a = "F"
    elif i>=27 and i<38:
        a = "T"
    elif i>=38 and i<47:
        a = "A"
    elif i>= 47 and i<53:
        a = "E"
    elif i>= 53 and i<54:
        a = "D"
    elif i>=54 and i<116:
        a = 'C'
    else:
        a = "B"
    return a

In [365]:
with_N = all_data[all_data.Cabin == "N"]

without_N = all_data[all_data.Cabin != "N"]

In [366]:
print(with_N.shape)
with_N.head(2)

(1014, 12)


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,male,22.0,1,0,7.25,N,S,Mr,2,small,3.62
2,3,female,26.0,0,0,7.92,N,S,Miss,1,loner,7.92


In [367]:
print(without_N.shape)
without_N.head(2)

(295, 12)


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,female,38.0,1,0,71.28,C,C,Mrs,2,small,35.64
3,1,female,35.0,1,0,53.1,C,S,Mrs,2,small,26.55


# Imputando la cabina

In [368]:
##applying cabin estimator function. 
with_N['Cabin'] = with_N.Fare.apply(lambda x: cabin_estimator(x))
print(without_N.shape)
with_N.head(3)

(295, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,male,22.0,1,0,7.25,G,S,Mr,2,small,3.62
2,3,female,26.0,0,0,7.92,G,S,Miss,1,loner,7.92
4,3,male,35.0,0,0,8.05,G,S,Mr,1,loner,8.05


# Uniendo de nuevo en una sola data

In [369]:
all_data = pd.concat([with_N, without_N], axis=0).sort_index()
all_data

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,male,22.00,1,0,7.25,G,S,Mr,2,small,3.62
1,1,female,38.00,1,0,71.28,C,C,Mrs,2,small,35.64
2,3,female,26.00,0,0,7.92,G,S,Miss,1,loner,7.92
3,1,female,35.00,1,0,53.10,C,S,Mrs,2,small,26.55
4,3,male,35.00,0,0,8.05,G,S,Mr,1,loner,8.05
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,male,,0,0,8.05,G,S,Mr,1,loner,8.05
1306,1,female,39.00,0,0,108.90,C,C,rare,1,loner,108.90
1307,3,male,38.50,0,0,7.25,G,S,Mr,1,loner,7.25
1308,3,male,,0,0,8.05,G,S,Mr,1,loner,8.05


# Imputando Embarked

In [370]:
all_data[all_data.Embarked.isnull()]

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
61,1,female,38.0,0,0,80.0,B,,Miss,1,loner,80.0
829,1,female,62.0,0,0,80.0,B,,Mrs,1,loner,80.0


In [371]:
missing_value = all_data[(all_data.Pclass == 1) & 
                     (all_data.Cabin == "B") & 
                     (all_data.Sex == "female")].Embarked.mode()[0]

all_data.Embarked.fillna(missing_value, inplace=True)

# Imputando CalculatedFare

In [372]:
all_data[all_data.calculated_fare.isnull()]

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,family_size,family_group,calculated_fare
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1044,3,male,60.5,0,0,13.15,G,S,Mr,1,loner,


In [373]:
missing_value = all_data[(all_data.Pclass == 3) & 
                     (all_data.Embarked == "S") & 
                     (all_data.Sex == "male")].calculated_fare.mean()

all_data.calculated_fare.fillna(missing_value, inplace=True)

In [374]:
all_data = pd.get_dummies(all_data, columns=['title',"Pclass", 'Cabin','Embarked','Sex', 'family_group'], drop_first=False)

In [375]:
## Importing RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

## writing a function that takes a dataframe with missing values and outputs it by filling the missing values. 
def completing_age(df):
    
    age_df = df
    
    temp_train = age_df.loc[age_df.Age.notnull()] ## df with age values
    temp_test = age_df.loc[age_df.Age.isnull()] ## df without age values

    y = temp_train.Age.values ## setting target variables(age) in y 

    x = temp_train.drop("Age", axis = 1).values

    rfr = RandomForestRegressor(n_estimators=1500, n_jobs=-1)
    rfr.fit(x, y)
    
    predicted_age = rfr.predict(temp_test.drop("Age", axis = 1).values)
    
    df.loc[df.Age.isnull(), "Age"] = predicted_age
    

    return df

In [376]:
completing_age(all_data);

In [377]:
train = all_data[:891]

test = all_data[891:]

# adding saved target variable with train. 
train['Survived'] = survivers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [378]:
train.head(2)

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,family_size,calculated_fare,title_Master,title_Miss,title_Mr,title_Mrs,...,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,family_group_large,family_group_loner,family_group_small,Survived
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,22.0,1,0,7.25,2,3.62,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
1,38.0,1,0,71.28,2,35.64,0,0,0,1,...,0,1,0,0,1,0,0,0,1,1


In [379]:
# separating our independent and dependent variable
X = train.drop(['Survived'], axis = 1)
y = train["Survived"]

from sklearn.preprocessing import StandardScaler
st_scale = StandardScaler()

sc = st_scale

In [380]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = .33, random_state=0)

In [381]:
# Feature Scaling
## We will be using standardscaler to transform
from sklearn.preprocessing import StandardScaler
st_scale = StandardScaler()

## transforming "train_x"
X_train = st_scale.fit_transform(X_train)
## transforming "test_x"
X_test = st_scale.transform(X_test)

## transforming "The testset"
test = st_scale.transform(test)

In [382]:
## Using StratifiedShuffleSplit
## We can use KFold, StratifiedShuffleSplit, StratiriedKFold or ShuffleSplit, They are all close cousins. look at sklearn userguide for more info.   
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
cv = StratifiedShuffleSplit(n_splits = 10, test_size = .25, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%
## Using standard scale for the whole dataset.

## saving the feature names for decision tree display
column_names = X.columns

X = sc.fit_transform(X)


In [383]:
## Importing the model. 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
## trying out multiple values for k
k_range = range(1,31)
## 
weights_options=['uniform','distance']
# 
param = {'n_neighbors':k_range, 'weights':weights_options}
## Using startifiedShufflesplit. 
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)
# estimator = knn, param_grid = param, n_jobs = -1 to instruct scikit learn to use all available processors. 
grid = GridSearchCV(KNeighborsClassifier(), param,cv=cv,verbose = False, n_jobs=-1)
## Fitting the model. 
grid.fit(X,y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15, test_size=0.3,
            train_size=None),
             error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': range(1, 31),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=False)

In [384]:
knn_grid= grid.best_estimator_
knn_grid.score(X,y)

0.8249158249158249

In [385]:
from sklearn.svm import SVC
Cs = [0.001, 0.01, 0.1, 1,1.5,2,2.5,3,4,5, 10] ## penalty parameter C for the error term. 
gammas = [0.0001,0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)
grid_search = GridSearchCV(SVC(kernel = 'rbf', probability=True), param_grid, cv=cv) ## 'rbf' stands for gaussian kernel
grid_search.fit(X,y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15, test_size=0.3,
            train_size=None),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 1.5, 2, 2.5, 3, 4, 5, 10],
                         'gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [386]:
# using the best found hyper paremeters to get the score. 
svm_grid = grid_search.best_estimator_
svm_grid.score(X,y)

0.835016835016835

In [387]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
n_estimators = [140,145,150,155,160];
max_depth = range(1,10);
criterions = ['gini', 'entropy'];
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)


parameters = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'criterion': criterions
              
        }
grid = GridSearchCV(estimator=RandomForestClassifier(max_features='auto'),
                                 param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1)
grid.fit(X,y) 

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15, test_size=0.3,
            train_size=None),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
      

In [388]:
rf_grid = grid.best_estimator_
rf_grid.score(X,y)

0.8956228956228957

In [389]:
from xgboost import XGBClassifier
XGBClassifier = XGBClassifier()
XGBClassifier.fit(X, y)
y_pred = XGBClassifier.predict(X_test)
XGBClassifier_accy = round(accuracy_score(y_pred, y_test), 3)
print(XGBClassifier_accy)

0.922


In [390]:
from sklearn.gaussian_process import GaussianProcessClassifier
GaussianProcessClassifier = GaussianProcessClassifier()
GaussianProcessClassifier.fit(X, y)
y_pred = GaussianProcessClassifier.predict(X_test)
gau_pro_accy = round(accuracy_score(y_pred, y_test), 3)
print(gau_pro_accy)

0.908


In [391]:
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(estimators=[
    ('svc', svm_grid),
    ('random_forest', rf_grid),
    ('knn_classifier', knn_grid),
    ('XGB_Classifier', XGBClassifier),
    ('gaussian_process_classifier', GaussianProcessClassifier)
],voting='hard')

#voting_classifier = voting_classifier.fit(train_x,train_y)
voting_classifier = voting_classifier.fit(X,y)

In [392]:
y_pred = voting_classifier.predict(X_test)
voting_accy = round(accuracy_score(y_pred, y_test), 3)
print(voting_accy)


0.885


In [393]:
submit = pd.DataFrame(voting_classifier.predict(test), index = list(range(892,1310)))
submit.to_csv("submit.csv")

# Fin