In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the labelled training set and the unlabelled test set from the local data/ folder.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Print the column dtypes and non-null counts of the training set.
train_data.info()

In [None]:
# Summary statistics (pandas defaults) for the training set.
train_data.describe()

In [None]:
# Histogram grid of the training columns on a 20x12 figure.
train_data.hist(figsize=(20, 12))

In [None]:
# Mean of "Survived" for each value of "Sex".
sns.barplot(data=train_data, x="Sex", y="Survived")

In [None]:
# Survivors ("sum") and passengers ("count") per sex, looked up by label.
# Label-based .loc avoids relying on the sorted group order (positional [0]/[1]
# on a string-indexed Series is deprecated in recent pandas).
survival_by_sex = train_data.groupby("Sex")["Survived"].agg(["sum", "count"])

# `female` / `male` keep their original meaning: number of survivors of that sex.
female = survival_by_sex.loc["female", "sum"]
print("Percentage of females who survived: {:0.3f}%".format(
    100 * female / survival_by_sex.loc["female", "count"]))

male = survival_by_sex.loc["male", "sum"]
print("Percentage of males who survived: {:0.3f}%".format(
    100 * male / survival_by_sex.loc["male", "count"]))

In [None]:
# Mean of "Survived" for each value of "Embarked".
sns.barplot(data=train_data, x="Embarked", y="Survived")

In [None]:
# Mean of "Survived" for each value of "SibSp".
sns.barplot(data=train_data, x="SibSp", y="Survived")

In [None]:
# Mean of "Survived" for each value of "Parch".
sns.barplot(data=train_data, x="Parch", y="Survived")

In [None]:
# One "Age" histogram per value of "Survived", one facet column each.
p = sns.FacetGrid(data=train_data, col="Survived")
p.map(plt.hist, "Age")

In [None]:
# Survival by passenger class, one line per sex.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.pointplot(
    data=train_data,
    x="Pclass",
    y="Survived",
    hue="Sex",
    markers=["*", "o"],
    linestyles=["-", "--"],
    palette={"male": "blue", "female": "pink"},
);

In [None]:
# Fare by passenger class, one line per survival outcome (0 / 1).
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.pointplot(
    data=train_data,
    x="Pclass",
    y="Fare",
    hue="Survived",
    markers=["*", "o"],
    linestyles=["-", "--"],
    palette={0: "blue", 1: "pink"},
);

In [None]:
# Temporary feature: fare band index, ceil(Fare / 50), used only for this plot.
train_data["Fare_modified"] = np.ceil(train_data["Fare"].div(50))
sns.barplot(data=train_data, x="Fare_modified", y="Survived")

## Retain the original data
### Removing "Fare_modified"

In [None]:
# Remove the temporary fare-band column so train_data is back to its original columns.
# errors="ignore" keeps the cell re-runnable: a second run is a no-op instead of a KeyError.
train_data = train_data.drop(columns="Fare_modified", errors="ignore")

In [None]:
# Gender-only baseline: predict 1 for every female passenger and 0 otherwise.
is_female = test_data["Sex"] == "female"
test_data["Survived"] = is_female.astype(int)
test_data.head()
# Only this last expression is rendered as the cell output.
test_data.shape

In [None]:
# Write the gender-only predictions as a two-column file without the index.
test_data.loc[:, ["PassengerId", "Survived"]].to_csv(
    "data/predictions/female_live.csv",
    index=False,
)

## Accuracy on Kaggle: 0.76555
### Based on Gender only

In [None]:
# Remove the baseline predictions so test_data holds only its original columns again.
# errors="ignore" keeps the cell re-runnable: a second run is a no-op instead of a KeyError.
test_data = test_data.drop(columns="Survived", errors="ignore")
test_data.shape

## Combining test and training datasets to perform preprocessing operations simultaneously
### Removing "Survived" column from training data and saving it for later use

In [None]:
# Keep the target column aside before it is dropped from the feature frame.
survived_train = train_data.loc[:, "Survived"]

In [None]:
# Drop the target so train_data has the same columns as test_data before concatenation.
# errors="ignore" keeps the cell re-runnable: a second run is a no-op instead of a KeyError.
train_data = train_data.drop(columns="Survived", errors="ignore")

In [None]:
# Stack training rows first, then test rows; the original row labels are kept,
# so later cells slice by position (iloc) rather than by label.
data = pd.concat(objs=[train_data, test_data], sort=False)

In [None]:
# Print the column dtypes and non-null counts of the combined frame.
data.info()

In [None]:
# Names of the columns that contain at least one missing value, in column order.
missing_cols = data.columns[data.isnull().any()].tolist()
missing_cols

In [None]:
# Impute missing "Age" and "Fare" values with each column's median
# (the median is computed over the combined train + test rows).
data["Age"] = data["Age"].fillna(data["Age"].median())
data["Fare"] = data["Fare"].fillna(data["Fare"].median())

In [None]:
# Re-check which columns still contain missing values after the imputation.
missing_cols = [name for name, has_null in data.isnull().any().items() if has_null]
missing_cols

In [None]:
# One-hot encode "Sex"; drop_first=True keeps a single indicator column
# (selected later as "Sex_male").
data = pd.get_dummies(
    data,
    columns=["Sex"],
    drop_first=True,
)

In [None]:
# Preview the combined frame after encoding.
data.head()

In [None]:
# Feature columns fed to every model below.
cols_select = [
    "Sex_male",
    "Age",
    "Fare",
    "SibSp",
    "Parch",
]


In [None]:
# Preview only the selected feature columns.
data.loc[:, cols_select].head()

In [None]:
# Keep just the model features and confirm their dtypes / non-null counts.
data_new = data[cols_select]
data_new.info()

In [None]:
# Undo the earlier concatenation: training rows were stacked first, test rows after.
# The boundary is derived from the saved label vector instead of a hard-coded 891,
# so the features always line up with `survived_train` even if the CSV size changes.
n_train = len(survived_train)
df_train = data_new.iloc[:n_train]
df_test = data_new.iloc[n_train:]

## sklearn accepts pandas DataFrames directly, so the features stay as DataFrames; only the target is converted to a NumPy array

In [None]:
# Feature matrices for fitting (X) and for the Kaggle submission (test).
X, test = df_train, df_test
# Target: the "Survived" column that was set aside before it was dropped.
y = survived_train.to_numpy()

## Fitting models

In [None]:
# Shallow decision tree. random_state is fixed because scikit-learn permutes the
# candidate features at every split, so an unseeded tree can differ between runs.
clf = DecisionTreeClassifier(max_depth=5, random_state=54)
#clf=RandomForestClassifier(max_depth=5,n_estimators=25)

clf

In [None]:
# Train on every labelled row, then predict the unlabelled test rows.
pred = clf.fit(X, y).predict(test)
# Attach the predictions to test_data for the submission file.
test_data["Survived"] = pred

In [None]:
# Write the current model's predictions as a two-column file without the index.
test_data.loc[:, ["PassengerId", "Survived"]].to_csv(
    "data/predictions/DT.csv",
    index=False,
)
# Alternative target when the random forest above is used instead:
# test_data[["PassengerId","Survived"]].to_csv("data/predictions/RF.csv",index=False)


## Accuracy on Kaggle: 0.77990  DecisionTreeClassifier(max_depth=5)
### Based on "Sex_male","Age","Fare","SibSp"

In [None]:
# Hold out 20% of the labelled rows for local evaluation; the fixed seed makes the
# split reproducible. (train_test_split / cross_val_score are imported in the
# import cell at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=54
)

In [None]:
# Random forest evaluated on the hold-out split.
# random_state pins the forest's internal sampling so the score is reproducible.
clf = RandomForestClassifier(max_depth=78, n_estimators=25, random_state=54)
# clf=RandomForestClassifier(max_depth=5,n_estimators=25)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); it is imported in the top import cell.
acc = accuracy_score(y_test, pred)
acc

In [None]:
# Sweep max_depth over 1..99 and record hold-out accuracy for each depth.
# A fixed random_state makes the depths comparable: without it the run-to-run
# noise of the forest can decide which depth "wins".
# NOTE: the depth is selected on the same hold-out split that reports the
# accuracy, so the best score printed here is an optimistic estimate.
acc_l = []
for i in range(1, 100):
    clf = RandomForestClassifier(max_depth=i, n_estimators=25, random_state=54)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    acc_l.append(accuracy_score(y_test, pred))

best_acc = max(acc_l)
# list.index returns the first occurrence; +1 maps the 0-based position back to max_depth.
print(best_acc, 1 + acc_l.index(best_acc))

In [None]:
# Final random forest: fit on all labelled rows and predict the test rows.
# random_state makes both the submission and the CV scores reproducible.
clf = RandomForestClassifier(max_depth=9, n_estimators=25, random_state=54)
clf.fit(X, y)
pred = clf.predict(test)

# k-fold cross-validation of the same configuration (cross_val_score fits clones,
# so the fitted `clf` above is untouched).
k = 6
# With no `scoring` argument the estimator's own .score is used, which for a
# classifier is mean accuracy (not R^2).
cv_result = cross_val_score(clf, X, y, cv=k)
print('CV Scores: ',cv_result)
print('CV scores average: ',np.mean(cv_result))

In [None]:
# Attach the latest predictions to test_data for the submission file.
test_data = test_data.assign(Survived=pred)

In [None]:
# Write the tuned random-forest predictions as a two-column file without the index.
test_data.loc[:, ["PassengerId", "Survived"]].to_csv(
    "data/predictions/RF_tuned.csv",
    index=False,
)

In [None]:
## Accuracy on Kaggle: 0.77990  clf=RandomForestClassifier(max_depth=9,n_estimators=25)
### Based on "Sex_male","Age","Fare","SibSp"

In [None]:
# Grid of SVC hyper-parameters to try (3 values of C x 2 values of gamma).
parameters = {'C': [1, 10, 100],
              'gamma': [0.1, 0.01]}
clf = SVC()
cv = GridSearchCV(clf, parameters, cv=3)

# Tune on the training split only. X_test/y_test were carved out of X/y, so
# fitting the search on all of X and then scoring on X_test would evaluate the
# model on rows it was trained on and overstate the accuracy.
cv.fit(X_train, y_train)
y_pred = cv.predict(test)
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

In [None]:
# Final SVC for the submission: refit once on all labelled rows with the
# hyper-parameters selected by the grid search in the previous cell.
# The earlier version wrapped SVC(C=100, gamma=0.01) in a second GridSearchCV over
# the same grid; the search overrides C and gamma, so those literals were never
# used and the whole search was simply repeated.
clf = SVC(**cv.best_params_)
clf.fit(X, y)
y_pred = clf.predict(test)

In [None]:
# Attach the SVC predictions to test_data for the submission file.
test_data = test_data.assign(Survived=y_pred)

In [None]:
# Write the SVC predictions as a two-column file without the index.
test_data.loc[:, ["PassengerId", "Survived"]].to_csv(
    "data/predictions/svc.csv",
    index=False,
)

## Starting with Feature Engineering