**1.IMPORTING IMPORTANT LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


**2. LOADING THE DATA**


In [None]:
# Read the two CSV files from the input directory: `df` receives train.csv
# and `test` receives test.csv.
train_path, test_path = "../input/train.csv", "../input/test.csv"
df = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# First look at the training frame: the first five rows, the column names,
# summary statistics for every column, and the missing-value count per column.
for overview in (
    df.head(5),
    df.columns,
    df.describe(include='all'),
    df.isnull().sum(),
):
    print(overview)

**4. DATA ANALYSIS**

**Classification on the basis of class**


In [None]:
# Count passengers per ticket class, separately for survivors and non-survivors.
# value_counts() orders its result by frequency, not by class number, so sort
# by Pclass to get a fixed, predictable column order.
survived_class = df[df['Survived']==1]['Pclass'].value_counts().sort_index()
dead_class = df[df['Survived']==0]['Pclass'].value_counts().sort_index()
df_class=pd.DataFrame([survived_class,dead_class]).sort_index(axis=1)
df_class.index=['Survived','Dead']
# Build each label from the actual Pclass value instead of assigning labels by
# position, so a label can never be attached to the wrong class.
df_class.columns=['Class {}'.format(pclass) for pclass in df_class.columns]
print(df_class)
df_class.plot(kind='bar')
plt.ylabel('No. of people',size=15,color='green')
plt.xlabel('Survival',size=20,color='blue')
plt.show()
# Survival percentage per class: survivors / (survivors + dead) * 100.
Class1_survived = df_class.loc['Survived','Class 1']/df_class['Class 1'].sum()*100
Class2_survived = df_class.loc['Survived','Class 2']/df_class['Class 2'].sum()*100
Class3_survived = df_class.loc['Survived','Class 3']/df_class['Class 3'].sum()*100
print('Percentage of Class1 passenger survived is ',round(Class1_survived),'%')
print('Percentage of Class2 passenger survived is ',round(Class2_survived),'%')
print('Percentage of Class3 passenger survived is ',round(Class3_survived),'%')

**Classification on the basis of gender**

In [None]:
# Count passengers per sex, separately for survivors and non-survivors.
survived_gender=df[df['Survived']==1]['Sex'].value_counts()
dead_gender=df[df['Survived']==0]['Sex'].value_counts()
# Build the table with one ROW per sex and one COLUMN per outcome. Passing the
# two Series as a dict aligns them on their 'female'/'male' index, so every
# count ends up under the label that actually describes it.
df_gender=pd.DataFrame({'Survived':survived_gender,'Dead':dead_gender})
# Fix the row order explicitly before attaching the display labels.
df_gender=df_gender.reindex(['female','male'])
df_gender.index=['Female','Male']
print(df_gender)
df_gender.plot(kind='bar')
plt.ylabel('No. of people',size=15,color='green')
plt.xlabel('Sex',size=20,color='blue')
plt.show()
# Survival percentage per sex: survivors of that sex / all passengers of that sex.
female_survived=df_gender.loc['Female','Survived']/df_gender.loc['Female'].sum()*100
male_survived=df_gender.loc['Male','Survived']/df_gender.loc['Male'].sum()*100
print('Percentage of male passengers survived is ',round(male_survived),'%')
print('Percentage of female passengers survived is ',round(female_survived),'%')

**Classification on the basis of Age**

In [None]:
# Age bracket edges and the label given to each bracket; pd.cut assigns every
# known age to one bracket and stores the result in a new AgeGroup column.
labels = ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
bins = [0, 5, 12, 18, 24, 35, 60, np.inf]
df['AgeGroup'] = pd.cut(df["Age"], bins, labels=labels)

# The mean of the 0/1 Survived flag within a bracket is that bracket's survival rate.
age_survival = df[['AgeGroup', 'Survived']].groupby(['AgeGroup'], as_index=False).mean()
print(age_survival)

sns.barplot(data=df, x="AgeGroup", y="Survived")
plt.title('Age vs Survival Rate', color='Black', size=20)
plt.xlabel('AgeGroup', color='blue', size=18)
plt.ylabel('Survival Rate', color='green', size=18)
plt.show()

**Classification on the basis of  Fare**

In [None]:
# Fare brackets used only for this exploratory plot.
bins = [0,100,250,600]
labels=['Economic Class','Business Class','First Class']
# include_lowest=True closes the first interval on the left, so a fare of
# exactly 0 lands in 'Economic Class' instead of becoming NaN and silently
# dropping out of the survival-rate calculation below.
df['Class']=pd.cut(df['Fare'],bins,labels=labels,include_lowest=True)
# The mean of the 0/1 Survived flag within a bracket is that bracket's survival rate.
print(df[['Class','Survived']].groupby(['Class'],as_index=False).mean())
sns.barplot(x="Class",y="Survived",data=df)
plt.xlabel('Class',color='blue',size=18)
plt.ylabel('Survival Rate',color='green',size=18)
plt.title('First Class Passengers got the maximum survival rate',color='Black',size=20)
plt.show()

**5. DATA CLEANING**

**Cabin Column**

Checking the number of values in 'Cabin' since it seems a tedious column considering the large number of missing values

In [None]:
# len() counts every row, including the missing ones, so on its own it says
# nothing about how sparse the column is. Report the number of missing Cabin
# values next to the total row count.
cabin_total = len(df['Cabin'])
cabin_missing = df['Cabin'].isnull().sum()
print(cabin_missing, 'of', cabin_total, 'Cabin values are missing')

Since 687 of the 891 values are missing, any prediction based on this column would be difficult and not very accurate, so I am dropping it.

In [None]:
# Remove the Cabin column from both frames, then display the remaining
# training columns as the cell output.
df = df.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])
df.columns

**Embarked Column**

 Checking the distribution of embarked column.

In [None]:
# Count the training rows for each embarkation code by summing a boolean mask.
print("Number of people embarking in Southampton (S):")
southampton = int((df["Embarked"] == "S").sum())
print(southampton)

print("Number of people embarking in Cherbourg (C):")
cherbourg = int((df["Embarked"] == "C").sum())
print(cherbourg)

print("Number of people embarking in Queenstown (Q):")
queenstown = int((df["Embarked"] == "Q").sum())
print(queenstown)

**Ticket Column**

Ticket serial number will not yield any information about the survival, so dropping it.

In [None]:
# Remove the Ticket column from both frames, then display the remaining
# test columns as the cell output.
df = df.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])
test.columns

**Age Feature**

Extracting each passenger's title from the Name column; the titles are used below to fill in the missing ages.

In [None]:
# Keep references to both frames so the same transformation can be applied to
# each of them in a single loop.
combine = [df, test]

# Extract a title for each Name in the train and test datasets: the run of
# letters that follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python string escape.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against sex for the training data.
print(pd.crosstab(df['Title'], df['Sex']))

Making it more simple by converting them into basic categories.

In [None]:
# Lookup table that collapses the extracted titles: uncommon titles become
# 'Rare', and the variants Mlle/Ms/Mme are folded into Miss/Miss/Mrs.
title_groups = {
    rare_title: 'Rare'
    for rare_title in ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona')
}
title_groups.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_groups)

# Survival rate per (grouped) title in the training data.
df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Filling in the missing age values and grouping the ages into 5 categories.

In [None]:
def _fill_and_band_age(frame):
    """Return the Age column of `frame` with gaps filled and values banded.

    Missing ages are replaced by the mean age of passengers in the same frame
    that share the same Title. Any age still missing afterwards (a title with
    no known age at all, or a row without a title) falls back to the frame's
    overall mean age, so the integer conversion below cannot fail on NaN.

    The filled ages are truncated to whole years and mapped to five ordinal
    codes: 0 (<=16), 1 (17-32), 2 (33-48), 3 (49-64), 4 (>64).
    """
    age = frame.groupby(['Title'])['Age'].transform(lambda x: x.fillna(x.mean()))
    age = age.fillna(frame['Age'].mean())
    age = age.astype(int)
    # labels=False makes pd.cut return the integer position of the bin,
    # which is exactly the 0-4 code wanted here.
    return pd.cut(age, bins=[-np.inf, 16, 32, 48, 64, np.inf], labels=False)


# Apply the same imputation and banding to both frames; the columns are
# assigned in place so other references to these frames see the update.
df['Age'] = _fill_and_band_age(df)
test['Age'] = _fill_and_band_age(test)
df['Age'].head(5)

Mapping Sex to 0 and 1.

In [None]:
# Numeric encoding of the Sex column: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    encoded = dataset['Sex'].map(sex_codes)
    dataset['Sex'] = encoded.astype(int)

test.head(5)

**Name Feature**

It is no longer useful, as we have already extracted the title information from it to improve the Age feature.

In [None]:
# Remove the Name column from both frames, then display the remaining
# training columns as the cell output.
df = df.drop(columns=['Name'])
test = test.drop(columns=['Name'])
df.columns

**AgeGroup Feature**

Everything needed from the AgeGroup column has been done, so it is time to drop it.

In [None]:
# AgeGroup exists only on the training frame; remove it in place.
df.drop(columns=['AgeGroup'], inplace=True)

**Sex Feature**

Getting the dummy column for male and female.

In [None]:
# Replace the Sex column with one indicator column per value.
# Sex was already encoded as 0 (male) / 1 (female), so without a prefix the
# dummy columns would be named with the integers 0 and 1. That leaves the
# frame with a mix of integer and string column names, which recent
# scikit-learn versions reject as feature names. prefix='Sex' yields the
# string names 'Sex_0' (male) and 'Sex_1' (female) instead.
df = pd.concat([df.drop('Sex', axis=1), pd.get_dummies(df['Sex'], prefix='Sex')], axis=1)
test = pd.concat([test.drop('Sex', axis=1), pd.get_dummies(test['Sex'], prefix='Sex')], axis=1)
test.head(5)

**Class Feature**

Now , time to drop the class column.

In [None]:
# The fare-based Class column exists only on the training frame; remove it in
# place and show the first rows of the result.
df.drop(columns=['Class'], inplace=True)
df.head()

**EMBARKED FEATURE**

We will fill the missing values with 'S', since it accounts for approximately 72% of the values in this feature, and map the values to numbers.

In [None]:
# Numeric codes for the embarkation port: S -> 1, C -> 2, Q -> 3.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}

# Assign the result back to the column rather than calling
# `df['Embarked'].replace(..., inplace=True)`: in-place methods on a selected
# column act on a temporary object under pandas copy-on-write and would leave
# the frame unchanged. Missing values (and any value that is not one of the
# three known codes) are filled with 1, i.e. 'S'; the column is then stored
# as integers.
df['Embarked'] = df['Embarked'].map(embarked_codes).fillna(1).astype(int)
test['Embarked'] = test['Embarked'].map(embarked_codes).fillna(1).astype(int)

test.head(5)

**Fare feature**

Fare feature should be divided into various categories to make sure it is suitable for prediction.

In [None]:
# Fill in missing Fare values in the test set with the mean training fare of
# the passenger's Pclass (rounded to 4 decimals). This is done as one
# vectorized fillna instead of `test["Fare"][x] = ...`, which is chained
# assignment and is not guaranteed to write into the frame.
class_mean_fare = df.groupby('Pclass')['Fare'].mean().round(4)
test['Fare'] = test['Fare'].fillna(test['Pclass'].map(class_mean_fare))

# Map Fare values into four ordinal bands. The quartile edges are computed on
# the training data only and reused for the test data, so that a given band
# label stands for the same fare range in both frames.
fare_labels = [1, 2, 3, 4]
df['FareBand'], fare_edges = pd.qcut(df['Fare'], 4, labels=fare_labels, retbins=True)
# Open the outer edges so test fares below the training minimum or above the
# training maximum still fall into the first / last band instead of NaN.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test['FareBand'] = pd.cut(test['Fare'], bins=fare_edges, labels=fare_labels)

# Drop the raw Fare values now that the banded feature exists.
df = df.drop(['Fare'], axis = 1)
test = test.drop(['Fare'], axis = 1)

Further dropping the title.

In [None]:
# Remove the Title column from both frames, then display the remaining
# test columns as the cell output.
df = df.drop(columns=['Title'])
test = test.drop(columns=['Title'])
test.columns

**6. BEST MODEL**


**Splitting the data**

Here I'm holding out 35% of the data for validation.

In [None]:
from sklearn.model_selection import train_test_split

# The label is the Survived column; the features are every other column
# except the passenger identifier.
target = df['Survived']
predictors = df.drop(columns=['Survived', 'PassengerId'])

# Hold out 35% of the rows for validation, with a fixed random_state.
x_train, x_cv, y_train, y_cv = train_test_split(
    predictors,
    target,
    test_size=0.35,
    random_state=0,
)

**Choosing the best learning algorithm for our model**


Woah!!! Everything looks fine now considering the data. Time to choose the best algorithm for our model out of these four.

**1.KNN**

**2.Logistic Regression**

**3.Random Forest Classifier**

**4.SVM**


**Applying KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit a k-nearest-neighbours classifier with default settings and score it on
# the held-out rows; accuracy is reported as a percentage with two decimals.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_cv)
knn_accuracy = accuracy_score(y_cv, y_pred)
acc_knn = round(knn_accuracy * 100, 2)
print(acc_knn)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit with default settings and score on the held-out rows; accuracy is
# reported as a percentage with two decimals.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_cv)
logreg_accuracy = accuracy_score(y_cv, y_pred)
acc_logreg = round(logreg_accuracy * 100, 2)
print(acc_logreg)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# A random forest is built from randomised trees, so without a fixed seed the
# score printed below changes from run to run. random_state=0 makes the result
# reproducible, matching the seed used for the train/validation split.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_cv)
# accuracy_score takes (y_true, y_pred); accuracy is reported as a percentage.
acc_randomforest = round(accuracy_score(y_cv, y_pred) * 100, 2)
print(acc_randomforest)

In [None]:
# Support Vector Machines
from sklearn.svm import SVC

# Fit with default settings and score on the held-out rows; accuracy is
# reported as a percentage with two decimals.
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_cv)
svc_accuracy = accuracy_score(y_cv, y_pred)
acc_svc = round(svc_accuracy * 100, 2)
print(acc_svc)

Now let's compare the scores obtained by the various methods.

In [None]:
# Collect the validation accuracy of each model into one table and show it
# ordered from best to worst score.
method_names = ['KNN', 'Logistic Regression',
                'Random Forest', 'Support Vector Machine']
method_scores = [acc_knn, acc_logreg,
                 acc_randomforest, acc_svc]
models = pd.DataFrame({'Method': method_names, 'Score': method_scores})
models.sort_values(by='Score', ascending=False)

**SUBMISSION TIME**

I'll use the Random Forest classifier model for the submission.

In [None]:
# Train the model used for the submission. It is a random forest, so it gets
# its own name instead of overwriting `svc` (the support vector model), and a
# fixed seed so the submitted predictions are reproducible.
final_model = RandomForestClassifier(random_state=0)
final_model.fit(x_train, y_train)
# Select the test columns by the training column names, in the training
# order: this drops PassengerId and guarantees the features line up with what
# the model was fitted on (a missing column raises a KeyError here rather
# than producing misaligned predictions).
y_pred = final_model.predict(test[x_train.columns])
print(y_pred)

In [None]:
# Pair every test PassengerId with its predicted Survived value and preview
# the first five rows.
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.head(5)

In [None]:
# Write the submission table to a CSV file without the row index, then
# confirm which file was written.
filename = 'TPredictions.csv'
submission.to_csv(filename, index=False)
print('Saved file: ' + filename)