## **Importing the libraries**

In [73]:
import pandas as pd # For data processing  
import numpy as np # For linear algebra
import matplotlib.pyplot as plt #For plotting 
from sklearn.preprocessing import LabelEncoder #For non numarical labels
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## **Reading the data and getting info of it**

In [74]:
data=pd.read_csv('titanic.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [75]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## **Viewing the first 10 rows of data**

In [76]:
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## **Dropping the data that are object and not required**

In [77]:
data.drop(['Name','Ticket','Cabin'],axis=1,inplace=True)
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


## **Checking which features have null values**

In [78]:
data.isna().any()

PassengerId    False
Survived       False
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare           False
Embarked        True
dtype: bool

## **Filling the missing values of data**

In [79]:
data.Age.fillna(data.Age.median(), inplace = True)
data.Embarked.fillna(data.Embarked.mode()[0], inplace = True)

## **Coverting non numerical labels**

In [80]:
gtr=LabelEncoder() # Calling the the labelEncoder function 
Sex=gtr.fit_transform(data['Sex'])
Embarked=gtr.fit_transform(data['Embarked'])
data['Sex']=Sex
data['Embarked']=Embarked

In [81]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.0,0,0,13.0000,2
887,888,1,1,0,19.0,0,0,30.0000,2
888,889,0,3,0,28.0,1,2,23.4500,2
889,890,1,1,1,26.0,0,0,30.0000,0


## **Splitting the data for training and testing**

In [82]:
X_train = data.drop("Survived", axis=1)
Y_train = data["Survived"]
X_test  = data.drop("PassengerId", axis=1).copy()
X_train, X_test, Y_train, Y_test=train_test_split(X,Y,test_size=0.2)

## **KNN**

In [83]:
from sklearn.neighbors import KNeighborsClassifier 
knn=KNeighborsClassifier()
knn.fit(X_train,Y_train)
A=knn.predict(X_test)
knn_accuracy=accuracy_score(Y_test,A)
Names=['knn_algorithm']
Accuracy=[knn_accuracy]
knn_accuracy

0.5977653631284916

## **SVM**

In [84]:
from sklearn.svm import SVC
svm=SVC()
svm.fit(X_train,Y_train)
B=svm.predict(X_test)
svm_accuracy=accuracy_score(Y_test,B)
Names.append('SVM_algorithm')
Accuracy.append(svm_accuracy)
svm_accuracy

0.6033519553072626

## **Naive Bayes**

In [85]:
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(X_train,Y_train)
n=naive.predict(X_test)
naivebayes_accuracy=accuracy_score(Y_test,n)
Names.append('Naviebayes_algorithm')
Accuracy.append(naivebayes_accuracy)
naivebayes_accuracy

0.7486033519553073

## **Decision Tree**

In [86]:
#Decision tree classification
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier()
tree.fit(X_train,Y_train)
c=tree.predict(X_test)
DecisionTree_accuracy=accuracy_score(Y_test,c)
Names.append('DecisionTree_algorithm')
Accuracy.append(DecisionTree_accuracy)
DecisionTree_accuracy

0.7486033519553073

## **Random Forest**

In [87]:
#Random forest classification
from sklearn.ensemble import RandomForestClassifier
random=RandomForestClassifier()
random.fit(X_train,Y_train)
d=random.predict(X_test)
RandomForest_accuracy=accuracy_score(Y_test,d)
Names.append('RandomForest_algorithm')
Accuracy.append(RandomForest_accuracy)
RandomForest_accuracy

0.7932960893854749

## **Showing the accuracies obtained in increasing order**

In [88]:

GTR={knn_accuracy*100:"knn_accuracy",naivebayes_accuracy*100:"naivebayes_accuracy",svm_accuracy*100:"svm_accuracy",
     DecisionTree_accuracy*100:"DecisionTree_accuracy",RandomForest_accuracy*100:"RandomForest_accuracy"}
sorted(GTR)
Accuracy=np.array(Accuracy)
Accuracy=Accuracy*100
A=np.argsort(Accuracy)
print('The most accuracte value is obtained by ',Names[A[-1]], 'and the accuracy score percentage is',Accuracy[A[-1]],'%') 
print('The second most accuracte value is obtained by ',Names[A[-2]], 'and the accuracy score percentage is',Accuracy[A[-2]],'%') 
print('The third accuracte value is obtained by ',Names[A[-3]], 'and the accuracy score percentage is',Accuracy[A[-3]],'%') 
print('The second least accuracte value is obtained by ',Names[A[-4]], 'and the accuracy score percentage is',Accuracy[A[-4]],'%') 
print('The least accuracte value is obtained by ',Names[A[-5]], 'and the accuracy score percentage is',Accuracy[A[-5]],'%') 

The most accuracte value is obtained by  RandomForest_algorithm and the accuracy score percentage is 79.3296089385475 %
The second most accuracte value is obtained by  DecisionTree_algorithm and the accuracy score percentage is 74.86033519553072 %
The third accuracte value is obtained by  Naviebayes_algorithm and the accuracy score percentage is 74.86033519553072 %
The second least accuracte value is obtained by  SVM_algorithm and the accuracy score percentage is 60.33519553072626 %
The least accuracte value is obtained by  knn_algorithm and the accuracy score percentage is 59.77653631284916 %


As we can see random forest is the most accurate one we can actually know the importance of each feature 

In [91]:
importance = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random.feature_importances_,3)})
importance = importance.sort_values('importance',ascending=False).set_index('feature')
importance.head(10)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
Sex,0.238
Fare,0.208
PassengerId,0.183
Age,0.171
Pclass,0.091
SibSp,0.043
Embarked,0.037
Parch,0.03


### **By this we can conclude that sex and fare are two major features of this titanic dataset**