In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Load the Titanic passenger table; the first CSV column is used as the row index.
df = pd.read_csv(
    'titanic.csv',
    index_col=[0],
)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Remove the columns that are not used as model features (mutates `df` in place).
unused_columns = ['Cabin', 'Name', 'Ticket']
df.drop(columns=unused_columns, inplace=True)

In [5]:
# Count the missing values in every remaining column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
# Impute missing ages with the mean age of passengers of the same sex.
# `transform('mean')` returns a Series aligned to df's own index, so the fill lines up
# row-for-row. `groupby(...).apply(...)` may instead return a result whose index has the
# group key prepended (depending on the pandas version), which breaks the assignment.
mean_age_by_sex = df.groupby('Sex')['Age'].transform('mean')
df['Age'] = df['Age'].fillna(mean_age_by_sex)

In [7]:
# Frequency of each embarkation port (used to pick the fill value for the missing entries).
df.loc[:, 'Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
# Re-check the missing values after the Age imputation.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [9]:
# Fill the missing embarkation ports with 'S', the most frequent port in the counts above.
df['Embarked'] = df['Embarked'].fillna('S')

In [10]:
# Combine siblings/spouses and parents/children into one family-size feature,
# then drop the two source columns (mutates `df` in place).
df['Family'] = df['SibSp'].add(df['Parch'])
df.drop(columns=['SibSp', 'Parch'], inplace=True)

In [11]:
# Show the cleaned frame after the imputations and the Family feature were applied.
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.000000,7.2500,S,1
2,1,1,female,38.000000,71.2833,C,1
3,1,3,female,26.000000,7.9250,S,0
4,1,1,female,35.000000,53.1000,S,1
5,0,3,male,35.000000,8.0500,S,0
...,...,...,...,...,...,...,...
887,0,2,male,27.000000,13.0000,S,0
888,1,1,female,19.000000,30.0000,S,0
889,0,3,female,27.915709,23.4500,S,3
890,1,1,male,26.000000,30.0000,C,0


In [12]:
# Split the frame into the target column and the feature matrix.
y = df['Survived']
X = df.drop(columns='Survived')

In [15]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 99.8/99.8 MB 3.2 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report,confusion_matrix

In [17]:
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier

In [18]:
# Hold out 25% of the passengers for testing; the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [19]:
# Candidate classifiers to compare.
# The estimators that accept a seed get a fixed `random_state` so that scores are
# reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=1)
lr = LogisticRegression()
knn = KNeighborsClassifier()
xgc = XGBClassifier(random_state=1)

In [20]:
# One-hot encode the categorical columns and pass the numeric ones through unchanged.
# - `sparse_threshold=0` makes the ColumnTransformer always return a dense array (the scaler
#   that follows centres the data, which needs dense input). This replaces the
#   `OneHotEncoder(sparse=False)` argument, which newer scikit-learn releases no longer accept.
# - `handle_unknown='ignore'` encodes a category not seen during fit as all zeros instead of
#   raising, so the saved pipeline does not crash on unexpected input.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked']),
    remainder='passthrough',
    sparse_threshold=0,
)

In [21]:
# Standardise every feature column (centre to zero mean, scale to unit variance).
scaler = StandardScaler(with_mean=True, with_std=True)

In [22]:
# Fit every candidate behind the same preprocessing steps and report its train/test accuracy,
# classification report and confusion matrix.
# NOTE: `pipe` and `y_pred` remain bound to the last model (xgc) after the loop finishes.
separator = '-' * 50
for model in (lr, rf, knn, xgc):
    pipe = make_pipeline(column_trans, scaler, model)
    print(separator)
    print('Model used', model)
    pipe.fit(x_train, y_train)
    train_score = pipe.score(x_train, y_train)
    print('Training Score', train_score)
    test_score = pipe.score(x_test, y_test)
    print('Testing Score', test_score)
    y_pred = pipe.predict(x_test)
    report = classification_report(y_test, y_pred)
    print(report)
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

--------------------------------------------------
Model used LogisticRegression()
Training Score 0.811377245508982
Testing Score 0.7982062780269058
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       128
           1       0.80      0.71      0.75        95

    accuracy                           0.80       223
   macro avg       0.80      0.79      0.79       223
weighted avg       0.80      0.80      0.80       223

[[111  17]
 [ 28  67]]
--------------------------------------------------
Model used RandomForestClassifier()
Training Score 0.9895209580838323
Testing Score 0.7623318385650224
              precision    recall  f1-score   support

           0       0.74      0.90      0.81       128
           1       0.81      0.58      0.67        95

    accuracy                           0.76       223
   macro avg       0.78      0.74      0.74       223
weighted avg       0.77      0.76      0.75       223

[[115  13]
 [ 40  55

In [23]:
# Hyperparameter Tuning

In [24]:
# Build the logistic-regression pipeline used for the rest of the notebook and fit it here.
# make_pipeline does not fit anything by itself; without the explicit fit this cell would only
# work because `column_trans`, `scaler` and `lr` were fitted as a side effect of the
# model-comparison loop.
pipe = make_pipeline(column_trans, scaler, lr).fit(x_train, y_train)

In [25]:
# Turn the held-out labels into a one-column frame named "actual" and move the
# PassengerId index into a regular column.
# NOTE: this rebinds `y`, which previously held the full target column.
y = y_test.to_frame(name="actual").reset_index()

In [26]:
# Discard the PassengerId column so only the "actual" label remains (mutates `y` in place).
y.drop(columns='PassengerId', inplace=True)

In [None]:
# Class probabilities for the test set; column 1 holds the probability of the positive class.
y_prob = pipe.predict_proba(x_test)
positive_class_prob = y_prob[:, 1]
prob = pd.DataFrame({"pred_probability": positive_class_prob})

In [None]:
# Predicted labels from the same pipeline that produced the probabilities.
# The leftover `y_pred` variable must not be reused here: it still holds the predictions of the
# last model fitted in the comparison loop, not those of `pipe`.
pred = pd.DataFrame(pipe.predict(x_test), columns=['Predicted'])

In [None]:
# Place predicted label, predicted probability and actual label side by side.
model1 = pd.concat(objs=[pred, prob, y], axis='columns')

In [None]:
# Inspect the combined table of predicted labels, predicted probabilities and actual labels.
model1

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Both metrics expect a 1-D score vector, so take the single probability column out of `prob`.
scores = prob["pred_probability"]

# Area under the ROC curve. It is printed explicitly because a bare expression in the
# middle of a cell is never displayed.
auc = roc_auc_score(y_test, scores)
print('AUC:', auc)

# False/true positive rates and the decision thresholds at which they were computed.
fpr, tpr, thresholds = roc_curve(y_test, scores)
thresholds

In [None]:
# Plot the ROC curve against the diagonal of a random classifier.
# The AUC belongs to the model's curve, so it is shown in that curve's legend entry
# (it was previously attached to the diagonal reference line).
plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import recall_score,accuracy_score

In [None]:
# Accuracy of the classifier at every threshold returned by roc_curve.
# - roc_curve treats a sample as positive when score >= threshold, so the same comparison is
#   used here; a strict `>` would evaluate a different cut-off than the one on the curve.
# - The scores are taken as a 1-D array so the predictions have the same shape as y_test.
# - The predictions are kept local to the comprehension instead of overwriting `y_pred`.
scores = prob["pred_probability"].to_numpy()
accuracy = [
    accuracy_score(y_test, (scores >= threshold).astype(int))
    for threshold in thresholds
]

In [None]:
# Tabulate accuracy against threshold and list the best-performing thresholds first.
accuracy_model = pd.DataFrame({"accuracy": accuracy, "threshold": thresholds})
accuracy_model.sort_values(by="accuracy", ascending=False)

In [None]:
import pickle

In [None]:
# Persist the fitted pipeline. The context manager guarantees the file is flushed and closed,
# even if pickling raises.
with open('Titanic_Survivour.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Reload the pipeline from disk; the context manager closes the file handle afterwards.
# NOTE: unpickling executes arbitrary code, so only load pickle files from a trusted source.
with open('Titanic_Survivour.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Display the feature frame to check the column names and order that the pipeline was trained on.
X

In [None]:
# A single hand-made passenger, with the same columns (and order) as the feature frame X.
passenger = {
    'Pclass': 1,
    'Sex': 'male',
    'Age': 30,
    'Fare': 60,
    'Embarked': 'Q',
    'Family': 0,
}
test = pd.DataFrame([passenger])

In [None]:
# Predict the survival class of the hand-made passenger with the pipeline reloaded from disk.
model.predict(test)

In [None]:
# Save the cleaned frame (including its PassengerId index) for later reuse.
df.to_csv(path_or_buf='df_clean.csv', index=True)