In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [103]:
df = pd.read_csv('dataset/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,,3,"Heikkinen, Miss. Laina",female,,0,0,STON/O2. 3101282,,,S
3,4,,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
890,887,0.0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
891,888,1.0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
892,889,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
893,890,1.0,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  895 non-null    int64  
 1   Survived     891 non-null    float64
 2   Pclass       895 non-null    int64  
 3   Name         895 non-null    object 
 4   Sex          895 non-null    object 
 5   Age          715 non-null    float64
 6   SibSp        895 non-null    int64  
 7   Parch        895 non-null    int64  
 8   Ticket       895 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     893 non-null    object 
dtypes: float64(3), int64(4), object(5)
memory usage: 84.0+ KB


In [105]:
df.isnull().sum()

PassengerId      0
Survived         4
Pclass           0
Name             0
Sex              0
Age            180
SibSp            0
Parch            0
Ticket           0
Fare             4
Cabin          691
Embarked         2
dtype: int64

In [106]:
df.drop(['PassengerId','Ticket','Fare','Cabin','Name'] , inplace=True , axis=1)

In [107]:
df.fillna(method='ffill' , inplace=True)

In [108]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [109]:
x = df.iloc[:,1:]

In [110]:
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,22.0,1,0,S
1,1,female,38.0,1,0,C
2,3,female,38.0,0,0,S
3,1,female,35.0,1,0,S
4,3,male,35.0,0,0,S
...,...,...,...,...,...,...
890,2,male,27.0,0,0,S
891,1,female,19.0,0,0,S
892,3,female,19.0,1,2,S
893,1,male,26.0,0,0,C


In [111]:
y = df.iloc[:,0]

In [112]:
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
890    0.0
891    1.0
892    0.0
893    1.0
894    0.0
Name: Survived, Length: 895, dtype: float64

In [113]:
x_train,x_test,y_train,y_test = train_test_split(x , y , test_size=0.2 ,random_state=2)

In [114]:
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
874,3,male,26.0,0,0,S
671,3,male,25.0,0,0,S
30,1,male,40.0,0,0,C
10,3,female,4.0,1,1,S
335,1,male,45.5,0,0,S
...,...,...,...,...,...,...
534,2,female,2.0,1,1,S
584,2,female,25.0,1,1,S
493,3,male,9.0,1,1,S
527,1,female,44.0,0,1,C


In [115]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
471,1,male,56.00,0,0,S
158,3,male,40.50,0,0,S
317,3,male,28.00,0,0,S
455,3,male,36.00,1,0,S
719,3,male,19.00,0,0,S
...,...,...,...,...,...,...
310,1,female,0.92,0,0,C
694,1,male,31.00,1,0,S
89,3,female,33.00,3,0,S
80,3,male,25.00,0,0,S


In [116]:
y_train

874    0.0
671    0.0
30     0.0
10     1.0
335    0.0
      ... 
534    1.0
584    1.0
493    1.0
527    1.0
168    0.0
Name: Survived, Length: 716, dtype: float64

In [117]:
y_test

471    0.0
158    0.0
317    0.0
455    0.0
719    0.0
      ... 
310    1.0
694    1.0
89     1.0
80     0.0
790    1.0
Name: Survived, Length: 179, dtype: float64

In [118]:
tr = ColumnTransformer(transformers=[
    ('t2',OneHotEncoder(drop='first',sparse=False),['Sex','Embarked'])
] , remainder='passthrough')

In [119]:
a = tr.fit_transform(x_train)
a

array([[ 1.,  0.,  1., ..., 26.,  0.,  0.],
       [ 1.,  0.,  1., ..., 25.,  0.,  0.],
       [ 1.,  0.,  0., ..., 40.,  0.,  0.],
       ...,
       [ 1.,  0.,  1., ...,  9.,  1.,  1.],
       [ 0.,  0.,  0., ..., 44.,  0.,  1.],
       [ 1.,  0.,  1., ...,  1.,  4.,  1.]])

In [124]:
b = tr.fit_transform(x_test)
b

array([[ 1. ,  0. ,  1. , ..., 56. ,  0. ,  0. ],
       [ 1. ,  0. ,  1. , ..., 40.5,  0. ,  0. ],
       [ 1. ,  0. ,  1. , ..., 28. ,  0. ,  0. ],
       ...,
       [ 0. ,  0. ,  1. , ..., 33. ,  3. ,  0. ],
       [ 1. ,  0. ,  1. , ..., 25. ,  0. ,  0. ],
       [ 0. ,  0. ,  1. , ..., 18. ,  0. ,  0. ]])

In [121]:
log = LogisticRegression()

In [122]:
log.fit(a,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [126]:
y_pred = log.predict(b)
y_pred

array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1.,
       0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 1.])

In [127]:
from sklearn.metrics import accuracy_score , confusion_matrix

In [128]:
print("Accuracy matrix =",accuracy_score(y_test,y_pred))

Accuracy matrix = 0.7932960893854749


In [129]:
pd.DataFrame(confusion_matrix(y_test , y_pred)) # if all the values are comes along the diagonal then accuracy is 100%

Unnamed: 0,0,1
0,95,14
1,23,47


In [130]:
result = pd.DataFrame()
result['Actual Label'] = y_test
result['Decision Tree Prediction'] = y_pred

In [131]:
result

Unnamed: 0,Actual Label,Decision Tree Prediction
471,0.0,0.0
158,0.0,0.0
317,0.0,0.0
455,0.0,0.0
719,0.0,0.0
...,...,...
310,1.0,1.0
694,1.0,0.0
89,1.0,0.0
80,0.0,0.0
