In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data=pd.read_excel(r'C:\Users\Administrateur\Downloads\titanic3.xls')

In [3]:
data.shape

(1309, 14)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 107.4+ KB


In [5]:
data.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [6]:
data.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [7]:
data = data.drop(['name','age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'boat', 'body', 'home.dest'], axis=1 )

In [8]:
data.sample(7)

Unnamed: 0,pclass,survived,sex,embarked
1290,3,1,female,S
175,1,0,male,S
1228,3,1,male,S
304,1,1,female,C
653,3,1,female,C
833,3,0,male,S
930,3,0,male,Q


In [9]:
data['embarked'].dropna()

0       S
1       S
2       S
3       S
4       S
       ..
1304    C
1305    C
1306    C
1307    C
1308    S
Name: embarked, Length: 1307, dtype: object

In [10]:
data = data.dropna(axis = 0)

In [11]:
X = data.drop(['survived'], axis=1)
y = data['survived']

In [12]:
X.head(4)

Unnamed: 0,pclass,sex,embarked
0,1,female,S
1,1,male,S
2,1,female,S
3,1,male,S


In [13]:
X.shape

(1307, 3)

In [14]:
y.shape

(1307,)

In [15]:
# Encoding the sex and the embarked columns

from sklearn.preprocessing import OneHotEncoder


In [16]:
from sklearn.compose import make_column_transformer
column_trans = make_column_transformer(
    (OneHotEncoder(), ['sex', 'embarked']),
    remainder='passthrough')
# transform only sex & embarked cols not pclass class

In [17]:
column_trans.fit_transform(X)

array([[1., 0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1., 1.],
       ...,
       [0., 1., 1., 0., 0., 3.],
       [0., 1., 1., 0., 0., 3.],
       [0., 1., 0., 0., 1., 3.]])

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
logreg = LogisticRegression(solver='lbfgs')

In [20]:
column_trans.fit_transform(X)

array([[1., 0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1., 1.],
       ...,
       [0., 1., 1., 0., 0., 3.],
       [0., 1., 1., 0., 0., 3.],
       [0., 1., 0., 0., 1., 3.]])

In [21]:
from sklearn.pipeline import make_pipeline

In [22]:
# transform the cols and make a model
pipe = make_pipeline(column_trans, logreg)

In [23]:
from sklearn.model_selection import cross_val_score

In [26]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.6940101196221229

In [49]:
# making some new data

X_new = X.sample(5, random_state=99)
X_new

Unnamed: 0,pclass,sex,embarked
681,3,female,C
365,2,female,S
153,1,female,C
506,2,male,S
86,1,male,S


In [50]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['sex', 'embarked'])],
                                   verbose=False)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                         

In [51]:
pipe.predict(X_new)

array([1, 1, 1, 0, 0], dtype=int64)

In [52]:
pipe.predict(X_new)

array([1, 1, 1, 0, 0], dtype=int64)

In [53]:
pipe.predict_proba(X_new).mean()

0.5000000000000001

In [54]:
pipe.classes_

array([0, 1], dtype=int64)

In [55]:
pipe.decision_function(X_new)

array([ 0.92545228,  1.06189591,  2.49285546, -1.4277945 , -0.64409291])