### Логистическая регрессия

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#### Titanic

In [2]:
# Load the Titanic training set from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='./train.csv')
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Установим в качестве индекса PassengerId

In [3]:
# Use PassengerId as the row index.
# Guarded and non-inplace: once PassengerId is already the index it is no longer
# a column, so an unguarded set_index would raise KeyError when the cell is re-run.
if data.index.name != 'PassengerId':
    data = data.set_index('PassengerId')

In [4]:
# Preview again: PassengerId should now be the index rather than a regular column.
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# List the column names that remain after PassengerId was moved to the index.
data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# Name of the column the model is trained to predict.
target = 'Survived'

In [7]:
# Label vector: the target column, indexed like `data`.
y = data.loc[:, target]

In [8]:
# Feature matrix: every column of `data` except the target.
X = data.drop(columns=[target])

Изучение качества и очистка данных

In [9]:
# Inspect dtypes and non-null counts to find the columns with missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


In [10]:
# Remove the Cabin, Name and Ticket columns from the feature matrix.
X = X.drop(columns=['Cabin', 'Name', 'Ticket'])

In [11]:
# Average age over the passengers whose age is known (NaN entries are skipped).
# NOTE(review): this is computed on the full dataset, before the train/validation
# split further down, so the imputation value also reflects validation rows.
mean_age = X['Age'].mean(skipna=True)
mean_age

29.69911764705882

In [13]:
# Impute missing ages with the mean age computed above.
X = X.assign(Age=X['Age'].fillna(mean_age))

In [15]:
# Distinct ages after imputation; the imputed mean appears as one of the values.
X['Age'].unique()

array([22.        , 38.        , 26.        , 35.        , 29.69911765,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [16]:
# Passenger count per embarkation code (rows with a missing value are not counted).
X['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [20]:
# Most frequent embarkation code; mode() returns a Series, so take its first entry.
embarked_mode = X['Embarked'].mode().iloc[0]
embarked_mode

'S'

In [21]:
# Impute the missing embarkation codes with the most frequent one.
X['Embarked'] = X['Embarked'].fillna(value=embarked_mode)

In [22]:
# Confirm that no column has missing values left after dropping and imputing.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 55.7+ KB


#### Работа с категориальными переменными

#### Sex

In [24]:
# Distribution of the Sex column before it is encoded numerically.
X['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [26]:
# Encode Sex as a binary indicator: 1 = female, 0 = male.
# Guarded on dtype: if the cell is re-run after the column is already numeric,
# comparing integers with 'female' would silently reset every row to 0.
if not pd.api.types.is_numeric_dtype(X['Sex']):
    X['Sex'] = (X['Sex'] == 'female').astype(int)

In [28]:
# Sanity check: the 0/1 counts should match the male/female counts seen before encoding.
X['Sex'].value_counts()

0    577
1    314
Name: Sex, dtype: int64

#### Embarked

In [30]:
# One-hot encode the remaining non-numeric columns; at this point that is Embarked,
# which becomes Embarked_C / Embarked_Q / Embarked_S.
X = pd.get_dummies(data=X)
X.head(n=5)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.25,0,0,1
2,1,1,38.0,1,0,71.2833,1,0,0
3,3,1,26.0,0,0,7.925,0,0,1
4,1,1,35.0,1,0,53.1,0,0,1
5,3,0,35.0,0,0,8.05,0,0,1


In [31]:
# Check dtypes after one-hot encoding Embarked: all columns should now be numeric.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Sex         891 non-null    int64  
 2   Age         891 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Embarked_C  891 non-null    uint8  
 7   Embarked_Q  891 non-null    uint8  
 8   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 51.3 KB


#### Pclass

In [32]:
# Pclass is stored as integers but takes only a handful of distinct class codes.
X['Pclass'].unique()

array([3, 1, 2])

In [33]:
# Mark Pclass as categorical so that get_dummies treats it as a category
# rather than as a numeric feature.
X = X.astype({'Pclass': 'category'})

In [35]:
# One-hot encode the category-typed Pclass column into Pclass_1 / Pclass_2 / Pclass_3.
X = pd.get_dummies(data=X)

In [36]:
# Final check of the feature matrix: column list and dtypes after encoding Pclass.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex         891 non-null    int64  
 1   Age         891 non-null    float64
 2   SibSp       891 non-null    int64  
 3   Parch       891 non-null    int64  
 4   Fare        891 non-null    float64
 5   Embarked_C  891 non-null    uint8  
 6   Embarked_Q  891 non-null    uint8  
 7   Embarked_S  891 non-null    uint8  
 8   Pclass_1    891 non-null    uint8  
 9   Pclass_2    891 non-null    uint8  
 10  Pclass_3    891 non-null    uint8  
dtypes: float64(2), int64(3), uint8(6)
memory usage: 47.0 KB


### Разбиение данных на тренировочные, валидационные и тестовые датасеты

In [38]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

### Построение модели

In [39]:
# Logistic regression classifier; no arguments are passed, so every
# hyperparameter is left at the library default.
lr = LogisticRegression()

In [40]:
# Fit the classifier on the training split.
lr.fit(X=X_train, y=y_train)

LogisticRegression()

In [41]:
# Predicted class labels for the validation split.
y_pred = lr.predict(X_valid)

In [42]:
# Display the predicted labels for the validation set.
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0])

In [61]:
# Side-by-side table of true labels and predictions on the validation set.
# y_valid is converted to a plain array, so rows are numbered 0..n-1
# rather than by PassengerId.
df_error = pd.DataFrame(
    data={'y_valid': y_valid.to_numpy(), 'y_pred': y_pred},
    columns=['y_valid', 'y_pred'],
)
df_error

Unnamed: 0,y_valid,y_pred
0,1,0
1,0,0
2,0,0
3,1,1
4,1,1
...,...,...
218,1,1
219,0,0
220,0,0
221,0,1


#### Метрика Accuracy 

In [62]:
from sklearn.metrics import accuracy_score

In [63]:
# Share of validation samples whose predicted label equals the true label.
accuracy_score(y_true=y_valid, y_pred=y_pred)

0.8026905829596412

In [64]:
# Predict on the training split as well, so training accuracy can be compared
# with validation accuracy.
y_pred_train = lr.predict(X_train)

In [65]:
# Accuracy on the training split, for comparison with the validation accuracy.
accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.8083832335329342

#### Вычисление вероятностей событий

In [66]:
# Class-membership probabilities for the validation set: one row per sample,
# one column per class (columns follow the order of lr.classes_).
y_proba = lr.predict_proba(X_valid)
y_proba

array([[0.88803846, 0.11196154],
       [0.7306994 , 0.2693006 ],
       [0.87015652, 0.12984348],
       [0.08991938, 0.91008062],
       [0.25062017, 0.74937983],
       [0.07826309, 0.92173691],
       [0.3344696 , 0.6655304 ],
       [0.90522441, 0.09477559],
       [0.24542478, 0.75457522],
       [0.10340604, 0.89659396],
       [0.69433675, 0.30566325],
       [0.93487211, 0.06512789],
       [0.62484704, 0.37515296],
       [0.8476136 , 0.1523864 ],
       [0.75859307, 0.24140693],
       [0.07898048, 0.92101952],
       [0.72748704, 0.27251296],
       [0.33438492, 0.66561508],
       [0.70247546, 0.29752454],
       [0.70651162, 0.29348838],
       [0.88386345, 0.11613655],
       [0.6428651 , 0.3571349 ],
       [0.39821924, 0.60178076],
       [0.86944941, 0.13055059],
       [0.89833287, 0.10166713],
       [0.92821101, 0.07178899],
       [0.56597441, 0.43402559],
       [0.72832999, 0.27167001],
       [0.9144287 , 0.0855713 ],
       [0.42638413, 0.57361587],
       [0.

### Сохранение данных

In [67]:
# Persist the train/validation splits as pickle files in the working directory.
for pickle_path, split_part in {
    'X_train.pkl': X_train,
    'y_train.pkl': y_train,
    'X_valid.pkl': X_valid,
    'y_valid.pkl': y_valid,
}.items():
    split_part.to_pickle(pickle_path)