In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
def data_prepocessing(df):
    df['Title'] = df['Name'].str.split(',').str[1].str.split('.').str[0]
    df = df.drop(columns=['Name', 'Ticket', 'Cabin'])
    df['Age'] = df['Age'].fillna(30)
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Pclass'] = df['Pclass'].map({1: 'Upper Class', 2: 'Middle Class', 3: 'Lower Class'})
    df['Embarked'] = df['Embarked'].map({'C': 'Cherbourg, France', 'Q': 'Queenstown, Ireland', 'S': 'Southampton, England'})
    df_dummies = pd.get_dummies(df[['Pclass', 'Sex', 'Embarked', 'Title']], prefix='', prefix_sep='', drop_first=True)
    df = df.drop(columns=['Pclass', 'Sex', 'Embarked', 'Title'])
    df = pd.concat([df, df_dummies], axis=1)

    return df

In [3]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df = data_prepocessing(df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PassengerId           891 non-null    int64  
 1   Survived              891 non-null    int64  
 2   Age                   891 non-null    float64
 3   SibSp                 891 non-null    int64  
 4   Parch                 891 non-null    int64  
 5   Fare                  891 non-null    float64
 6   Middle Class          891 non-null    uint8  
 7   Upper Class           891 non-null    uint8  
 8   male                  891 non-null    uint8  
 9   Queenstown, Ireland   891 non-null    uint8  
 10  Southampton, England  891 non-null    uint8  
 11   Col                  891 non-null    uint8  
 12   Don                  891 non-null    uint8  
 13   Dr                   891 non-null    uint8  
 14   Jonkheer             891 non-null    uint8  
 15   Lady                 8

In [9]:
df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Middle Class,Upper Class,male,"Queenstown, Ireland",...,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
0,1,0,22.0,1,0,7.25,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,2,1,38.0,1,0,71.2833,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,1,26.0,0,0,7.925,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,4,1,35.0,1,0,53.1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,0,35.0,0,0,8.05,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [18]:
X = df.iloc[:, 2:11]
X

Unnamed: 0,Age,SibSp,Parch,Fare,Middle Class,Upper Class,male,"Queenstown, Ireland","Southampton, England"
0,22.0,1,0,7.2500,0,0,1,0,1
1,38.0,1,0,71.2833,0,1,0,0,0
2,26.0,0,0,7.9250,0,0,0,0,1
3,35.0,1,0,53.1000,0,1,0,0,1
4,35.0,0,0,8.0500,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,1,0,1,0,1
887,19.0,0,0,30.0000,0,1,0,0,1
888,30.0,1,2,23.4500,0,0,0,0,1
889,26.0,0,0,30.0000,0,1,1,0,0


In [19]:
y = df['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train

Unnamed: 0,Age,SibSp,Parch,Fare,Middle Class,Upper Class,male,"Queenstown, Ireland","Southampton, England"
331,45.5,0,0,28.5000,0,1,1,0,1
733,23.0,0,0,13.0000,1,0,1,0,1
382,32.0,0,0,7.9250,0,0,1,0,1
704,26.0,1,0,7.8542,0,0,1,0,1
813,6.0,4,2,31.2750,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
106,21.0,0,0,7.6500,0,0,0,0,1
270,30.0,0,0,31.0000,0,1,1,0,1
860,41.0,2,0,14.1083,0,0,1,0,1
435,14.0,1,2,120.0000,0,1,0,0,1


In [25]:
X_test

Unnamed: 0,Age,SibSp,Parch,Fare,Middle Class,Upper Class,male,"Queenstown, Ireland","Southampton, England"
709,30.0,1,1,15.2458,0,0,1,0,0
439,31.0,0,0,10.5000,1,0,1,0,1
840,20.0,0,0,7.9250,0,0,1,0,1
720,6.0,0,1,33.0000,1,0,0,0,1
39,14.0,1,0,11.2417,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
433,17.0,0,0,7.1250,0,0,1,0,1
773,30.0,0,0,7.2250,0,0,1,0,0
25,38.0,1,5,31.3875,0,0,0,0,1
84,17.0,0,0,10.5000,1,0,0,0,1


In [26]:
y_test

709    1
439    0
840    0
720    1
39     1
      ..
433    0
773    0
25     1
84     1
10     1
Name: Survived, Length: 179, dtype: int64

In [27]:
y_train

331    0
733    0
382    0
704    0
813    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: Survived, Length: 712, dtype: int64

In [28]:
std = StandardScaler()

In [30]:
X_std_train = std.fit_transform(X_train)
X_std_train

array([[ 1.22453038, -0.47072241, -0.47934164, ...,  0.7243102 ,
        -0.30335547,  0.59248936],
       [-0.50801097, -0.47072241, -0.47934164, ...,  0.7243102 ,
        -0.30335547,  0.59248936],
       [ 0.18500557, -0.47072241, -0.47934164, ...,  0.7243102 ,
        -0.30335547,  0.59248936],
       ...,
       [ 0.87802211,  1.23056874, -0.47934164, ...,  0.7243102 ,
        -0.30335547,  0.59248936],
       [-1.20102751,  0.37992316,  2.04874166, ..., -1.38062393,
        -0.30335547,  0.59248936],
       [-0.66201465, -0.47072241,  0.78470001, ...,  0.7243102 ,
        -0.30335547,  0.59248936]])

In [33]:
X_std_test = std.transform(X_test)
X_std_test

array([[ 0.03100189,  0.37992316,  0.78470001, ...,  0.7243102 ,
        -0.30335547, -1.68779402],
       [ 0.10800373, -0.47072241, -0.47934164, ...,  0.7243102 ,
        -0.30335547,  0.59248936],
       [-0.73901649, -0.47072241, -0.47934164, ...,  0.7243102 ,
        -0.30335547,  0.59248936],
       ...,
       [ 0.64701659,  0.37992316,  5.8408666 , ..., -1.38062393,
        -0.30335547,  0.59248936],
       [-0.970022  , -0.47072241, -0.47934164, ..., -1.38062393,
        -0.30335547,  0.59248936],
       [-1.97104589,  0.37992316,  0.78470001, ..., -1.38062393,
        -0.30335547,  0.59248936]])

In [34]:
logreq = LogisticRegression()

In [35]:
lr = logreq.fit(X_std_train, y_train)

In [38]:
print(f"""
    Coefficients: {lr.coef_}\n
    Intercept: {lr.intercept_}
""")


    Coefficients: [[-0.3897059  -0.34370126 -0.10733138  0.17216707  0.50658465  0.74672855
  -1.27216568 -0.03638207 -0.21416054]]

    Intercept: [-0.67925517]



In [39]:
lr.score(X_std_train, y_train)

0.8075842696629213

In [40]:
lr.score(X_std_test, y_test)

0.7988826815642458

In [42]:
confusion_matrix(y_test, lr.predict(X_std_test))

array([[90, 15],
       [21, 53]], dtype=int64)

In [77]:
logistic_params = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type
    'C': np.linspace(0.1,2,20),                  # Inverse of regularization strength
    # 'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],  # Optimization algorithm
    # 'max_iter': [100, 200, 500],                   # Maximum number of iterations
    # 'fit_intercept': [True, False],                # Whether to include the intercept
    # 'class_weight': [None, 'balanced'],            # Handling class imbalance
    # 'l1_ratio': [0.0, 0.5, 1.0]                    # Only used if penalty='elasticnet'
}


In [78]:
gridseach = GridSearchCV(LogisticRegression(), logistic_params)
gridseach.fit(X_std_train, y_train)
gridseach.best_params_

Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.

{'C': 0.1, 'penalty': 'l2'}

In [61]:
best_params = {'C': 0.3,
 'class_weight': None,
 'fit_intercept': True,
 'l1_ratio': 0.5,
 'max_iter': 100,
 'penalty': 'elasticnet',
 'solver': 'saga'}

In [74]:
loqres = LogisticRegression(**best_params)

lr = loqres.fit(X_std_train, y_train)

In [75]:
lr.score(X_std_train, y_train)

0.8075842696629213

In [76]:
lr.score(X_std_test, y_test)

0.7988826815642458

In [79]:
logreq1 = LogisticRegression(**{'C': 0.1, 'penalty': 'l2'})
lr1 = logreq1.fit(X_std_train, y_train)

In [80]:
lr1.score(X_std_train, y_train)

0.8117977528089888

In [81]:
lr1.score(X_std_test, y_test)

0.8044692737430168

In [88]:
test = data_prepocessing(pd.read_csv('data/test.csv'))
test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Middle Class,Upper Class,male,"Queenstown, Ireland","Southampton, England",Dona,Dr,Master,Miss,Mr,Mrs,Ms,Rev
0,892,34.5,0,0,7.8292,0,0,1,1,0,0,0,0,0,1,0,0,0
1,893,47.0,1,0,7.0,0,0,0,0,1,0,0,0,0,0,1,0,0
2,894,62.0,0,0,9.6875,1,0,1,1,0,0,0,0,0,1,0,0,0
3,895,27.0,0,0,8.6625,0,0,1,0,1,0,0,0,0,1,0,0,0
4,896,22.0,1,1,12.2875,0,0,0,0,1,0,0,0,0,0,1,0,0


In [89]:
PassengerId = test['PassengerId']

In [97]:
data = test.iloc[:, 1:10]
data

Unnamed: 0,Age,SibSp,Parch,Fare,Middle Class,Upper Class,male,"Queenstown, Ireland","Southampton, England"
0,34.5,0,0,7.8292,0,0,1,1,0
1,47.0,1,0,7.0000,0,0,0,0,1
2,62.0,0,0,9.6875,1,0,1,1,0
3,27.0,0,0,8.6625,0,0,1,0,1
4,22.0,1,1,12.2875,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
413,30.0,0,0,8.0500,0,0,1,0,1
414,39.0,0,0,108.9000,0,1,0,0,0
415,38.5,0,0,7.2500,0,0,1,0,1
416,30.0,0,0,8.0500,0,0,1,0,1


In [100]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   418 non-null    float64
 1   SibSp                 418 non-null    int64  
 2   Parch                 418 non-null    int64  
 3   Fare                  418 non-null    float64
 4   Middle Class          418 non-null    uint8  
 5   Upper Class           418 non-null    uint8  
 6   male                  418 non-null    uint8  
 7   Queenstown, Ireland   418 non-null    uint8  
 8   Southampton, England  418 non-null    uint8  
dtypes: float64(2), int64(2), uint8(5)
memory usage: 15.2 KB


In [99]:
data['Fare'] = data['Fare'].fillna(32)

In [101]:
data_std = std.transform(data)

In [102]:
lr1.predict(data_std)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [104]:
data_submission_logres = pd.DataFrame({'PassengerId': PassengerId, 'Survived': lr1.predict(data_std)})

In [105]:
data_submission_logres.to_csv('data/data_submission_logres.csv', index=False)