In [37]:
import pandas as pd
import random
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [38]:
df = pd.read_csv('../Datasets/Titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [40]:
df.shape

(891, 12)

In [41]:
categorical = ['Survived', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
numeric = ['Pclass','Age','SibSp','Parch','Fare']
work_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp','Parch', 'Ticket', 'Fare', 'Embarked']

In [42]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [43]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [44]:
df['Embarked'] = df['Embarked'].fillna('S')
df['Embarked'] = df['Embarked'].replace('NaN',pd.NA)
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

### Jako, że w kolumnie Cabin brakujące wartości stanowią znaczną większość, będziemy prowadzić nad nimi rozważania w formie surowej

In [45]:
df_copy = df.copy()


### Generujemy losowo brakujące wartości

In [46]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [47]:
from Preparation import prepare_to_file, prepare_and_return
from Scoring import score, initialize_result_file
from Scenarios import drop_columns,remove_missing,fill_missing_mode,fill_missing_max,fill_missing_min,fill_missing_mean,fill_missing_regression,fill_missing_zero,standardize,normalize,remove_outliers_lof,encode_categorical

In [48]:
df_copy = prepare_and_return(df,work_columns)
df_copy.isna().sum()

PassengerId      0
Survived         0
Pclass          15
Name             6
Sex             12
Age              9
SibSp           11
Parch            9
Ticket          12
Fare             9
Cabin          687
Embarked         6
dtype: int64

In [None]:
initialize_result_file()

### Brak preprocessingu -> usuwanie brakujących wartości

In [49]:
df_1 = df.copy()
df_1.isna().sum().sum()

687

In [50]:
df_1 = remove_missing(df_1)
df_1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [51]:
y = df_1['Survived']
y

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: Survived, Length: 204, dtype: int64

In [52]:
df_1= drop_columns(df_1,categorical)
df_1 = df_1.apply(pd.to_numeric)
X = df_1

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [54]:

score("Titanic",1,"No Preprocessing",X_train,y_train,X_test,y_test)

Scenario: No Preprocessing
Xgboost: 0.6190476190476191
Random Forest Classifier: 0.7619047619047619
KNeighbors Classifier: 0.5714285714285714


### Scenariusz 2: Wypełnienie brakujących wartości w kolumnach liczbowych wartością średnią

In [55]:
df_2 = df.copy()
df_2 = fill_missing_mean(df_2,numeric)
# for col in numeric:
#     df_2[col] = df_2[col].fillna(df_2[col].mean())

In [56]:
y = df_2['Survived']

In [57]:
df_2 = drop_columns(df_2,categorical)
df_2 = df_2 = df_2.apply(pd.to_numeric)
X = df_2

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [59]:
score("Titanic",1,"Fill Missing with mean",X_train,y_train,X_test,y_test)

Scenario: Fill Missing with mean
Xgboost: 0.7333333333333333
Random Forest Classifier: 0.7444444444444445
KNeighbors Classifier: 0.6888888888888889


### Scenariusz 3: Wypełnienie brakujących wartości w kolumnach liczbowych wartością minimalną

In [60]:
df_3 = df.copy()
df_3 = fill_missing_min(df_3,numeric)
# for col in numeric:
#     df_3[col] = df_3[col].fillna(df_3[col].min())

In [61]:
y = df_3['Survived']

In [62]:
df_3 = df_3.drop(categorical,axis=1)
df_3 = df_3 = df_3.apply(pd.to_numeric)
X = df_3

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [64]:
score("Titanic",1,"Fill_missing_with_min",X_train,y_train,X_test,y_test)

Scenario: Fill_missing_with_min
Xgboost: 0.7333333333333333
Random Forest Classifier: 0.7444444444444445
KNeighbors Classifier: 0.6888888888888889


### Scenariusz 4: Wypełnienie brakujących wartości w kolumnach liczbowych wartością maksymalną

In [65]:
df_4 = df.copy()
df_4 = fill_missing_max(df_4,numeric)
# for col in numeric:
#     df_4[col] = df_4[col].fillna(df_4[col].max())

In [66]:
y = df_4['Survived']

In [67]:
df_4 = df_4.drop(categorical,axis=1)
df_4 = df_4.apply(pd.to_numeric)
X = df_4

In [68]:
score("Titanic",1,"Fill_missing_with_max",X_train,y_train,X_test,y_test)

Scenario: Fill_missing_with_max
Xgboost: 0.7333333333333333
Random Forest Classifier: 0.7444444444444445
KNeighbors Classifier: 0.6888888888888889


### 5. Wypełnianie brakujących wartości regresją liniową

In [69]:
df_new = df_copy.copy()
df_new

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C


In [70]:
df_copy.isna().sum()

PassengerId      0
Survived         0
Pclass          15
Name             6
Sex             12
Age              9
SibSp           11
Parch            9
Ticket          12
Fare             9
Cabin          687
Embarked         6
dtype: int64

In [71]:
df_new.isnull().sum()

PassengerId      0
Survived         0
Pclass          15
Name             6
Sex             12
Age              9
SibSp           11
Parch            9
Ticket          12
Fare             9
Cabin          687
Embarked         6
dtype: int64

In [72]:
df_new = fill_missing_regression(df_new, numeric)

In [73]:
y = df_new['Survived']
df_new = df_new.drop(categorical,axis=1)
df_new = df_new.apply(pd.to_numeric)
X = df_new

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [75]:
score("Titanic",1,"Regression",X_train,y_train,X_test,y_test)

Scenario: Regression
Xgboost: 0.7333333333333333
Random Forest Classifier: 0.7333333333333333
KNeighbors Classifier: 0.6888888888888889


### 5:  Standaryzacja

In [77]:
df_5 = df.copy()
df_5 = fill_missing_mean(df_5,numeric)
# for col in numeric:
#     df_5[col] = df_5[col].fillna(df_5[col].mean())

In [78]:
# for col in numeric:
#     values = df_5[col].values
#     df_scaled = standard_scaler.fit_transform(values.reshape(-1, 1)) 
#     df_scaled = pd.DataFrame(df_scaled)
#     df_5[col] = df_scaled
df_5 = standardize(df,numeric)

In [79]:
y = df_5['Survived']

In [80]:
df_5 = df_5.drop(categorical,axis=1)
df_5 = df_5.apply(pd.to_numeric)
X = df_5

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [82]:
score("Titanic",1,"Standardize",X_train,y_train,X_test,y_test)

Scenario: Standardize
Xgboost: 0.7333333333333333
Random Forest Classifier: 0.7444444444444445
KNeighbors Classifier: 0.6444444444444445


### 6: Normalizacja

In [83]:
df_6 = df.copy()
for col in numeric:
    df_6[col] = df_6[col].fillna(df_6[col].mean())

In [84]:
# for col in numeric:
#     values = df_6[col].values
#     df_scaled = min_max_scaler.fit_transform(values.reshape(-1, 1)) 
#     df_scaled = pd.DataFrame(df_scaled)
#     df_6[col] = df_scaled
df_6 = normalize(df_6,numeric)

In [85]:
y = df_6['Survived']

In [86]:
df_6 = df_6.drop(categorical,axis=1)
df_6 = df_6.apply(pd.to_numeric)
X = df_6

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [88]:
score("Titanic",1,"Normalizacja",X_train,y_train,X_test,y_test)

Scenario: Normalizacja
Xgboost: 0.7333333333333333
Random Forest Classifier: 0.7444444444444445
KNeighbors Classifier: 0.5777777777777777


### 7. Normalizacja + Outliers (LOF)

In [89]:
df_7 = df.copy()
for col in numeric:
    df_7[col] = df_7[col].fillna(df_7[col].mean())

In [90]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

In [91]:
# for col in numeric:
#     values = df_7[col].values
#     df_scaled = min_max_scaler.fit_transform(values.reshape(-1, 1)) 
#     df_scaled = pd.DataFrame(df_scaled)
#     df_7[col] = df_scaled
df_7 = normalize(df_7,numeric)
df_7.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1.0,"Braund, Mr. Owen Harris",male,0.271174,0.125,0.0,A/5 21171,0.014151,,S
1,2,1,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.472229,0.125,0.0,PC 17599,0.139136,C85,C
2,3,1,1.0,"Heikkinen, Miss. Laina",female,0.321438,0.0,0.0,STON/O2. 3101282,0.015469,,S
3,4,1,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.434531,0.125,0.0,113803,0.103644,C123,S
4,5,0,1.0,"Allen, Mr. William Henry",male,0.434531,0.0,0.0,373450,0.015713,,S


In [92]:
df_7.shape

(891, 12)

In [93]:
df_7 = remove_outliers_lof(df_7,numeric)

In [94]:
from sklearn.neighbors import LocalOutlierFactor

In [95]:
# df_temp = df_7
# df_temp = df_temp.loc[:, numeric] 
# df_temp.head()


In [96]:

# clf = LocalOutlierFactor(n_neighbors=2)
# clf.fit(df_temp)
# y_pred_outliers = clf.fit_predict(df_temp)
# df_temp['outlier'] = y_pred_outliers

# df_temp = df_temp.loc[df_temp['outlier'] == 1]
# df_temp.drop('outlier', axis=1, inplace=True)
# df_temp = df_temp.reset_index(drop=True)
# df_temp.head()

In [97]:
# df_7 = df_7[df_7.index.isin(df_temp.index)]

In [98]:
df_7.shape

(650, 12)

In [99]:
y = df_7['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
645    1
646    0
647    1
648    0
649    1
Name: Survived, Length: 650, dtype: int64

In [100]:
df_7 = df_7.drop(categorical,axis=1)
df_7 = df_7.apply(pd.to_numeric)
X = df_7

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [102]:
score("Titanic",1,"Normalization_with_LOF",X_train,y_train,X_test,y_test)

Scenario: Normalization_with_LOF
Xgboost: 0.6307692307692307
Random Forest Classifier: 0.6461538461538462
KNeighbors Classifier: 0.6153846153846154


### Scenariusz 8: Zamiana kolumn kategorycznych na liczbowe poprzez wykorzystanie LabelEncoder

In [103]:
from sklearn.preprocessing import LabelEncoder

In [104]:
df_8 = df.copy()
df_8 = remove_missing(df_8)
to_be_encoded = [
    "Name",
    "Sex",
    "Embarked",
    "Ticket",
    "Cabin"
]
df_8 = encode_categorical(df_8,to_be_encoded)
df_8.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,-1.566107,51,0,0.638789,0.432793,-0.473674,124,0.786845,81,0
3,4,1,-1.566107,74,0,0.407926,0.432793,-0.473674,37,0.42073,55,2
6,7,0,-1.566107,123,1,1.870059,-0.474545,-0.473674,63,0.395814,129,2
10,11,1,0.827377,162,0,-1.977659,0.432793,0.76763,135,-0.312172,145,2
11,12,1,-1.566107,27,0,2.177876,-0.474545,-0.473674,32,-0.113846,49,2


In [105]:
y = df_8['Survived']

In [106]:
df_8 = df_8.apply(pd.to_numeric)
X = df_8

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [108]:
score("Titanic",1,"Label_Encoding",X_train,y_train,X_test,y_test)

Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 1.0
KNeighbors Classifier: 0.6666666666666666


### Scenariusz 9: Zamiana kolumn kategorycznych na liczbowe poprzez wykorzystanie LabelEncoder, po wcześniejszym wypełnieniu braków najpopularniejszą wartością

In [109]:
df_9 = df.copy()
to_be_encoded = [
    "Name",
    "Sex",
    "Embarked",
    "Ticket",
    "Cabin"
]


df_9.isna().sum().sum()


687

In [110]:
df_9= df_9.fillna(df_9.mode().iloc[0])
df_9.isna().sum().sum()

0

In [111]:
df_9 = encode_categorical(df_9,to_be_encoded)
# encoder = LabelEncoder()

# for col in to_be_encoded:
#   df_9[col] = encoder.fit_transform(df_9[col])


In [112]:
y = df_9['Survived']

In [113]:
df_9 = df_9.apply(pd.to_numeric)
X = df_9

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [115]:
score("Titanic",1,"Label_encoding_+_fill_missing_mean",X_train,y_train,X_test,y_test)

Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Classifier: 1.0
KNeighbors Classifier: 0.6333333333333333


### Scenariusz 10: Wypełnienie braków (liczbowe wartością średnią, kategoryczne najpopularniejszą), konwersja kolumn kategorycznych na liczbowe, modyfikacja kolumn w celu uzyskania większej ilości informacji

In [116]:
df_10 = df.copy()
df_10.isna().sum().sum()

687

In [117]:
df_10['Title'] = df_10['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df_10['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

In [118]:
df_10['Title'] = df_10['Title'].fillna(df_10['Title'].mode().iloc[0])
df_10['Title'].isna().sum()

0

In [119]:
df_10['Title'] = df_10['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df_10['Title'] = df_10['Title'].replace('Mlle', 'Miss')
df_10['Title'] = df_10['Title'].replace('Ms', 'Miss')
df_10['Title'] = df_10['Title'].replace('Mme', 'Mrs')
df_10['Title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: Title, dtype: int64

In [120]:
df_10 = df_10.drop(['Name','Ticket','PassengerId'],axis=1)
df_10

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,0.827377,male,-0.592481,0.432793,-0.473674,-0.502445,,S,Mr
1,1,-1.566107,female,0.638789,0.432793,-0.473674,0.786845,C85,C,Mrs
2,1,0.827377,female,-0.284663,-0.474545,-0.473674,-0.488854,,S,Miss
3,1,-1.566107,female,0.407926,0.432793,-0.473674,0.420730,C123,S,Mrs
4,0,0.827377,male,0.407926,-0.474545,-0.473674,-0.486337,,S,Mr
...,...,...,...,...,...,...,...,...,...,...
886,0,-0.369365,male,-0.207709,-0.474545,-0.473674,-0.386671,,S,Rare
887,1,-1.566107,female,-0.823344,-0.474545,-0.473674,-0.044381,B42,S,Miss
888,0,0.827377,female,0.000000,0.432793,2.008933,-0.176263,,S,Miss
889,1,-1.566107,male,-0.284663,-0.474545,-0.473674,-0.044381,C148,C,Mr


In [121]:
df_10['Cabin'] = df_10['Cabin'].fillna('000')
df_10['Cabin'] = df_10['Cabin'].str[:1]
df_10['Cabin'].value_counts()

0    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [122]:
for col in numeric:
    df_10[col] = df_10[col].fillna(df_10[col].mean())

In [123]:
df_10 = df_10.fillna(df_10.mode().iloc[0])
df_10.isna().sum().sum()

0

In [124]:
to_be_encoded = [
    "Title",
    "Sex",
    "Embarked",
    "Cabin"
]

encoder = LabelEncoder()

for col in to_be_encoded:
  df_10[col] = encoder.fit_transform(df_10[col])

In [125]:
y = df_10['Survived']

In [126]:
df_10 = df_10.apply(pd.to_numeric)
X = df_10

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [128]:
score("Titanic",1,"Custom_preprocessing",X_train,y_train,X_test,y_test)

Scenario: Custom_preprocessing
Xgboost: 1.0
Random Forest Classifier: 1.0
KNeighbors Classifier: 0.9555555555555556
