In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from fancyimpute import KNN

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_data.shape

(891, 12)

In [6]:
test_data.shape

(418, 11)

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
y_train = train_data['Survived']
ID_train = train_data['PassengerId']
X_train = train_data.drop(columns =['PassengerId', 'Survived'])
ID_test = test_data['PassengerId']
X_test = test_data.drop(columns =['PassengerId'])

In [10]:
encoder = OrdinalEncoder()
imputer = KNN()
cat_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [11]:
def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    #retains only non-null values
    nonulls = np.array(data.dropna())
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data
for columns in cat_cols:
    encode(X_train[columns])
    encode(X_test[columns])

In [12]:
# impute data and convert 
X_train = pd.DataFrame(np.round(imputer.fit_transform(X_train)),columns = X_train.columns)
X_test = pd.DataFrame(np.round(imputer.fit_transform(X_test)),columns = X_test.columns)

Imputing row 1/891 with 1 missing, elapsed time: 0.277
Imputing row 101/891 with 1 missing, elapsed time: 0.288
Imputing row 201/891 with 1 missing, elapsed time: 0.294
Imputing row 301/891 with 2 missing, elapsed time: 0.307
Imputing row 401/891 with 1 missing, elapsed time: 0.325
Imputing row 501/891 with 1 missing, elapsed time: 0.368
Imputing row 601/891 with 1 missing, elapsed time: 0.372
Imputing row 701/891 with 0 missing, elapsed time: 0.375
Imputing row 801/891 with 1 missing, elapsed time: 0.378
Imputing row 1/418 with 1 missing, elapsed time: 0.048
Imputing row 101/418 with 0 missing, elapsed time: 0.051
Imputing row 201/418 with 2 missing, elapsed time: 0.054
Imputing row 301/418 with 1 missing, elapsed time: 0.059
Imputing row 401/418 with 0 missing, elapsed time: 0.062


In [19]:
X_train['PassengerId'] = ID_train
X_train['Survived'] = y_train
X_test['PassengerId'] = ID_test

train_columns = X_train.columns
test_columns = X_test.columns

In [14]:
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId,Survived
0,3.0,108.0,1.0,22.0,1.0,0.0,523.0,7.0,62.0,2.0,1,0
1,1.0,190.0,0.0,38.0,1.0,0.0,596.0,71.0,81.0,0.0,2,1
2,3.0,353.0,0.0,26.0,0.0,0.0,669.0,8.0,78.0,2.0,3,1
3,1.0,272.0,0.0,35.0,1.0,0.0,49.0,53.0,55.0,2.0,4,1
4,3.0,15.0,1.0,35.0,0.0,0.0,472.0,8.0,62.0,2.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2.0,548.0,1.0,27.0,0.0,0.0,101.0,13.0,137.0,2.0,887,0
887,1.0,303.0,0.0,19.0,0.0,0.0,14.0,30.0,30.0,2.0,888,1
888,3.0,413.0,0.0,25.0,1.0,2.0,675.0,23.0,93.0,2.0,889,0
889,1.0,81.0,1.0,26.0,0.0,0.0,8.0,30.0,60.0,0.0,890,1


In [15]:
X_test

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId
0,3.0,206.0,1.0,34.0,0.0,0.0,152.0,8.0,61.0,1.0,892
1,3.0,403.0,0.0,47.0,1.0,0.0,221.0,7.0,46.0,2.0,893
2,2.0,269.0,1.0,62.0,0.0,0.0,73.0,10.0,59.0,1.0,894
3,3.0,408.0,1.0,27.0,0.0,0.0,147.0,9.0,40.0,2.0,895
4,3.0,178.0,0.0,22.0,1.0,1.0,138.0,12.0,50.0,2.0,896
...,...,...,...,...,...,...,...,...,...,...,...
413,3.0,353.0,1.0,23.0,0.0,0.0,267.0,8.0,46.0,2.0,1305
414,1.0,283.0,0.0,39.0,0.0,0.0,324.0,109.0,22.0,0.0,1306
415,3.0,332.0,1.0,38.0,0.0,0.0,346.0,7.0,39.0,2.0,1307
416,3.0,384.0,1.0,21.0,0.0,0.0,220.0,8.0,47.0,2.0,1308


In [16]:
scaler=MinMaxScaler()
scaled_df_train=scaler.fit_transform(X_train)
X_train_scaled = scaled_df_train

scaled_df_test = scaler.fit_transform(X_test)
X_test_scaled = scaled_df_test

In [20]:
df_train = pd.DataFrame(X_train_scaled, columns=train_columns)

df_test = pd.DataFrame(X_test_scaled, columns=test_columns)

df_train.to_csv(r'data/train2.csv', index=False)
df_test.to_csv(r'data/test2.csv', index=False)