<a href="https://colab.research.google.com/github/gorzanskik-ai/titanic/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime so the CSV paths below resolve.
from google.colab import drive
drive.mount('/content/drive')
# Locations of the train and test CSV files on the mounted Drive.
path_train = '/content/drive/MyDrive/machine learning/projects/titanic/train.csv'
path_test = '/content/drive/MyDrive/machine learning/projects/titanic/test.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train = pd.read_csv(path_train)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test = pd.read_csv(path_test)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Share of missing values per training column, expressed in percent.
train.isnull().sum().div(len(train)).mul(100)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [None]:
# Share of missing values per test column, expressed in percent.
test.isnull().sum().div(len(test)).mul(100)

PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64

In [None]:
# Cabin entries that are actually present in the training data.
train['Cabin'].dropna()

1              C85
3             C123
6              E46
10              G6
11            C103
          ...     
871            D35
872    B51 B53 B55
879            C50
887            B42
889           C148
Name: Cabin, Length: 204, dtype: object

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Filling missing AGE values with random integers drawn from the interval (mean - std, mean + std)

In [None]:
# Impute missing ages with random whole numbers drawn from
# [mean - std, mean + std), where mean and std are computed per frame.
AGE_IMPUTE_SEED = 42  # fixed seed so a fresh "Run All" reproduces the same fill values
age_rng = np.random.default_rng(AGE_IMPUTE_SEED)

for df in [train, test]:
    mean = df['Age'].mean()
    std = df['Age'].std()
    missing_age = df['Age'].isnull()

    # Nothing to impute in this frame.
    if not missing_age.any():
        continue

    # Bounds are truncated to int explicitly; the upper bound is exclusive.
    random_age = age_rng.integers(int(mean - std), int(mean + std), size=int(missing_age.sum()))

    # A single .loc assignment writes directly into the frame. The former
    # df['Age'].iloc[indexes] = ... was chained indexing, which raises
    # SettingWithCopyWarning and does not update df under Copy-on-Write.
    df.loc[missing_age, 'Age'] = random_age

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'].iloc[indexes] = random_age


Binning AGE into ordinal categories

In [None]:
# Right-inclusive bin edges: (-inf, 11] -> 0, (11, 18] -> 1, (18, 22] -> 2,
# (22, 27] -> 3, (27, 33] -> 4, (33, 40] -> 5, (40, inf) -> 6.
# NOTE(review): ages 41-66 and ages above 66 share code 6 — confirm that no
# separate top band was intended.
age_edges = [float('-inf'), 11, 18, 22, 27, 33, 40, float('inf')]

for df in (train, test):
    whole_years = df['Age'].astype(int)
    # labels=False makes pd.cut return the bin position instead of an Interval.
    df['Age'] = pd.cut(whole_years, bins=age_edges, labels=False).astype(int)

New feature AGE * PCLASS

In [None]:
# Interaction feature: Age value multiplied by passenger class.
for df in (train, test):
    df['Age*Class'] = df['Age'].mul(df['Pclass'])

Filling missing EMBARKED values with the most frequent port

In [None]:
# Frequency of each embarkation port in the training data.
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
for df in (train, test):
    # mode() returns the most frequent value(s); the first entry is used as the fill.
    top_port = df['Embarked'].mode().iat[0]
    df['Embarked'] = df['Embarked'].fillna(value=top_port)

Mapping EMBARKED values

In [None]:
# Integer code for each embarkation port letter.
ports = dict(S=0, C=1, Q=2)

for df in (train, test):
    encoded_port = df['Embarked'].map(ports)
    df['Embarked'] = encoded_port

Mapping SEX values

In [None]:
# Integer code for each value of the Sex column.
genders = dict(male=0, female=1)

for df in (train, test):
    encoded_sex = df['Sex'].map(genders)
    df['Sex'] = encoded_sex

New feature CABIN -> DECK

In [None]:
# Integer code per deck letter; 'U' (code 8) stands in for a missing cabin.
deck = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'U': 8}

for df in [train, test]:
    # First alphabetic character of each cabin string, e.g. 'C85' -> 'C'.
    # Built as a new Series instead of calling fillna(..., inplace=True) on
    # df['Cabin'] / df['Deck']: those chained in-place calls do not write back
    # to df under Copy-on-Write.
    deck_letter = (
        df['Cabin']
        .fillna('U')
        .astype(str)
        .str.extract(r'([A-Za-z])', expand=False)
    )
    # Letters missing from `deck`, and cabins containing no letter at all,
    # fall back to the "unknown" code 8.
    df['Deck'] = deck_letter.map(deck).fillna(8).astype(int)

# Cabin is fully summarised by Deck from here on.
train = train.drop(['Cabin'], axis=1)
test = test.drop(['Cabin'], axis=1)

FARE -> float -> int

In [None]:
for df in [train, test]:
    # Fill missing fares with the frame's most frequent fare, then truncate to int.
    most_popular = df['Fare'].mode().iloc[0]
    # Assign the result back rather than calling fillna(..., inplace=True) on
    # df['Fare']: the chained in-place form does not update df under Copy-on-Write.
    df['Fare'] = df['Fare'].fillna(most_popular).astype(int)

New feature NAME -> TITLE

In [None]:
# Integer code for each title group kept as a feature.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Uncommon titles that are pooled into the single 'Rare' group.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings folded into their common equivalent.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for df in [train, test]:
    df['Title'] = (
        df['Name']
        # Raw string: '\.' in a normal string literal is an invalid escape
        # sequence and warns on recent Python versions. The pattern captures
        # the run of letters that sits between a space and a period.
        .str.extract(r' ([A-Za-z]+)\.', expand=False)
        .replace(rare_titles, 'Rare')
        .replace(title_synonyms)
        # Any title that is not a key of `titles` becomes NaN here.
        .map(titles)
    )

train = train.drop(['Name'], axis=1)
test = test.drop(['Name'], axis=1)

Drop TICKET

In [None]:
# Summary of the Ticket column (count, number of unique values, most frequent value).
train['Ticket'].describe()

count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object

In [None]:
# Remove the Ticket column from both frames.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

Binning FARE into ordinal categories

In [None]:
# Right-inclusive bin edges: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1,
# (14.454, 31] -> 2, (31, 99] -> 3, (99, 250] -> 4, (250, inf) -> 5.
# NOTE(review): an earlier cell casts Fare to int, so the fractional edges
# 7.91 / 14.454 are compared against truncated fares — confirm this is intended.
fare_edges = [float('-inf'), 7.91, 14.454, 31, 99, 250, float('inf')]

for df in (train, test):
    # labels=False makes pd.cut return the bin position instead of an Interval.
    df['Fare'] = pd.cut(df['Fare'], bins=fare_edges, labels=False).astype(int)

New feature FAMILY SIZE

In [None]:
# Relatives aboard: parents/children plus siblings/spouses.
for df in (train, test):
    df['Family_Size'] = df['Parch'].add(df['SibSp'])

New feature FARE PER PERSON

In [None]:
# Fare value divided by the party size (relatives aboard + the passenger).
# NOTE(review): Fare was replaced by its category code in an earlier cell, so
# this divides the code rather than the original ticket price — confirm intended.
for df in (train, test):
    party_size = df['Family_Size'].add(1)
    df['Fare_Per_Person'] = df['Fare'].div(party_size)

Drop PassengerId

In [None]:
# Remove the PassengerId column from both frames.
train, test = (frame.drop(columns=['PassengerId']) for frame in (train, test))

New feature ALONE

In [None]:
def alone(x):
    """Return 1 when the family size ``x`` equals 0, otherwise 0."""
    return int(x == 0)

# Flag passengers whose Family_Size is 0.
for df in (train, test):
    df['Alone'] = df['Family_Size'].map(alone)

In [None]:
# Preview the engineered training frame.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age*Class,Deck,Title,Family_Size,Fare_Per_Person,Alone
0,0,3,0,2,1,0,0,0,6,8,1,1,0.0,0
1,1,1,1,5,1,0,3,1,5,3,3,1,1.5,0
2,1,3,1,3,0,0,0,0,9,8,2,0,0.0,1
3,1,1,1,5,1,0,3,0,5,3,3,1,1.5,0
4,0,3,0,5,0,0,1,0,15,8,1,0,1.0,1


In [None]:
# Preview the engineered test frame.
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age*Class,Deck,Title,Family_Size,Fare_Per_Person,Alone
0,3,0,5,0,0,0,2,15,8,1,0,0.0,1
1,3,1,6,1,0,0,0,18,8,3,1,0.0,0
2,2,0,6,0,0,1,2,12,8,1,0,1.0,1
3,3,0,3,0,0,1,0,9,8,1,0,1.0,1
4,3,1,2,1,1,1,0,6,8,3,2,0.333333,0


In [None]:
# Column dtypes and non-null counts of the engineered training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Survived         891 non-null    int64  
 1   Pclass           891 non-null    int64  
 2   Sex              891 non-null    int64  
 3   Age              891 non-null    int64  
 4   SibSp            891 non-null    int64  
 5   Parch            891 non-null    int64  
 6   Fare             891 non-null    int64  
 7   Embarked         891 non-null    int64  
 8   Age*Class        891 non-null    int64  
 9   Deck             891 non-null    int64  
 10  Title            891 non-null    int64  
 11  Family_Size      891 non-null    int64  
 12  Fare_Per_Person  891 non-null    float64
 13  Alone            891 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 97.6 KB


In [None]:
# Column dtypes and non-null counts of the engineered test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Pclass           418 non-null    int64  
 1   Sex              418 non-null    int64  
 2   Age              418 non-null    int64  
 3   SibSp            418 non-null    int64  
 4   Parch            418 non-null    int64  
 5   Fare             418 non-null    int64  
 6   Embarked         418 non-null    int64  
 7   Age*Class        418 non-null    int64  
 8   Deck             418 non-null    int64  
 9   Title            418 non-null    int64  
 10  Family_Size      418 non-null    int64  
 11  Fare_Per_Person  418 non-null    float64
 12  Alone            418 non-null    int64  
dtypes: float64(1), int64(12)
memory usage: 42.6 KB


In [None]:
# Total number of missing cells left in each engineered frame.
for frame in (train, test):
    print(frame.isnull().sum().sum())

0
0


In [None]:
# Persist both engineered frames to Drive.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column — confirm downstream readers expect that.
export_targets = (
    (train, '/content/drive/MyDrive/machine learning/projects/titanic/train2.csv'),
    (test, '/content/drive/MyDrive/machine learning/projects/titanic/test2.csv'),
)
for frame, destination in export_targets:
    frame.to_csv(destination)