## **Practice Exercise 17**
### Automated Feature Engineering on the provided dataset

### Importing modules

In [3]:
import pandas as pd
# import autosklearn.classification
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Installing Featuretools

In [None]:
!pip3 install featuretools

### Importing featuretools

In [8]:
import featuretools as ft
from featuretools.primitives import *


### Loading data

In [9]:
# Download the train, test and gender_submission CSV files, in that order,
# and bind each resulting DataFrame to its own name.
train_df, test_df, answers = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/fenago/DSBook/main/Chapter%2017/train.csv',
        'https://raw.githubusercontent.com/fenago/DSBook/main/Chapter%2017/test.csv',
        'https://raw.githubusercontent.com/fenago/DSBook/main/Chapter%2017/gender_submission.csv',
    )
)

In [10]:
# List the column names of the training frame.
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


### Cleaning data

In [11]:
# Stack train and test so every cleaning step is applied to both at once.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement
# and, like `append`, keeps each frame's original row labels.
combine = pd.concat([train_df, test_df])

# Passenger ids of the test rows, captured before any cleaning.
passenger_id = test_df['PassengerId']
combine = combine.drop(['Ticket', 'Cabin'], axis=1)

# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute-accessed column: the in-place form acts on an intermediate object
# and is not guaranteed to update `combine` under pandas Copy-on-Write.
combine['Fare'] = combine['Fare'].fillna(combine['Fare'].mean())

# Encode sex as 0 for "female" and 1 for anything else.
combine['Sex'] = combine['Sex'].apply(lambda x: 0 if x == "female" else 1)

# Title = first run of letters that is immediately followed by a period.
# One vectorised call covers the whole column (no per-row loop is needed), and
# the raw string keeps `\.` a regex escape rather than a string escape.
combine['Title'] = combine['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Replace the rare titles with a more common equivalent.
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
combine['Title'] = combine['Title'].replace(mapping)

combine = combine.drop(['Name'], axis=1)

# Impute missing ages with the median age of passengers sharing the same title.
# The groupby result is indexed by title *label* (sorted alphabetically), so it
# must be looked up with `.loc[title]`; indexing it with the position of the
# title inside `titles` would pick up a different title's median.
# The medians are computed once up front: filling one title's gaps with its own
# median cannot change the median of any other title.
titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Rev', 'Dr']
median_age_by_title = combine.groupby('Title')['Age'].median()
for title in titles:
    needs_age = combine['Age'].isnull() & (combine['Title'] == title)
    combine.loc[needs_age, 'Age'] = median_age_by_title.loc[title]

# Missing ports get the most frequent port observed in the training data.
freq_port = train_df.Embarked.dropna().mode()[0]
combine['Embarked'] = combine['Embarked'].fillna(freq_port)

# Integer-encode the two remaining categorical columns. `.astype(int)` raises
# if a value is absent from the mapping, which surfaces unexpected categories.
combine['Embarked'] = combine['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
combine['Title'] = combine['Title'].map({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Rev': 4, 'Dr': 5}).astype(int)

# Zero-fill whatever is still missing.
# NOTE(review): this includes `Survived` for rows that came from test_df if
# that file lacks the column - a 0 there is a placeholder, not a label.
combine = combine.fillna(0)

In [None]:
# Print the dtype and non-null count of every column in `combine`.
combine.info()

### Entity set

In [None]:
# Create the entity set and register `combine` (minus the 'Survived' column)
# as its single dataframe, keyed by 'PassengerId'.
passenger_features = combine.drop(columns=['Survived'])
es = ft.EntitySet(id='titanic_data').add_dataframe(
    dataframe_name='combine',
    dataframe=passenger_features,
    index='PassengerId',
)

es

In [None]:
# Split each of these columns of 'combine' out into its own dataframe, named
# after the column and indexed by it. The order matches the original sequence
# of calls.
for column in ['Embarked', 'Sex', 'Title', 'Pclass', 'Parch', 'SibSp']:
    es = es.normalize_dataframe(
        base_dataframe_name='combine',
        new_dataframe_name=column,
        index=column,
    )
es

### List Primitives

In [None]:
# Table of the primitives featuretools reports via list_primitives().
primitives = ft.list_primitives()
# Widen text columns so longer cell values are not cut off at the default width.
pd.set_option('display.max_colwidth', 100)

# Display every row whose type is 'aggregation'.
is_aggregation = primitives['type'] == 'aggregation'
primitives[is_aggregation]

In [None]:
# Display every row of `primitives` whose type is 'transform'.
is_transform = primitives['type'] == 'transform'
primitives[is_transform]