In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Construction

Feature construction is a technique in which we build a new feature from one or more existing features, which may improve the model's performance.

In [2]:
# Location of the Titanic training CSV used throughout this notebook.
link = 'https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day45-feature-construction-and-feature-splitting/train.csv'

# Keep the four predictors plus the target, then discard rows that contain
# a missing value in any of those columns.
selected_columns = ['Age','Pclass','SibSp','Parch','Survived']
raw_frame = pd.read_csv(link)
df = raw_frame[selected_columns].dropna()

df.sample(5)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
311,18.0,1,2,2,1
245,44.0,1,2,0,0
363,35.0,3,0,0,0
879,56.0,1,0,1,1
886,27.0,2,0,0,0


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [7]:
# Select predictors and target by label rather than by position, so the split
# does not silently change if the column order of df changes.
# DataFrame.drop returns a new frame, so the columns added to X in later
# cells never write into a slice of df.
X = df.drop(columns=['Survived'])
y = df['Survived']

# Baseline: mean accuracy of a default LogisticRegression over 20 CV folds.
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

0.6933333333333332

In [8]:
# Family size = siblings/spouses + parents/children + the passenger themself.
X['Family_size'] = X['SibSp'].add(X['Parch']).add(1)
X.sample(5)

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
840,20.0,3,0,0,1
591,52.0,1,1,0,2
310,24.0,1,0,0,1
51,21.0,3,0,0,1
142,24.0,3,1,0,2


In [9]:
def myfunc(num):
    """Encode a family size as an ordinal family-type code.

    Returns:
        0 when ``num`` equals 1 (travelling alone),
        1 when ``num`` is greater than 1 and at most 4 (small family),
        2 for every other value (large family; this fallback also catches
        any value below 1).
    """
    if num == 1:
        return 0  # alone
    # small family inside (1, 4], otherwise large family
    return 1 if 1 < num <= 4 else 2

In [10]:
# Bucket every family size into the 0 / 1 / 2 code returned by myfunc.
family_sizes = X['Family_size']
X['Family_type'] = family_sizes.apply(myfunc)
X.sample(5)

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
173,21.0,3,0,0,1,0
677,18.0,3,0,0,1,0
397,46.0,2,0,0,1,0
850,4.0,3,4,2,7,2
642,2.0,3,3,2,6,2


In [11]:
# Keep only Age, Pclass and the engineered Family_type: the raw counts and the
# intermediate Family_size are removed. X is rebound to the frame returned by
# drop() instead of using inplace=True, which pandas discourages and which
# mutates an object that earlier cells have already displayed.
X = X.drop(columns=['SibSp','Parch','Family_size'])
X.sample(5)

Unnamed: 0,Age,Pclass,Family_type
405,34.0,2,1
53,29.0,2,1
821,27.0,3,0
617,26.0,3,1
686,14.0,3,2


In [12]:
# Mean 20-fold accuracy of a default LogisticRegression on the current X.
fold_accuracies = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20)
np.mean(fold_accuracies)

0.7003174603174602

# Feature Splitting

Feature splitting is a technique in which we split a single feature into several features, which may improve the model's performance.

In [13]:
# Reload the CSV with every column; this rebinds df, replacing the reduced
# frame loaded earlier in the notebook.
df = pd.read_csv(filepath_or_buffer=link)
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
557,558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C
857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S
480,481,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S
280,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q
164,165,0,3,"Panula, Master. Eino Viljami",male,1.0,4,1,3101295,39.6875,,S


In [14]:
# Names look like "Surname, Title. Given names".
# Step 1: split on ', ' and keep the second piece ("Title. Given names").
after_surname = df['Name'].str.split(', ', expand=True)[1]
# Step 2: split that piece on '.' and keep what precedes the first dot.
df['Title'] = after_surname.str.split('.', expand=True)[0]
df['Title'].sample(5)

134      Mr
208    Miss
152      Mr
832      Mr
262      Mr
Name: Title, dtype: object

In [15]:
# Survival rate per title, highest first.
# 'Survived' is selected BEFORE aggregating: calling mean() on the whole
# grouped frame also tries to average text columns such as Name, which
# raises "TypeError: Could not convert ... to numeric".
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

In [24]:
# Mean of Survived within each Title, sorted descending; show the first rows.
survival_by_title = df.groupby('Title')['Survived'].mean()
survival_by_title.sort_values(ascending=False).head()

Title
the Countess    1.0
Mlle            1.0
Sir             1.0
Ms              1.0
Lady            1.0
Name: Survived, dtype: float64

In [23]:
# Flag passengers whose title is 'Mrs'. Everyone starts at 0.
df['Is_Married'] = 0
# Use a single .loc call with (row mask, column label). The chained form
# df['Is_Married'].loc[mask] = 1 assigns through an intermediate Series,
# which triggers SettingWithCopyWarning and is not guaranteed to update df.
df.loc[df['Title'] == 'Mrs', 'Is_Married'] = 1

df['Is_Married'].sample(5)


594    0
571    1
122    0
312    1
443    0
Name: Is_Married, dtype: int64