# 02. Feature Engineering

---

In this notebook, I engineer features in order to improve the performance of the regression model. Refer to notebook `03_Modeling` to see the resulting performance.

In [1]:
# Import libraries

import pandas as pd

from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the cleaned training data from disk into `df`

df = pd.read_csv(filepath_or_buffer='./datasets/clean_train.csv')

In [3]:
# Preview the first five rows of the loaded dataframe
df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Street,Neighborhood,Overall Qual,Overall Cond,Year Built,...,Fence_MnWw,Fence_no,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170,60,55.228571,13517,1,0,6,8,1976,...,0,1,0,0,0,0,0,0,0,1
1,544,531379050,60,43.0,11492,1,1,7,5,1996,...,0,1,0,0,0,0,0,0,0,1
2,153,535304180,20,68.0,7922,1,0,5,7,1953,...,0,1,0,0,0,0,0,0,0,1
3,318,916386060,60,73.0,9802,1,1,5,5,2006,...,0,1,0,0,0,0,0,0,0,1
4,255,906425045,50,82.0,14235,1,1,6,8,1900,...,0,1,0,0,0,0,0,0,0,1


#### `age_when_sold` 
- (Yr Sold - Year Built)

In [4]:
# Row count per 'Year Built' value, listed from the latest year to the earliest
(
    df['Year Built']
    .value_counts()
    .sort_index(ascending=False)
)

2010     1
2009    14
2008    36
2007    78
2006    99
        ..
1885     1
1880     3
1879     1
1875     1
1872     1
Name: Year Built, Length: 113, dtype: int64

In [5]:
# Row count per 'Yr Sold' value, listed from the latest year to the earliest
(
    df['Yr Sold']
    .value_counts()
    .sort_index(ascending=False)
)

2010    234
2009    446
2008    435
2007    498
2006    438
Name: Yr Sold, dtype: int64

In [6]:
# Age of the property at sale time: sale year minus construction year

df['age_when_sold'] = df['Yr Sold'].sub(df['Year Built'])

In [7]:
# Sanity check: number of missing values in the new column
df['age_when_sold'].isna().sum()

0

#### `baths`

- (Full Bath + Half Bath/2)

In [8]:
# Total bathroom count, with each half bath contributing 0.5
df['baths'] = df['Full Bath'].add(df['Half Bath'].div(2))

## Get Interaction Terms

- Create interaction terms with quantitative features
- The following columns were not included as they are not quantitative values that would result in meaningful interaction terms
    - 'Id', 'PID', 'Yr Sold'
- 'SalePrice' column was removed as it is a target column

In [9]:
# Candidate columns for the polynomial expansion: start with every column in df
interact_term_cols = df.columns.tolist()

In [10]:
# Drop the identifiers, the sale year and the target from the candidate list.
# Filtering (instead of calling list.remove in a loop) keeps this cell safe to
# re-run: a second run simply finds nothing left to drop rather than raising
# ValueError on the already-removed names.
cols_to_remove = ['Id', 'PID', 'Yr Sold', 'SalePrice']
interact_term_cols = [col for col in interact_term_cols if col not in cols_to_remove]

In [11]:
# Subset df to the columns that feed the polynomial expansion
X_interact_df = df.loc[:, interact_term_cols]

In [12]:
# Expand the selected columns into every degree-2 polynomial term: the original
# columns, their squares (interaction_only=False) and all pairwise products.
# include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_interact = poly.fit_transform(X_interact_df)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(interact_term_cols)
else:
    poly_feature_names = poly.get_feature_names(interact_term_cols)

# Wrap the transformed array in a dataframe labelled with the generated names
poly_df = pd.DataFrame(X_interact, columns=poly_feature_names)

In [13]:
# Re-attach the held-out identifier/target columns to the generated terms.
# Merging on an array of key values adds a helper 'key_0' column, dropped here.

df = (
    df[['Id', 'PID', 'Yr Sold', 'SalePrice']]
    .merge(poly_df, on=df.index)
    .drop(columns='key_0')
)

In [14]:
# Display the (rows, columns) size of the dataframe after the merge
df.shape

(2051, 25203)

In [15]:
df.to_csv('./datasets/clean_train_engineered_terms.csv', index=False)