# Feature Engineer

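This notebook prepares the raw data for modelling by handling missing values in four ways — dropping incomplete rows, keeping them as `np.nan`, and imputing them with the column median or the column mean — and saves each resulting dataset to `../data/processed/`.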

## Installations

In [1]:
! pip install feature-engine



In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# sklearn imputation libraries
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import preprocessing

# Use DataPrep function to remove all rows with missing values
from dataprep.clean import clean_df

# mean data imputer
from feature_engine.imputation import MeanMedianImputer

### Read in data

In [3]:
# Load the raw comma-separated dataset with pandas' Python parsing engine.
# No dtypes are specified here, so column types are inferred by pandas.
df = pd.read_csv(
    "../data/raw/raw_data.csv",
    engine='python',
    sep=',',
)

In [4]:
# Group the columns of `df` by dtype family.
# The pandas type-check helpers are used instead of comparing against dtype
# strings: comparing with 'str' does not match object-dtype text columns,
# 'int' depends on the platform's default integer width, and 'float64' skips
# other float widths.
column_dtypes = list(df.dtypes.items())

# columns holding floating-point values (any width)
float_vars = [col for col, dtype in column_dtypes if pd.api.types.is_float_dtype(dtype)]

# columns holding integer values (any width, signed or unsigned)
int_vars = [col for col, dtype in column_dtypes if pd.api.types.is_integer_dtype(dtype)]

# columns holding text; object-dtype columns are reported here as well
string_vars = [col for col, dtype in column_dtypes if pd.api.types.is_string_dtype(dtype)]

# predictor columns, identified by their 'x' name prefix
X_vars = [col for col in df.columns if col.startswith('x')]

print(float_vars)
print(int_vars)
print(string_vars)
print(X_vars)

['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39']
['y']
[]
['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39']


### Remove all rows with missing values

In [5]:
# clean_df returns a pair; only its second element (the cleaned frame) is used.
# With standardize_missing_values="remove", rows containing missing values are dropped.
removal_outputs = clean_df(df, standardize_missing_values="remove")
_, cleaned_df = removal_outputs
cleaned_df

  return infer_dtype(column[column.apply(_check_valid_values, 0)])
  column_not_na = column[column.apply(_check_valid_values, 0)]


Data Type Detection Report:
	These data types are supported by DataPrep to clean: []
Column Headers Cleaning Report:
	40 values cleaned (97.56%)
Number of Entries Cleaning Report:
	395 entries dropped (3.95%)
Downcast Memory Report:
	Memory reducted from 3621085 to 2017050. New size: (55.7%)


  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[c

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_31,x_32,x_33,x_34,x_35,x_36,x_37,x_38,x_39,y
0,1.563312,-1.417454,-0.49021,-1.568689,0.367421,0.070303,-0.535064,0.161154,0.130255,0.753823,...,-0.611644,1.807765,0.089677,-1.289779,0.303392,0.791001,-1.147663,0.311395,-0.097479,1
1,-1.952178,0.127013,1.538298,0.346906,1.188424,0.812583,-0.931559,0.815114,-0.331135,-1.20211,...,0.126718,-1.478616,-0.57393,-0.300777,-2.489546,1.101826,-0.736897,1.109234,0.411488,0
2,0.966699,-0.476276,0.491713,1.778521,-0.977243,-0.465307,-1.026294,0.526787,-0.423048,1.584421,...,0.372821,2.019703,1.059188,1.710667,-0.395971,0.533768,-1.245003,0.315812,-0.696607,0
3,0.24579,0.217226,0.044321,0.741132,2.287139,-0.013715,-0.359327,0.510777,0.765334,0.655096,...,-0.257412,-1.630353,1.33175,-0.744338,-0.536401,-0.619755,-0.867709,-0.840118,0.388468,0
4,0.224607,0.593479,-0.069498,-1.340076,0.573069,0.926663,-1.141124,-0.172414,-1.868113,0.791384,...,1.622028,-1.032052,-2.15192,-0.81478,-0.248285,-0.529315,1.086025,-0.307067,0.555658,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9600,-0.081902,0.077678,-0.007799,-0.349095,0.715026,0.597308,0.524103,-0.535891,-1.422754,-1.306914,...,-1.097008,-0.665732,1.284669,-1.006048,-0.124857,1.360402,0.86153,-0.351954,-1.562268,1
9601,1.770789,0.937521,-1.520024,-0.534501,-0.216092,-1.143603,-0.012028,0.774776,1.207229,-1.291661,...,-1.244869,0.386814,0.205392,-0.620874,0.515563,-0.540098,1.148493,0.511727,1.041767,1
9602,0.293241,-0.510105,0.837776,-0.192746,-0.176852,-0.638883,-0.715619,-0.973087,0.680165,2.100656,...,0.368002,0.235573,0.997999,0.562664,0.309951,-1.111101,0.671824,0.36459,0.005087,1
9603,0.582251,-0.004711,-1.845315,0.59574,0.552615,0.727767,0.121153,-1.212563,1.309937,0.891129,...,-1.491215,-0.319813,-1.339181,1.332218,0.9534,-0.698423,1.227561,0.480504,-0.023771,0


#### Save the resulting dataframe to CSV for further analysis

In [6]:
# Persist the rows-removed dataset; the index is not written to the file.
cleaned_df.to_csv(
    '../data/processed/data_nomissingvalues.csv',
    index=False,
)

### Set all missing values to be `np.nan`

This keeps every row of the data and standardizes all missing-value markers to `np.nan`, so that missing data can be handled analytically downstream (for example, by imputation).

In [7]:
# clean_df returns a pair; only its second element (the cleaned frame) is used.
# With standardize_missing_values="fill", no rows are dropped; per the section
# heading, missing values are standardized to np.nan instead.
fill_outputs = clean_df(df, standardize_missing_values="fill")
_, cleaned_df2 = fill_outputs
cleaned_df2

  return infer_dtype(column[column.apply(_check_valid_values, 0)])
  column_not_na = column[column.apply(_check_valid_values, 0)]


Data Type Detection Report:
	These data types are supported by DataPrep to clean: []
Column Headers Cleaning Report:
	40 values cleaned (97.56%)
Downcast Memory Report:
	Memory reducted from 3690128 to 2020128. New size: (54.74%)


  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[column.apply(_check_null_values, 0)] = np.nan
  column[c

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_31,x_32,x_33,x_34,x_35,x_36,x_37,x_38,x_39,y
0,1.563312,-1.417454,-0.49021,-1.568689,0.367421,0.070303,-0.535064,0.161154,0.130255,0.753823,...,-0.611644,1.807765,0.089677,-1.289779,0.303392,0.791001,-1.147663,0.311395,-0.097479,1
1,-1.952178,0.127013,1.538298,0.346906,1.188424,0.812583,-0.931559,0.815114,-0.331135,-1.20211,...,0.126718,-1.478616,-0.57393,-0.300777,-2.489546,1.101826,-0.736897,1.109234,0.411488,0
2,0.966699,-0.476276,0.491713,1.778521,-0.977243,-0.465307,-1.026294,0.526787,-0.423048,1.584421,...,0.372821,2.019703,1.059188,1.710667,-0.395971,0.533768,-1.245003,0.315812,-0.696607,0
3,0.24579,0.217226,0.044321,0.741132,2.287139,-0.013715,-0.359327,0.510777,0.765334,0.655096,...,-0.257412,-1.630353,1.33175,-0.744338,-0.536401,-0.619755,-0.867709,-0.840118,0.388468,0
4,0.224607,0.593479,-0.069498,-1.340076,0.573069,0.926663,-1.141124,-0.172414,-1.868113,0.791384,...,1.622028,-1.032052,-2.15192,-0.81478,-0.248285,-0.529315,1.086025,-0.307067,0.555658,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.081902,0.077678,-0.007799,-0.349095,0.715026,0.597308,0.524103,-0.535891,-1.422754,-1.306914,...,-1.097008,-0.665732,1.284669,-1.006048,-0.124857,1.360402,0.86153,-0.351954,-1.562268,1
9996,1.770789,0.937521,-1.520024,-0.534501,-0.216092,-1.143603,-0.012028,0.774776,1.207229,-1.291661,...,-1.244869,0.386814,0.205392,-0.620874,0.515563,-0.540098,1.148493,0.511727,1.041767,1
9997,0.293241,-0.510105,0.837776,-0.192746,-0.176852,-0.638883,-0.715619,-0.973087,0.680165,2.100656,...,0.368002,0.235573,0.997999,0.562664,0.309951,-1.111101,0.671824,0.36459,0.005087,1
9998,0.582251,-0.004711,-1.845315,0.59574,0.552615,0.727767,0.121153,-1.212563,1.309937,0.891129,...,-1.491215,-0.319813,-1.339181,1.332218,0.9534,-0.698423,1.227561,0.480504,-0.023771,0


#### Save the resulting dataframe to CSV for further analysis

In [8]:
# Persist the dataset that keeps its missing values; the index is not written to the file.
cleaned_df2.to_csv(
    '../data/processed/data_keepingmissing.csv',
    index=False,
)

### Replace missing data with median values of the variable

scikit-learn's `SimpleImputer(strategy='median')`, used below, replaces each missing value with the median of its column. It works only with numerical variables. feature-engine's `MeanMedianImputer()` performs the same mean/median imputation; see its documentation [here](https://feature-engine.trainindata.com/en/latest/api_doc/imputation/MeanMedianImputer.html)

In [9]:
# Replace every missing value with the median of its column.
# Only imputation happens here; no scaling/standardization is applied.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

dfimp = imputer.fit_transform(df)

# The imputer returns a plain float array, so rebuild the frame with the original
# labels and cast every column back to its original dtype. Without the cast the
# integer target `y` would be written out as 0.0/1.0. Integer columns cannot
# contain NaN, so their values are untouched by imputation and the cast is exact.
dfimp_df = pd.DataFrame(dfimp, columns=df.columns, index=df.index).astype(df.dtypes.to_dict())

#### Save the resulting dataframe to CSV for further analysis

In [10]:
# Persist the median-imputed dataset; the index is not written to the file.
dfimp_df.to_csv(
    '../data/processed/data_medianvalsformissing.csv',
    index=False,
)

### Replace missing data with mean values of the variable

In [11]:
# Replace every missing value with the mean of its column.
# Only imputation happens here; no scaling/standardization is applied.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

dfimp = imputer.fit_transform(df)

# The imputer returns a plain float array, so rebuild the frame with the original
# labels and cast every column back to its original dtype. Without the cast the
# integer target `y` would be written out as 0.0/1.0. Integer columns cannot
# contain NaN, so their values are untouched by imputation and the cast is exact.
dfimp_df = pd.DataFrame(dfimp, columns=df.columns, index=df.index).astype(df.dtypes.to_dict())

#### Save the resulting dataframe to CSV for further analysis

In [12]:
# Persist the mean-imputed dataset; the index is not written to the file.
dfimp_df.to_csv(
    '../data/processed/data_meanvalsformissing.csv',
    index=False,
)