## Commonly used techniques
- For categorical variables -> label encoding or one-hot encoding
- Remove outliers before modelling
    - a. clip the values to a [min, max] range
    - b. drop the outlier rows directly

### Label encoding & one-hot encoding

In [27]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

# read file
dir_data = '/Users/chloe/Google Drive/Kaggle_Data/kaggle_home_credit_default_risk/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
df_train = pd.read_csv(f_app_train)


# label encode
# Work on an explicit copy: adding a column to a bare column-slice of df_train
# is a chained assignment (SettingWithCopyWarning, hidden by the warnings filter).
sub_train = df_train[["WEEKDAY_APPR_PROCESS_START"]].copy()
le = LabelEncoder()
# LabelEncoder works on a 1-D array of labels, so hand it the column itself
# (a Series) rather than the one-column DataFrame.
sub_train["label_encode"] = le.fit_transform(sub_train["WEEKDAY_APPR_PROCESS_START"])

# One row per distinct weekday, ordered by the integer code it was given.
print("---- label encode mapping ---- \n\n",
      sub_train.drop_duplicates(subset=["WEEKDAY_APPR_PROCESS_START"]).sort_values(by="label_encode", ascending=True),
     "\n")

# one hot encode
# The weekday column is replaced by one indicator column per category;
# the label_encode column is carried through unchanged.
sub_train = pd.get_dummies(sub_train, columns=["WEEKDAY_APPR_PROCESS_START"])
print("one hot encode columns output: \n\n", sub_train.columns)
sub_train.head(2)

---- label encode mapping ---- 

    WEEKDAY_APPR_PROCESS_START  label_encode
11                     FRIDAY             0
1                      MONDAY             1
10                   SATURDAY             2
6                      SUNDAY             3
4                    THURSDAY             4
29                    TUESDAY             5
0                   WEDNESDAY             6 

one hot encode columns output: 

 Index(['label_encode', 'WEEKDAY_APPR_PROCESS_START_FRIDAY',
       'WEEKDAY_APPR_PROCESS_START_MONDAY',
       'WEEKDAY_APPR_PROCESS_START_SATURDAY',
       'WEEKDAY_APPR_PROCESS_START_SUNDAY',
       'WEEKDAY_APPR_PROCESS_START_THURSDAY',
       'WEEKDAY_APPR_PROCESS_START_TUESDAY',
       'WEEKDAY_APPR_PROCESS_START_WEDNESDAY'],
      dtype='object')


Unnamed: 0,label_encode,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,6,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0


### Remove outliers

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt


# read files
data_path = '/Users/chloe/Google Drive/Kaggle_Data/kaggle_house_price/'
df_train = pd.read_csv(data_path + 'train.csv')
# The regression target is log1p of the SalePrice column.
train_Y = np.log1p(df_train['SalePrice'])
df = df_train.drop(['Id', 'SalePrice'], axis=1)


# Keep only the int64 / float64 columns (select_dtypes preserves column order).
num_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
print(f'{len(num_features)} \n Numeric Features : {num_features}\n')

# Missing values are replaced by the sentinel -1 before scaling.
df = df[num_features].fillna(-1)
MMEncoder = MinMaxScaler()
estimator = LinearRegression()

# Bounds shared by both outlier treatments of 1stFlrSF.
FLR_MIN, FLR_MAX = 500, 2250


def scale_and_score(features, target):
    """Min-max scale ``features`` and cross-validate a linear regression on them.

    Parameters
    ----------
    features : DataFrame of numeric columns.
    target : Series aligned row-for-row with ``features``.

    Returns
    -------
    (scaled, score) : the scaled feature matrix and the mean of the 5-fold
        ``cross_val_score`` results (estimator's default scorer).

    NOTE(review): the scaler is fitted on all rows before the folds are split,
    so each validation fold has influenced the scaling. Kept as-is so the
    reported numbers stay comparable.
    """
    scaled = MMEncoder.fit_transform(features)
    return scaled, cross_val_score(estimator, scaled, target, cv=5).mean()


# original 1stFlrSF plot and CV score
sns.regplot(x=df['1stFlrSF'], y=train_Y)
plt.show()

train_X, score = scale_and_score(df, train_Y)
print(f' original score: {score}')


# change the range (will not exclude the value) and CV score
# Clip on a copy so the unclipped frame stays available for the next experiment.
df_clipped = df.copy()
df_clipped["1stFlrSF"] = df_clipped["1stFlrSF"].clip(FLR_MIN, FLR_MAX)

train_X, score = scale_and_score(df_clipped, train_Y)
print(f' change data range (min: {FLR_MIN}, max:{FLR_MAX}) score: {score}')


# exclude the outlier directly and the cv score
# The mask is built from the unclipped values. Bounds are strict, so rows
# sitting exactly on FLR_MIN or FLR_MAX are dropped as well.
criteria_index = (df["1stFlrSF"] < FLR_MAX) & (df["1stFlrSF"] > FLR_MIN)
# df and train_Y finish the cell holding the trimmed rows only.
df = df[criteria_index]
train_Y = train_Y[criteria_index]

train_X, score = scale_and_score(df, train_Y)
print(f' direct remove outlier score: {score}')

36 
 Numeric Features : ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']



<Figure size 640x480 with 1 Axes>

 original score: 0.8499463091778436
 change data range (min: 500, max:2250) score: 0.8835596149765429
 direct remove outlier score: 0.8894372874380758


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
