## Lab | Random Forests

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')

* Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
* Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [2]:
categorical = pd.read_csv('files_for_lab/categorical.csv')
numeric = pd.read_csv('files_for_lab/numerical.csv')
target = pd.read_csv('files_for_lab/target.csv') 


#### EDA

In [3]:
print(f'The categorical dataframe has {categorical.shape[0]} rows and {categorical.shape[1]} columns')
print(f'The numerical dataframe has {numeric.shape[0]} rows and {numeric.shape[1]} columns')
print(f'The target has {categorical.shape[0]} rows')

The categorical dataframe has 95412 rows and 22 columns
The numerical dataframe has 95412 rows and 315 columns
The target has 95412 rows


Categorical dataframe

In [4]:
categorical.isna().sum()

STATE           0
CLUSTER         0
HOMEOWNR        0
GENDER          0
DATASRCE        0
RFA_2R          0
RFA_2A          0
GEOCODE2        0
DOMAIN_A        0
DOMAIN_B        0
ODATEW_YR       0
ODATEW_MM       0
DOB_YR          0
DOB_MM          0
MINRDATE_YR     0
MINRDATE_MM     0
MAXRDATE_YR     0
MAXRDATE_MM     0
LASTDATE_YR     0
LASTDATE_MM     0
FIRSTDATE_YR    0
FIRSTDATE_MM    0
dtype: int64

In [5]:
categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

The categorical dataframe is not very categorical in nature, it has many numeric values.

In [6]:
categorical.select_dtypes(include='number').columns

Index(['CLUSTER', 'DATASRCE', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM', 'DOB_YR',
       'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR', 'MAXRDATE_MM',
       'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR', 'FIRSTDATE_MM'],
      dtype='object')

In [7]:
num_c = categorical.select_dtypes(include='number')
numeric = pd.merge(numeric, num_c, left_index=True, right_index=True)
numeric.shape

(95412, 330)

In [8]:
len(numeric.dtypes) == len(numeric.select_dtypes(include='number').columns) # to check if the number of numeric columns in teh dataframe is the same as the length of the dataframe

True

In [9]:
cat_to_drop = categorical.select_dtypes(include='number')
cat_to_drop.columns


Index(['CLUSTER', 'DATASRCE', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM', 'DOB_YR',
       'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR', 'MAXRDATE_MM',
       'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR', 'FIRSTDATE_MM'],
      dtype='object')

In [10]:
categorical = categorical.drop(['CLUSTER', 'DATASRCE', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM', 'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR', 'MAXRDATE_MM','LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR', 'FIRSTDATE_MM'], axis = 1)
categorical.head()

Unnamed: 0,STATE,HOMEOWNR,GENDER,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A
0,IL,H,F,L,E,C,T
1,CA,H,M,L,G,A,S
2,NC,U,M,L,E,C,R
3,CA,U,F,L,E,C,R
4,FL,H,F,L,F,A,S


In [11]:
# a function to check the uniqu values in a dataframe
def check_unique_values(df):
    for col in df:
        print(list(df[col].unique()))

check_unique_values(categorical)

['IL', 'CA', 'NC', 'FL', 'other', 'IN', 'MI', 'MO', 'TX', 'WA', 'WI', 'GA']
['H', 'U']
['F', 'M', 'other']
['L']
['E', 'G', 'F', 'D']
['C', 'A', 'D', 'B']
['T', 'S', 'R', 'U', 'C']


#### Encoding categoricals

In [12]:
cat = pd.get_dummies(categorical, drop_first = False) # then we encode the remaining categorical columns
cat.head()

Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [13]:
len(cat) == len(numeric)

True

In [14]:
## Standardizing numericals
transformer = StandardScaler().fit(numeric)
num_standard = transformer.transform(numeric)
num_standard = pd.DataFrame(num_standard, index = numeric.index, columns = numeric.columns.tolist())
num_standard.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,-0.056847,-0.1115292,0.509147,0.717411,-0.356881,-0.206977,0.745798,0.284659,-0.826571,0.719013,...,0.370442,3.012879,-0.172698,0.41908,-0.201169,-1.236357,-0.789204,1.257673,-0.714736,1.237697
1,-0.055799,-1.080356,1.097566,0.717411,1.362283,-0.206977,-1.346527,1.675602,-1.221851,-0.191801,...,1.049599,-0.290923,0.200433,0.932504,0.374914,1.266242,-0.789204,1.257673,0.530642,0.985761
2,-0.055799,1.267017e-10,-0.667692,-2.239375,-0.141985,-0.206977,-0.910626,-0.046518,0.020458,-0.191801,...,-1.304814,-0.290923,-0.545828,1.189216,-1.353334,0.014943,-0.789204,1.257673,-0.403392,-1.281661
3,-0.056847,0.5804901,-1.84453,-1.130581,-0.141985,-0.206977,-0.649086,-1.040049,-0.09248,-0.874911,...,-0.037053,-0.621303,-2.03835,1.189216,-0.201169,1.015982,-0.789204,1.257673,-1.337425,-1.029725
4,-0.056847,1.134106,-0.667692,-1.869777,6.089983,-0.009563,-0.213185,-1.371225,1.149829,4.362269,...,-0.39927,-0.621303,0.200433,0.932504,0.950996,-1.486617,1.115522,-1.336135,-3.828181,-0.77779


In [15]:
full = pd.merge(num_standard, cat, left_index=True, right_index=True )
full.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,-0.056847,-0.1115292,0.509147,0.717411,-0.356881,-0.206977,0.745798,0.284659,-0.826571,0.719013,...,0,0,0,1,0,0,0,0,1,0
1,-0.055799,-1.080356,1.097566,0.717411,1.362283,-0.206977,-1.346527,1.675602,-1.221851,-0.191801,...,1,1,0,0,0,0,0,1,0,0
2,-0.055799,1.267017e-10,-0.667692,-2.239375,-0.141985,-0.206977,-0.910626,-0.046518,0.020458,-0.191801,...,0,0,0,1,0,0,1,0,0,0
3,-0.056847,0.5804901,-1.84453,-1.130581,-0.141985,-0.206977,-0.649086,-1.040049,-0.09248,-0.874911,...,0,0,0,1,0,0,1,0,0,0
4,-0.056847,1.134106,-0.667692,-1.869777,6.089983,-0.009563,-0.213185,-1.371225,1.149829,4.362269,...,0,1,0,0,0,0,0,1,0,0


In [16]:
full.shape[0] == target.shape[0] # check if the full df is the same length as the target

True

In [17]:
target['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

#### Train/Test/Split

In [18]:
X = full
y = target['TARGET_B']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)


#### SMOTE

In [19]:
smote = SMOTE()

X_sm, y_sm = smote.fit_resample(X_train, y_train) 

y_sm.value_counts()



0    63369
1    63369
Name: TARGET_B, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3,random_state=42)
model = RandomForestRegressor(max_depth=None, random_state=42, bootstrap=True, n_jobs=-1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8957245990893451