In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

## Load data and set up X, y variables

In [None]:
# Build the training frame in a single pipeline: read the raw CSV, filter
# rows, drop an uninformative column, then one-hot encode the categoricals.
df_train = (
    pd.read_csv('../data/large_train_sample.csv')
    # Keep rows with capital-gain below 90000; this removes the suspicious
    # 99999 value, which is either a rare event or a mistake
    .loc[lambda frame: frame['capital-gain'] < 90000]
    # Keep only rows with hours-per-week below 90
    .loc[lambda frame: frame['hours-per-week'] < 90]
    # Drop 'native-country': about 90% of its rows hold the same value
    .drop(columns=['native-country'])
    # One-hot encode the categorical columns
    .pipe(pd.get_dummies,
          columns=['workclass', 'marital-status', 'occupation',
                   'relationship', 'sex', 'education'])
)

In [None]:
# Split the training frame into the target ('wage') and the predictors
y_train = df_train['wage']
X_train = df_train.drop(columns=['wage'])

In [None]:
# Load the testing features and mirror the training preprocessing:
# drop 'native-country' and one-hot encode the same categorical columns
X_test = pd.read_csv('../data/test_data.csv')
X_test = X_test.drop(columns='native-country')
X_test = pd.get_dummies(X_test, columns =
                        ['workclass','marital-status','occupation','relationship', 'sex','education'])
# get_dummies only creates columns for the categories present in the frame it
# is given, so the testing frame can end up with different columns (or a
# different column order) than X_train. Align it to the training layout:
# dummy columns missing from the testing data are added as all zeros, and
# columns the model never saw during training are discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# Load the testing labels and keep only the target column
y_test = pd.read_csv('../data/y_test.csv')['wage']

## Fit with RandomForestClassifier 

In [None]:
# Baseline random forest; random_state is fixed so repeated runs give the
# same model
rf1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=6,
    min_samples_leaf=1,
    random_state=42,
)
# Train on the training data
rf1.fit(X_train, y_train)
# Report mean accuracy on each split, rounded to 4 decimal places
train_accuracy = round(rf1.score(X_train, y_train), 4)
print(f'accuracy of training set is {train_accuracy}')
test_accuracy = round(rf1.score(X_test, y_test), 4)
print(f'accuracy of testing set is {test_accuracy}')

## Create balanced training and testing sets with SMOTE

In [None]:
# Instantiate SMOTE with a fixed seed so the synthetic samples (and every
# score computed from them) are reproducible across kernel restarts
smt = SMOTE(random_state=42)
# Oversample the training data.
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias was deprecated and is no longer available in newer releases.
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)
# NOTE(review): the testing set is resampled as well, so any score computed on
# X_test_sm / y_test_sm includes synthetic rows and does not measure
# performance on real data. The variables are kept because later cells use
# them; prefer X_test / y_test when reporting generalisation.
X_test_sm, y_test_sm = smt.fit_resample(X_test, y_test)

In [None]:
# Instantiate the RandomForest model (seeded for reproducibility)
rf2 = RandomForestClassifier(max_depth=15,
                             min_samples_leaf=1,
                             min_samples_split=6,
                             n_estimators=100,
                             random_state=42)
# Fit the model to the SMOTE-resampled training data
rf2.fit(X_train_sm, y_train_sm)
# Accuracy on the resampled training and testing sets
print(f'accuracy of training set is {round(rf2.score(X_train_sm, y_train_sm),4)}')
print(f'accuracy of testing set is {round(rf2.score(X_test_sm, y_test_sm),4)}')
# The resampled testing set contains synthetic rows, so also report accuracy
# on the untouched testing data to show how the model does on real samples
print(f'accuracy of original testing set is {round(rf2.score(X_test, y_test),4)}')