# Random Forest Modeling

### Import libraries and data

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
# load data
train = pd.read_csv("train_clean.csv")
val = pd.read_csv("val_clean.csv")

## Preprocess data

In [3]:
# preprocess data

# remove punctuation and lowercase everything
def preprocess(text):
    for char in string.punctuation:
        text = text.replace(char, "")
    text = text.lower()
    return text

train.text = train.text.map(lambda x: preprocess(x))
val.text = val.text.map(lambda x: preprocess(x))

# split x and y
X_train, y_train = train.text, train.real_news
X_val, y_val = val.text, val.real_news

# tf-idv
vz = TfidfVectorizer(stop_words="english")
X_train = vz.fit_transform(X_train)
X_val = vz.transform(X_val)

## Grid Search

In [9]:
param_grid = {"max_depth":[5, 10, 15, 20]}
gs = GridSearchCV(RandomForestClassifier(), param_grid)
gs.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [10]:
gs.best_params_

{'max_depth': 20}

In [11]:
model = gs.best_estimator_

## Evaluation

In [12]:
# fit and predict using model
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_val_hat = model.predict(X_val)

# classification reports
print("training performance")
print(classification_report(y_train, y_train_hat))
print("validation performance")
print(classification_report(y_val, y_val_hat))

training performance
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     13757
           1       0.92      0.96      0.94     12796

    accuracy                           0.94     26553
   macro avg       0.94      0.94      0.94     26553
weighted avg       0.94      0.94      0.94     26553

validation performance
              precision    recall  f1-score   support

           0       0.92      0.89      0.90      4545
           1       0.89      0.91      0.90      4307

    accuracy                           0.90      8852
   macro avg       0.90      0.90      0.90      8852
weighted avg       0.90      0.90      0.90      8852

