# Modelling Random Forest Classifier 


## Imports

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time
import spacy
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
#import lightgbm as lgb

from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
#this initialize tqdm which is useful to show a progress bar when applying operations in a pandas df
tqdm.pandas()

## Data ExtracT

In [12]:
df = pd.read_csv('data/data_usampl_60_40_cleaned.csv')


In [13]:
df.columns

Index(['raw', 'clean', 'clean_pp', 'clean_pp_lemma', 'clean_pp_lemma_stop',
       'toxic'],
      dtype='object')

## Function to Calculate

In [14]:
# initialize dataframe that will include the results
results_df = pd.DataFrame()

def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"
    predicted_probs = model.predict_proba(X_test)[:, 1]

    # Calculate metrics using probabilities
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_probs)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## RFC

### 1. RFC with stopwords_punct_lemma vectorizer Tfidf

In [15]:
X_stop = df['clean_pp_lemma_stop']
y_stop = df['toxic']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X_stop)

# Split the data into training and testing sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_tfidf, y_stop, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier()

# Call the evaluate_model function
results = evaluate_model(rf_model, X_train_s, y_train_s, X_test_s, y_test_s, model_name="RandomForestClassifier",parameters="", comments="Using stopwords_punct_lemma, vec - TFIDf")
print(results)


{'Name': 'RandomForestClassifier', 'Parameters': '', 'F1-Score': 0.4649446494464945, 'AUC-ROC': 0.8139334237992326, 'Precision': 0.8513513513513513, 'Recall': 0.3197969543147208, 'Accuracy': 0.71, 'Confusion Matrix': '[[292  11]\n [134  63]]', 'Training Time': '0 minutes and 4.85 seconds', 'Comments': 'Using stopwords_punct_lemma, vec - TFIDf'}


In [16]:
# Convert the dictionary of results into a DataFrame
rfc_results_df = pd.DataFrame([results])

# Append the results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, rfc_results_df], ignore_index=True)

In [17]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.464945,0.813933,0.851351,0.319797,0.71,[[292 11]\n [134 63]],0 minutes and 4.85 seconds,"Using stopwords_punct_lemma, vec - TFIDf"


### 2. RFC with stopwords_punct_lemma vectorizer word2vec

In [19]:
sentences = [text.split() for text in df['clean_pp_lemma_stop']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Transform each text into an average Word2Vec vector
word2vec_vectors = []
for text in sentences:
    vectors = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
    if vectors:
        text_vector = np.mean(vectors, axis=0)
        word2vec_vectors.append(text_vector)
    else:
        # Handle cases where there are no words found in the Word2Vec model
        word2vec_vectors.append(np.zeros(word2vec_model.vector_size))

# Convert the list of Word2Vec vectors into a matrix
X_word2vec = np.vstack(word2vec_vectors)

# Split the data into training and testing sets
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y_stop, test_size=0.2, random_state=42)



In [20]:
# Initialize the RandomForestClassifier for Word2Vec
rf_model_w2v = RandomForestClassifier()

# Call the evaluate_model function for Word2Vec
results_w2v = evaluate_model(rf_model_w2v, X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v, model_name="RandomForestClassifier",parameters="", comments="Using stopwords_punct_lemma, vec - Word2Vec")

# Convert the dictionary of results into a DataFrame for Word2Vec
word2vec_results_df = pd.DataFrame([results_w2v])

# Append the Word2Vec results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, word2vec_results_df], ignore_index=True)

In [21]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.464945,0.813933,0.851351,0.319797,0.71,[[292 11]\n [134 63]],0 minutes and 4.85 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.324324,0.55087,0.484848,0.243655,0.6,[[252 51]\n [149 48]],0 minutes and 2.27 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"


### 3. RFC with stopwords_punct_lemma + count vectorizer 

In [23]:
X_c = df['clean_pp_lemma_stop']
y_c = df['toxic']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Transform text data into CountVectorized features
X_count = count_vectorizer.fit_transform(X_c)

In [24]:
# Split the data into training and testing sets
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_count, y_c, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier for CountVectorizer
rf_model_c = RandomForestClassifier()

# Call the evaluate_model function for CountVectorizer
results_c = evaluate_model(rf_model_c, X_train_c, y_train_c, X_test_c, y_test_c, model_name="RandomForestClassifier", parameters="", comments="Using stopwords_punct_lemma, vec - CountVectorizer")

# Convert the dictionary of results into a DataFrame for CountVectorizer
count_vectorizer_results_df = pd.DataFrame([results_c])

# Append the CountVectorizer results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, count_vectorizer_results_df], ignore_index=True)

In [25]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.464945,0.813933,0.851351,0.319797,0.71,[[292 11]\n [134 63]],0 minutes and 4.85 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.324324,0.55087,0.484848,0.243655,0.6,[[252 51]\n [149 48]],0 minutes and 2.27 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.483271,0.844784,0.902778,0.329949,0.722,[[296 7]\n [132 65]],0 minutes and 4.87 seconds,"Using stopwords_punct_lemma, vec - CountVector..."
