## 5. Modeling -- Regression Prediction on Text Toxicity

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import KFold, GridSearchCV
from tqdm import tqdm
from scipy.sparse import issparse

### TF-IDF Vectorizer

In [2]:
# Load and preprocess training data
file_path_train_toxicity = "data/processed-data/train_toxicity.csv"
train_data = pd.read_csv(file_path_train_toxicity)
train_data = train_data.dropna(subset=['comment_text', 'target'])  # Drop rows with NaN in targetted columns

# Split features and target
X_train_text = train_data['comment_text']  
y_train = train_data['target']            

# Text vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = tfidf_vectorizer.fit_transform(X_train_text)

# Converting to dense matrix will lead the running time to even rise, so sparese matrix kept 
# print(issparse(X_train))
# X_train = X_train.toarray()
# print(issparse(X_train))

### Cross Validation

In [3]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 350, 500],
    'max_depth': [10, 15, 20, 25, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Initialize and train Random Forest Regressor
rf = RandomForestRegressor(random_state=5000)  

# Perform GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Output the best parameters and score from GridSearchCV
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation MSE: ", grid_search.best_score_)

# Use the best estimator from GridSearchCV
best_rf = grid_search.best_estimator_

# Perform cross-validation with progress tracking
print("Performing Cross Validation...")
cv_scores = []
for train_index, test_index in tqdm(KFold(n_splits=5, shuffle=True, random_state=5000).split(X_train), desc="Cross Validation Progress"):
    # Train-test split
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    
    # Train and evaluate
    best_rf.fit(X_train_fold, y_train_fold)
    fold_score = mean_squared_error(y_test_fold, best_rf.predict(X_test_fold))
    cv_scores.append(fold_score)


# Output cross-validation results
print("\n--- Cross-validation Results ---")
print(f"Cross-validation MSE (per fold): {cv_scores}")
print(f"Mean Cross-validation MSE: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of Cross-validation MSE: {np.std(cv_scores):.4f}")

Fitting 3 folds for each of 540 candidates, totalling 1620 fits
Best parameters found:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Best cross-validation MSE:  -0.02032021236485095
Performing Cross Validation...


Cross Validation Progress: 5it [06:14, 74.95s/it]


--- Cross-validation Results ---
Cross-validation MSE (per fold): [0.019495475718510954, 0.02001842412618114, 0.020684961097825923, 0.019259745203942604, 0.018090267311264653]
Mean Cross-validation MSE: 0.0195
Standard Deviation of Cross-validation MSE: 0.0009





### Training and Evaluation

In [4]:
# Train the model on the entire training data
best_rf.fit(X_train, y_train)

# Model evaluation (on training data)
y_train_pred = best_rf.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
mae = mean_absolute_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)
evs = explained_variance_score(y_train, y_train_pred)

print("\n--- Model Evaluation on Training Data ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Explained Variance Score: {evs:.4f}")


--- Model Evaluation on Training Data ---
Mean Squared Error (MSE): 0.0029
Mean Absolute Error (MAE): 0.0326
R-squared (R2): 0.9088
Explained Variance Score: 0.9089


### Prediction

In [6]:
# Load test data
file_path_text_lean = "data/processed-data/text_lean.csv"
test_data = pd.read_csv(file_path_text_lean)
test_data = test_data.dropna(subset=['text'])

# Use the same vectorizer
X_test_text = test_data['text']
X_test = tfidf_vectorizer.transform(X_test_text)

# Predict toxicity values on test data
test_data['rf_toxicity'] = best_rf.predict(X_test)
df_text_toxicity = test_data

# Save predictions
file_path_text_toxicity = "data/processed-data/text_toxicity.csv"
df_text_toxicity.to_csv(file_path_text_toxicity, index=False)
print(f"Modling complete. Results saved to {file_path_text_toxicity}")
df_text_toxicity.head(13)


Modling complete. Results saved to data/processed-data/text_toxicity.csv


Unnamed: 0,subreddit,id,type,depth,score,time,text,nmf_topic,dt_lean,rf_toxicity
0,Libertarian,1hf706u,submission_hot,0,100,2024/12,road serfdom new libertarian economic let borr...,abortion,Conservative,0.041974
1,Libertarian,m29svuv,comment,1,1,2024/12,fredrich bastiat also good actually make funct...,abortion,Liberal,0.148867
2,Libertarian,m2a5co6,comment,2,1,2024/12,libertarian exclusively anarchist,abortion,Conservative,0.088044
3,Libertarian,m2a694w,comment,3,1,2024/12,true socialist communist always say fix ideolo...,abortion,Conservative,0.059692
4,Libertarian,m29bopi,comment,1,7,2024/12,good favorite always recommend start revolutio...,abortion,Liberal,0.013902
5,Libertarian,m2a1cm0,comment,1,1,2024/12,salma write book become movie star,abortion,Liberal,0.054371
6,Libertarian,m2abas1,comment,1,1,2024/12,sure really good place start recommend economi...,abortion,Liberal,0.00903
7,Libertarian,m29pifi,comment,1,1,2024/12,classic,abortion,Liberal,0.053227
8,Libertarian,1hf07so,submission_hot,0,233,2024/12,pay income tax due norwegian wealth tax fuck,tax,Liberal,0.78327
9,Libertarian,m27qglj,comment,1,93,2024/12,free,tax,Liberal,9.5e-05
