---
title: "Supervised Learning"
format:
    html: 
        code-fold: false
---

<!-- After digesting the instructions, you can delete this cell, these are assignment instructions and do not need to be included in your final submission.  -->

{{< include supervised.qmd >}} 

# Code

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import cross_val_score, GridSearchCV, validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from tqdm import tqdm


## Decision Tree

In [None]:
# Read the political-lean training set and keep only rows that have both
# the document text and its label.
file_path_train_lean = "data/processed-data/train_lean.csv"
train_data = pd.read_csv(file_path_train_lean).dropna(subset=['Text', 'Political Lean'])

# Features are the raw documents; the target is the lean label.
X_train_text, y_train = train_data['Text'], train_data['Political Lean']

# Build the TF-IDF matrix: English stop words removed, vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf_vectorizer.fit_transform(X_train_text)

In [None]:
# Hyperparameter search space for the decision tree
param_grid = {
    'max_depth': [10, 15, 20, 25, 30, 50, None],  # Maximum tree depth (None = no depth limit)
    'min_samples_split': [2, 5, 10, 20, 50],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 5, 10, 20],  # Minimum samples required at a leaf node
    'criterion': ['gini', 'entropy']  # Criterion for splitting
}

# Base classifier; the fixed seed keeps repeated runs comparable
clf = DecisionTreeClassifier(random_state=5000)

# Exhaustive 5-fold grid search scored by accuracy, using all available cores.
# `refit` is left at its default (True), so once the search finishes the best
# configuration is retrained on the whole training set automatically.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Output best parameters from GridSearchCV
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

# Cross-validate the winning configuration to report per-fold accuracy and its spread.
# cross_val_score works on clones, so the fitted best_estimator_ itself is not modified.
cv_scores = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5, scoring='accuracy')

print("Cross Validation Results:")
print(f"Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")

# best_estimator_ has already been fitted on the full training set by
# GridSearchCV (refit=True), so it is used directly without a second fit.
best_clf = grid_search.best_estimator_

In [None]:
# Visualize the trained decision tree.
# plot_tree expects feature_names / class_names as sequences of strings; the
# vectorizer and the classifier expose them as NumPy arrays, which some
# scikit-learn releases reject (NOTE(review): believed to be around 1.3.0 -
# confirm). Plain lists of str are accepted either way.
feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
class_names = [str(label) for label in best_clf.classes_]

plt.figure(figsize=(20, 10))
plot_tree(best_clf, feature_names=feature_names, class_names=class_names, filled=True)
plt.title("Optimized Decision Tree Visualization")
plt.show()

# Model evaluation (on training data).
# These figures are computed on the same rows the tree was fitted on, so they
# describe fit quality rather than generalization.
y_train_pred = best_clf.predict(X_train)
print("Training Set Evaluation:")
print(classification_report(y_train, y_train_pred))
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))


In [None]:
# Read the topic texts to be labelled and discard rows with no text
file_path_text_topic = "data/processed-data/text_topic.csv"
test_data = pd.read_csv(file_path_text_topic).dropna(subset=['text'])

# Project the texts through the already-fitted TF-IDF vectorizer (transform only, no refit)
X_test_text = test_data['text']
X_test = tfidf_vectorizer.transform(X_test_text)

# Label every text with the tuned decision tree and attach the labels as a new column
predictions = best_clf.predict(X_test)
test_data = test_data.assign(dt_lean=predictions)

# Persist the annotated frame
file_path_text_lean = "data/processed-data/text_lean.csv"
df_text_lean = test_data
df_text_lean.to_csv(file_path_text_lean, index=False)

print(f"Modeling complete. Results saved to {file_path_text_lean}")
df_text_lean.head(6)

## Random Forest

In [None]:
# Read the toxicity training set and keep only rows that have both the
# comment text and its target value.
file_path_train_toxicity = "data/processed-data/train_toxicity.csv"
train_data = pd.read_csv(file_path_train_toxicity).dropna(subset=['comment_text', 'target'])

# Features are the raw comments; the target is the 'target' column.
X_train_text, y_train = train_data['comment_text'], train_data['target']

# Fit a fresh TF-IDF vectorizer on the comments (English stop words removed,
# vocabulary capped at 5000 terms); this rebinds tfidf_vectorizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf_vectorizer.fit_transform(X_train_text)

In [None]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 350, 500],
    'max_depth': [10, 15, 20, 25, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Base regressor; the fixed seed keeps repeated runs comparable
rf = RandomForestRegressor(random_state=5000)

# 3-fold grid search over the full grid, using all available cores
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Output the best parameters and score from GridSearchCV.
# The scorer is *negative* MSE (higher is better), so the sign is flipped to
# report an actual, non-negative MSE under the "MSE" label.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation MSE: ", -grid_search.best_score_)

# Use the best estimator from GridSearchCV
best_rf = grid_search.best_estimator_

# Perform cross-validation with progress tracking.
# Note: best_rf is refit in place on each fold's training part.
print("Performing Cross Validation...")
kfold = KFold(n_splits=5, shuffle=True, random_state=5000)
cv_scores = []
# split() is a generator, so tqdm needs an explicit total to show real progress
for train_index, test_index in tqdm(kfold.split(X_train), total=kfold.get_n_splits(), desc="Cross Validation Progress"):
    # KFold yields *positional* row indices. X_train is a matrix, so plain
    # indexing is positional; y_train is a pandas Series whose index labels can
    # have gaps after dropna, so it must be sliced with .iloc to stay aligned.
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Train and evaluate
    best_rf.fit(X_train_fold, y_train_fold)
    fold_score = mean_squared_error(y_test_fold, best_rf.predict(X_test_fold))
    cv_scores.append(fold_score)


# Output cross-validation results
print("\n--- Cross-validation Results ---")
print(f"Cross-validation MSE (per fold): {cv_scores}")
print(f"Mean Cross-validation MSE: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of Cross-validation MSE: {np.std(cv_scores):.4f}")

In [None]:
# Refit the tuned forest on every training row
best_rf.fit(X_train, y_train)

# Score the fit on the same rows it was trained on
y_train_pred = best_rf.predict(X_train)
mse, mae, r2, evs = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score, explained_variance_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

print("\n--- Model Evaluation on Training Data ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Explained Variance Score: {evs:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Parity plot: actual training targets on x, the forest's predictions on y
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_train, y=y_train_pred, color='blue', alpha=0.6, ax=ax)

# Dashed red diagonal (y = x) spanning the range of the actual values;
# points on this line are perfect predictions
y_low, y_high = min(y_train), max(y_train)
ax.plot([y_low, y_high], [y_low, y_high], color='red', linestyle='--')

ax.set_title("Parity Plot: Actual vs Predicted Values")
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
plt.show()


In [None]:
# Load the texts to score. The input is the file written by the decision-tree
# step, so the output of this cell carries both the dt_lean and the
# rf_toxicity columns.
file_path_text_lean = "data/processed-data/text_lean.csv"
test_data = pd.read_csv(file_path_text_lean)
test_data = test_data.dropna(subset=['text'])

# Transform with the vectorizer currently bound to tfidf_vectorizer (the one
# fitted on the toxicity comments), without refitting it
X_test_text = test_data['text']
X_test = tfidf_vectorizer.transform(X_test_text)

# Predict toxicity values on test data
test_data['rf_toxicity'] = best_rf.predict(X_test)
df_text_toxicity = test_data

# Save predictions
file_path_text_toxicity = "data/processed-data/text_toxicity.csv"
df_text_toxicity.to_csv(file_path_text_toxicity, index=False)
print(f"Modeling complete. Results saved to {file_path_text_toxicity}")
df_text_toxicity.head(13)
