# Import, Download, & Variable Statements

In [10]:
# Import & download statements
!git clone https://github.com/d-atallah/implicit_gender_bias.git
import pandas as pd
import string
import re
import joblib
from implicit_gender_bias import config as cf
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score,roc_curve, roc_auc_score,log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

fatal: destination path 'implicit_gender_bias' already exists and is not an empty directory.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Variables
# Inputs
folder_path = cf.filepath()
csv_files = ['facebook_wiki_posts','facebook_wiki_responses','fitocracy_posts','fitocracy_responses','reddit_posts','reddit_responses','ted_responses','facebook_congress_posts','annotations','facebook_congress_responses']

annotations = pd.read_csv(folder_path+'annotations_combined.csv')
#posts_combined = pd.read_csv(folder_path+'posts_combined.csv')
#sources_combined = pd.read_csv(folder_path+'sources_combined_output.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Define Functions


In [6]:
# Evaluate a model
def model_eval(model, X_test, y_test, y_pred):
    """
    Evaluates a specified model using accuracy, precision, recall, F-1 score, AUC, log-Loss, and a confusion matrix.

    Parameters:
    - model: The trained model to be evaluated.
    - X_test (list or array): Test set features.
    - y_test (list or array): True labels.
    - y_pred (list or array): Predicted labels.

    Returns:
    - metrics_df (pd.DataFrame): DataFrame containing the metrics and scores.
    - confusion_df (pd.DataFrame): DataFrame containing a confusion matrix.
    """
    # Initialize dataframes
    metrics_df = pd.DataFrame(columns=['Metric', 'Score'])
    confusion_df = pd.DataFrame(columns=['Actual Positive', 'Actual Negative', 'Predicted Positive', 'Predicted Negative'])

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    metrics_df = pd.concat([metrics_df, pd.DataFrame({'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
                                                      'Score': [accuracy, precision, recall, f1]})])

    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    auc = roc_auc_score(y_test, y_pred)
    metrics_df = pd.concat([metrics_df, pd.DataFrame({'Metric': ['AUC'],
                                                      'Score': [auc]})])

    logloss = log_loss(y_test, model.predict_proba(X_test))
    metrics_df = pd.concat([metrics_df, pd.DataFrame({'Metric': ['Log-Loss'],
                                                      'Score': [logloss]})])

    # Reset index
    metrics_df = metrics_df.reset_index(drop=True)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    confusion_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])

    # Print dataframes
    print("Metrics:")
    print(metrics_df)

    print("\nConfusion Matrix:")
    print(confusion_df)

    return metrics_df, confusion_df

# Train, Validate, Test Split

In [7]:
# Annotation only
# Set train-test split variables
X = annotations['response_text']
y = annotations['op_gender_binary']

# Perform stratified train-test split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Then, split the temp set into validation and test sets
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# All responses combined
# Set train-test split variables
X = responses_combined['response_text']
y = responses_combined['op_gender_binary']

# Perform stratified train-test split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=responses_combined['source']
)

# Then, split the temp set into validation and test sets
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=responses_combined['source']
)

## Vectorization

## Bag of Words

### Unigram

In [8]:
# CountVectorizer
vectorizer_count = CountVectorizer()#stop_words='english'
X_train_vcount = vectorizer_count.fit_transform(X_train)
X_test_vcount = vectorizer_count.transform(X_test)

In [None]:
# Save the vectorizer and associated data
joblib.dump(vectorizer_count,folder_path+'count_vectorizer.pkl')
joblib.dump(X_train_vcount, folder_path+'X_train_vcount.pkl')
joblib.dump(X_test_vcount, folder_path+'X_test_vectorized.pkl')

In [None]:
# TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer()#stop_words='english'
X_train_vtfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_vtfidf = vectorizer_tfidf.transform(X_test)

In [None]:
# Save the vectorizer and associated data
joblib.dump(vectorizer_tfidf,folder_path+'tfidf_vectorizer.pkl')
joblib.dump(X_train_vtfidf, folder_path+'X_train_vtfidf.pkl')
joblib.dump(X_test_vtfidf, folder_path+'X_test_vtfidf.pkl')

## Random forest

In [11]:
# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    rf_classifier, param_grid, cv=3, scoring='accuracy', n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(X_train_vcount, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train a new Random Forest model with the best parameters
best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train_vcount, y_train)

# Evaluate the model on the validation set
y_val_pred = best_rf_model.predict(vectorizer_count.transform(X_validation))
metrics_val_df, confusion_val_df = model_eval(best_rf_model, vectorizer_count.transform(X_validation), y_validation, y_val_pred)


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Metrics:
      Metric     Score
0   Accuracy  0.592528
1  Precision  0.595257
2     Recall  0.638677
3   F1-Score  0.616203
4        AUC  0.591378
5   Log-Loss  0.655784

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 611                 512
Actual Negative                 426                 753


In [None]:
# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vcount, y_train)

# Continue with predictions and evaluation
y_pred = model.predict(X_test_vcount)
metrics_df, confusion_df = model_eval(model, X_test_vcount, y_test, y_pred)

In [None]:
# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vtfidf, y_train)

# Continue with predictions and evaluation
y_pred = model.predict(X_test_vtfidf)
metrics_df, confusion_df = model_eval(model, X_test_vtfidf, y_test, y_pred)

## Logistic Regression

In [None]:
# Logistic Regression model using count vectorization
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vcount, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_vcount)

# Evaluate model
metrics_df, confusion_df = model_eval(model, X_test_vtfidf, y_test, y_pred)

In [None]:
# Logistic Regression model using TF-IDF vectorization
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vtfidf, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_vtfidf)

# Evaluate model
metrics_df, confusion_df = model_eval(model, X_test_vtfidf, y_test, y_pred)

### Bigram

In [12]:
# CountVectorizer
vectorizer_count = CountVectorizer( ngram_range=(2, 2))#stop_words='english',
X_train_vcount = vectorizer_count.fit_transform(X_train)
X_test_vcount = vectorizer_count.transform(X_test)

In [None]:
# Save the vectorizer and associated data
joblib.dump(vectorizer_count,folder_path+'count_vectorizer_bi.pkl')
joblib.dump(X_train_vcount, folder_path+'X_train_vcount_bi.pkl')
joblib.dump(X_test_vcount, folder_path+'X_test_vectorized_bi.pkl')

In [13]:
# TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer( ngram_range=(2, 2))#stop_words='english',
X_train_vtfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_vtfidf = vectorizer_tfidf.transform(X_test)

In [None]:
# Save the vectorizer and associated data
joblib.dump(vectorizer_tfidf,folder_path+'tfidf_vectorizer_bi.pkl')
joblib.dump(X_train_vtfidf, folder_path+'X_train_vtfidf_bi.pkl')
joblib.dump(X_test_vtfidf, folder_path+'X_test_vtfidf_bi.pkl')

In [None]:
# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    rf_classifier, param_grid, cv=3, scoring='accuracy', n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(X_train_vcount, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train a new Random Forest model with the best parameters
best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train_vcount, y_train)

# Evaluate the model on the validation set
y_val_pred = best_rf_model.predict(vectorizer_count.transform(X_validation))
metrics_val_df, confusion_val_df = model_eval(best_rf_model, vectorizer_count.transform(X_validation), y_validation, y_val_pred)


## Logistic Regression

In [None]:
# Logistic Regression model using count vectorization
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vcount, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_vcount)

# Evaluate model
metrics_df, confusion_df = model_eval(model, X_test_vtfidf, y_test, y_pred)

In [None]:
# Logistic Regression model using TF-IDF vectorization
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vtfidf, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_vtfidf)

# Evaluate model
metrics_df, confusion_df = model_eval(model, X_test_vtfidf, y_test, y_pred)

In [None]:
# Create DataFrame with the predictions
df_predictions = pd.DataFrame({'Predictions': y_pred})

# Save the DataFrame to a CSV file
df_predictions.to_csv(log_ngram_pred_output, index=False)

## XGBoost

In [None]:
# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Create an XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    xgb_classifier, param_grid, cv=3, scoring='accuracy', n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(X_train_vcount, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train a new XGBoost model with the best parameters
best_xgb_model = XGBClassifier(random_state=42, **best_params)
best_xgb_model.fit(X_train_vcount, y_train)

# Evaluate the model on the validation set
y_val_pred = best_xgb_model.predict(vectorizer_count.transform(X_validation))
metrics_val_df, confusion_val_df = model_eval(best_xgb_model, vectorizer_count.transform(X_validation), y_validation, y_val_pred)
