<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/Supervised_Learning_Dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Next Steps:

*  Test not removing stop words
*  Fix Log regression issue
  * May be a class imbalance
* Score random search on AUC-PR
* Use grid search rather than random search in great lakes cluster.




# Import, Download, & Variable Statements

In [3]:
# Import & download statements
# General Statements
!git clone https://github.com/d-atallah/implicit_gender_bias.git
import pandas as pd
import string
import re
import joblib
from implicit_gender_bias import config as cf
import os
import numpy as np

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

fatal: destination path 'implicit_gender_bias' already exists and is not an empty directory.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Variables
# Inputs
folder_path = cf.filepath()
csv_files = ['facebook_wiki_posts','facebook_wiki_responses','fitocracy_posts','fitocracy_responses','reddit_posts','reddit_responses','ted_responses','facebook_congress_posts','annotations','facebook_congress_responses']

annotations = pd.read_csv(folder_path+'annotations_combined.csv')
#posts_combined = pd.read_csv(folder_path+'posts_combined.csv')
#sources_combined = pd.read_csv(folder_path+'sources_combined_output.csv')

Mounted at /content/drive


## Define Functions


In [5]:
# Evaluate a model
def model_eval(model, X_test, y_test, y_pred):
    """
    Evaluates a specified model using accuracy, precision, recall, F-1 score, AUC, log-Loss, and a confusion matrix.

    Parameters:
    - model: The trained model to be evaluated.
    - X_test (list or array): Test set features.
    - y_test (list or array): True labels.
    - y_pred (list or array): Predicted labels.

    Returns:
    - metrics_df (pd.DataFrame): DataFrame containing the metrics and scores.
    - confusion_df (pd.DataFrame): DataFrame containing a confusion matrix.
    """
    # Initialize dataframes
    metrics_df = pd.DataFrame(columns=['Metric', 'Score'])
    confusion_df = pd.DataFrame(columns=['Actual Positive', 'Actual Negative', 'Predicted Positive', 'Predicted Negative'])

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    metrics_df = pd.concat([metrics_df, pd.DataFrame({'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
                                                      'Score': [accuracy, precision, recall, f1]})])

    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    auc = roc_auc_score(y_test, y_pred)
    metrics_df = pd.concat([metrics_df, pd.DataFrame({'Metric': ['AUC'],
                                                      'Score': [auc]})])

    logloss = log_loss(y_test, model.predict_proba(X_test))
    metrics_df = pd.concat([metrics_df, pd.DataFrame({'Metric': ['Log-Loss'],
                                                      'Score': [logloss]})])

    # Reset index
    metrics_df = metrics_df.reset_index(drop=True)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    confusion_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])

    # Print dataframes
    print("Metrics:")
    print(metrics_df)

    print("\nConfusion Matrix:")
    print(confusion_df)

    return metrics_df, confusion_df

In [6]:
def preprocess_text(text):
    """
    Applies text preprocessing to a given text, including:
    - Removing special characters and digits
    - Converting to lowercase
    - Tokenization and removing stopwords
    - Lemmatization and stemming

    Parameters:
    - text (str): Input text to be preprocessed.

    Returns:
    - processed_text (str): Preprocessed text after applying the specified steps.
    """
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenization and removing stopwords
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization and stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [porter.stem(word) for word in tokens]

    # Rejoin tokens into a processed text
    processed_text = ' '.join(tokens)

    return processed_text

In [7]:
def model_search(X_train, y_train, X_validation, y_validation, X_test, model_type, vectorizer, ngram, search_type, param_grid):
  """
  Searches for the best hyperparameters for a specified model and dimensionality reduction method using GridSearchCV or RandomizedSearchCV.

  Parameters:
  - X_train (array-like): Training set features, preprocessed.
  - y_train (array-like): Training set labels.
  - X_validation (array-like): Validation set features, preprocessed.
  - y_validation (array-like): Validation set labels.
  - X_test (array-like): Test set features, preprocessed.
  - model_type (str): Type of model to test. Choose from 'log' (Logistic Regression), 'xgb' (XGBoost), or 'rf' (Random Forest).
  - vectorizer (str): Type of vectorizer to test. Choose from 'count' (Count Vecotizer) or 'tfidf' (TF-IDF Vecotizer).
  - ngram (int): Feature representation to test. Choose 1 for unigrams, 2 for bigrams, and so on.
  - search_type (str): Defines grid search or random search style. Choose from 'grid' (Grid Search), 'rand' (Random Search).
  - param_grid (dict): Hyperparameter grid for the specified model and dimensionality reduction method.

  Returns:
  - selected_model: Trained model with the best hyperparameters.
  - selected_params (dict): Best hyperparameters found during the search.
  - X_train_ (array-like): Vectorized training set features.
  - X_validation_ (array-like): Vectorized validation set features.
  - X_test_ (array-like): Vectorized test set features.
  """
  if vectorizer == 'count':
    vect = CountVectorizer(ngram_range=(ngram, ngram))
    X_train_ = vect.fit_transform(X_train)
    X_validation_ = vect.transform(X_validation)
    X_test_ = vect.transform(X_test)

  elif vectorizer == 'tfidf':
    vect = TfidfVectorizer(ngram_range=(ngram, ngram))
    X_train_ = vect.fit_transform(X_train)
    X_validation_ = vect.transform(X_validation)
    X_test_ = vect.transform(X_test)

  else:
      raise ValueError("Invalid vector type. Use 'count' or 'tfidf'.")

  if model_type == 'log':
      model = LogisticRegression(max_iter=1000, random_state=42)
  elif model_type == 'xgb':
      model = XGBClassifier(random_state=42)
  elif model_type == 'rf':
      model = RandomForestClassifier(random_state=42)
  else:
      raise ValueError("Invalid model type. Use 'xgb', 'rf', or 'log'.")

  # Pipeline with dimensionality reduction method and model to test
  #Chose SVD ad reduction method because the data is sparse (PCA and NMF not applicable)
  pipeline = make_pipeline(
    TruncatedSVD(random_state=42),
    model
  )

  # Cross-validation StratifiedKFold for classification (Reduce risk of overfitting )
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  if search_type == 'grid':
    # Use F1-score as the scoring metric in GridSearchCV (This can be changed to any other metric)

    search = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring=make_scorer(f1_score), n_jobs=-1, random_state=42
    )
    # Fit the grid search to the data
    search.fit(X_train_, y_train)

  elif search_type == 'random':

    # Use F1-score as the scoring metric in RandomizedSearchCV
    search = RandomizedSearchCV(
      pipeline, param_distributions=param_grid, cv=cv, scoring='f1', n_iter=10, n_jobs=-1, random_state=42
    )
    # Fit random search to the data
    search.fit(X_train_, y_train)

  else:
    raise ValueError("Invalid search type. Use 'grid' or 'random'.")

  # Get best parameters
  selected_params = search.best_params_
  print(f"Hyperparameters:", selected_params)

  # Train a new model with the best hyperparameters
  selected_model = search.best_estimator_

  # Evaluate the model on the validation set
  y_val_pred = selected_model.predict(X_validation_)
  metrics_val_df, confusion_val_df = model_eval(selected_model, X_validation_, y_validation, y_val_pred)

  return selected_model, selected_params, X_train_, X_validation_, X_test_

# Train, Validate, Test Split

In [8]:
# Annotation only
# Set train-test split variables
X = annotations['response_text']
y = annotations['op_gender_binary']

# Perform stratified train-test split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Then, split the temp set into validation and test sets
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# All responses combined
# Set train-test split variables
X = responses_combined['response_text']
y = responses_combined['op_gender_binary']

# Perform stratified train-test split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=responses_combined['source']
)

# Then, split the temp set into validation and test sets
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=responses_combined['source']
)

In [9]:
# Apply preprocessing to each set (X_train, X_validation, X_test)
X_train_preprocessed = X_train.apply(preprocess_text)
X_validation_preprocessed = X_validation.apply(preprocess_text)
X_test_preprocessed = X_test.apply(preprocess_text)

## XGBoost

In [10]:
# Define the parameter grid
param_grid = {
  'truncatedsvd__n_components': [150, 200, 250],  # Number of components to keep after dimensionality reduction using Truncated SVD
  'xgbclassifier__n_estimators': [50, 100, 150],  # Number of boosting rounds (trees) in the XGBoost model
  'xgbclassifier__max_depth': [3, 5, 7],  # Maximum depth of each tree in the XGBoost model
  'xgbclassifier__learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage used in boosting (controls the learning rate)
  'xgbclassifier__subsample': [0.8, 1.0],  # Fraction of samples used for training each tree (subsample ratio)
  'xgbclassifier__colsample_bytree':  [0.8, 1.0],  # Fraction of features used for training each tree (column subsampling ratio)
}

Below is a test based on random search (Will implement in near future)

In [None]:
enhanced_param_grid = {
  'truncatedsvd__n_components': [150, 175, 200, 225, 250],  # Enhanced range for the number of components in Truncated SVD
  'xgbclassifier__n_estimators': [30, 50, 70],  # Enhanced range for the number of boosting rounds in XGBoost
  'xgbclassifier__max_depth': [2, 3, 4],  # Enhanced range for the maximum depth of each tree in XGBoost
  'xgbclassifier__learning_rate': [0.005, 0.01, 0.05, 0.1, 0.15],  # Enhanced range for the learning rate in XGBoost
  'xgbclassifier__subsample': [0.5, 0.6, 0.8],  # Enhanced range for the subsample ratio in XGBoost
  'xgbclassifier__colsample_bytree':  [0.5, 0.6, 0.8],  # Enhanced range for the column subsampling ratio in XGBoost
}

### XGB Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram



In [None]:
# Define variables
model = 'xgb'
vectorization = 'count'
ngram = 1
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
xgb_count_1 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'xgbclassifier__subsample': 0.8, 'xgbclassifier__n_estimators': 50, 'xgbclassifier__max_depth': 3, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__colsample_bytree': 0.8, 'truncatedsvd__n_components': 200}
Metrics:
      Metric     Score
0   Accuracy  0.533884
1  Precision  0.531361
2     Recall  0.761662
3   F1-Score  0.626002
4        AUC  0.528204
5   Log-Loss  0.691006

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 331                 792
Actual Negative                 281                 898


### XGB Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Unigram

In [None]:
# Define variables
model = 'xgb'
vectorization = 'tfidf'
ngram = 1
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
xgb_tfidf_1 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'xgbclassifier__subsample': 0.8, 'xgbclassifier__n_estimators': 50, 'xgbclassifier__max_depth': 3, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__colsample_bytree': 0.8, 'truncatedsvd__n_components': 200}
Metrics:
      Metric     Score
0   Accuracy  0.538662
1  Precision  0.532810
2     Recall  0.805768
3   F1-Score  0.641458
4        AUC  0.532002
5   Log-Loss  0.690338

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 290                 833
Actual Negative                 229                 950


### XGB Model Method:
*   Vectorization: Count
*   Feature Representation: Bigram

In [None]:
# Define variables
model = 'xgb'
vectorization = 'count'
ngram = 2
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
xgb_count_2 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'xgbclassifier__subsample': 0.8, 'xgbclassifier__n_estimators': 50, 'xgbclassifier__max_depth': 3, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__colsample_bytree': 0.8, 'truncatedsvd__n_components': 200}
Metrics:
      Metric     Score
0   Accuracy  0.517811
1  Precision  0.515935
2     Recall  0.947413
3   F1-Score  0.668062
4        AUC  0.507099
5   Log-Loss  0.692302

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                  75                1048
Actual Negative                  62                1117


### XGB Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Bigram

In [None]:
# Define variables
model = 'xgb'
vectorization = 'tfidf'
ngram = 2
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
xgb_tfidf_2 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'xgbclassifier__subsample': 0.8, 'xgbclassifier__n_estimators': 50, 'xgbclassifier__max_depth': 3, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__colsample_bytree': 0.8, 'truncatedsvd__n_components': 200}
Metrics:
      Metric     Score
0   Accuracy  0.517376
1  Precision  0.517400
2     Recall  0.857506
3   F1-Score  0.645388
4        AUC  0.508896
5   Log-Loss  0.692628

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 180                 943
Actual Negative                 168                1011


## Logistic Regression

In [None]:
# Define the parameter grid
param_grid = {
    'logisticregression__solver': ['saga'],
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Adjust the range based on the characteristics of your data
}

### Logistic Regression Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram

In [None]:
# Define variables
model = 'log'
vectorization = 'count'
ngram = 1
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
log_count_1 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'logisticregression__solver': 'saga', 'logisticregression__penalty': 'l1', 'logisticregression__C': 0.01}
Metrics:
      Metric     Score
0   Accuracy  0.512163
1  Precision  0.512163
2     Recall  1.000000
3   F1-Score  0.677392
4        AUC  0.500000
5   Log-Loss  0.692876

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                   0                1123
Actual Negative                   0                1179


### Logistic Regression Model Method:
*   Vectorization: Count
*   Feature Representation: Bigram

In [None]:
# Define variables
model = 'log'
vectorization = 'count'
ngram = 2
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
log_count_2 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'logisticregression__solver': 'saga', 'logisticregression__penalty': 'l1', 'logisticregression__C': 0.01}
Metrics:
      Metric     Score
0   Accuracy  0.512163
1  Precision  0.512163
2     Recall  1.000000
3   F1-Score  0.677392
4        AUC  0.500000
5   Log-Loss  0.694080

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                   0                1123
Actual Negative                   0                1179


### Logistic Regression Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Unigram

In [None]:
# Define variables
model = 'log'
vectorization = 'tfidf'
ngram = 1
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
log_tfidf_1 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l1', 'logisticregression__C': 0.1}
Metrics:
      Metric     Score
0   Accuracy  0.512163
1  Precision  0.512163
2     Recall  1.000000
3   F1-Score  0.677392
4        AUC  0.500000
5   Log-Loss  0.692893

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                   0                1123
Actual Negative                   0                1179


### Logistic Regression Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Bigram

In [None]:
# Define variables
model = 'log'
vectorization = 'tfidf'
ngram = 2
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
log_tfidf_2 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

Hyperparameters: {'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l1', 'logisticregression__C': 0.1}
Metrics:
      Metric     Score
0   Accuracy  0.512163
1  Precision  0.512163
2     Recall  1.000000
3   F1-Score  0.677392
4        AUC  0.500000
5   Log-Loss  0.692893

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                   0                1123
Actual Negative                   0                1179


## Random forest

In [12]:
# Define the parameter grid
param_grid = {
    'truncatedsvd__n_components': [150, 200, 250],  # Number of components to keep after dimensionality reduction using Truncated SVD
    'randomforestclassifier__n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],  # Number of trees in the forest
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'randomforestclassifier__max_depth': [int(x) for x in np.linspace(10, 110, num=11)],  # Maximum depth of the tree
    'randomforestclassifier__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'randomforestclassifier__bootstrap': [True, False]  # Method of selecting samples for training each tree
}

### Random Forest Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram

In [None]:
# Define variables
model = 'rf'
vectorization = 'count'
ngram = 1
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
rf_count_1 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}



Hyperparameters: {'truncatedsvd__n_components': 200, 'randomforestclassifier__n_estimators': 1800, 'randomforestclassifier__min_samples_split': 5, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__max_depth': 80, 'randomforestclassifier__bootstrap': True}
Metrics:
      Metric     Score
0   Accuracy  0.567333
1  Precision  0.566161
2     Recall  0.664122
3   F1-Score  0.611241
4        AUC  0.564919
5   Log-Loss  0.666130

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 523                 600
Actual Negative                 396                 783


### Random Forest Model Method:
*   Vectorization: Count
*   Feature Representation: Bigram

In [None]:
# Define variables
model = 'rf'
vectorization = 'count'
ngram = 2
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
rf_count_2 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

### Random Forest Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Unigram

In [None]:
# Define variables
model = 'rf'
vectorization = 'tfidf'
ngram = 1
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
rf_tfidf_1 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

### Random Forest Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Bigram

In [None]:
# Define variables
model = 'rf'
vectorization = 'tfidf'
ngram = 2
search_type = 'random'

# Run model search
model,params,train,validation,test = model_search(X_train_preprocessed, y_train, X_validation_preprocessed, y_validation, X_test_preprocessed, model, vectorization, ngram, search_type, param_grid)

# Save results to dictionary
rf_tfidf_2 = {
    'model': model,
    'params': params,
    'X_train': train,
    'X_validation': validation,
    'X_test': test
}

# Evaluate model on test set




In [None]:
# Evaluate the model on the test set
y_test_pred = xgb_svd_model.predict(X_test_vcount_bi)
metrics_test_df, confusion_test_df = model_eval(xgb_svd_model, X_test_vcount_bi, y_test, y_test_pred)

# Write best model and data to shared drive

In [None]:
# Save the vectorizer and associated data
joblib.dump(vectorizer_tfidf_bi,folder_path+'tfidf_vectorizer_bi.pkl')
joblib.dump(X_train_vtfidf_bi, folder_path+'X_train_vtfidf_bi.pkl')
joblib.dump(X_validation_vtfidf_bi, folder_path+'X_validation_vtfidf_bi.pkl')
joblib.dump(X_test_vtfidf_bi, folder_path+'X_test_vtfidf_bi.pkl')