In [None]:
# Using TF-IDF features with Random Forest (RF), Gradient Boosting (GB), and XGBoost (XGB) to classify Decision from the Issue text

In [None]:
# Mount Google Drive so the Excel inputs under /content/drive are readable.
# This cell only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
# Paths to the Excel workbooks on the mounted Google Drive.
# FILE1 / FILE2: Issues train / test splits (loaded below).
# FILE3 / FILE4: Facts train / test splits (declared here; not read in the cells shown in this notebook section).
INPUT_EXCEL_FILE1 = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_Issues_train.xlsx"
INPUT_EXCEL_FILE2 = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_Issues_test.xlsx"
INPUT_EXCEL_FILE3 = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_Facts_train.xlsx"
INPUT_EXCEL_FILE4 = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_Facts_test.xlsx"

In [None]:
# Load the Issues train and test splits into DataFrames.
# The split is fixed by the two input files; no splitting happens in this notebook.
df_train_issues = pd.read_excel(INPUT_EXCEL_FILE1)
df_test_issues = pd.read_excel(INPUT_EXCEL_FILE2)

In [None]:
df_train_issues.columns

In [None]:
df_train_issues.shape
#should be 4882

In [None]:
df_test_issues.shape
#should be 1221

In [None]:
#check unique values in 'Decision' column
df_train_issues['Decision'].unique()

In [None]:
#check the distribution of the Decision variable in train and test as proportions
df_train_issues['Decision'].value_counts(normalize=True)
#should be about 63%-37%

In [None]:
df_test_issues['Decision'].value_counts(normalize=True)

In [None]:
# Model inputs: the text column 'Preprocesses_Issues' as plain Python lists.
# NOTE(review): a missing (NaN) cell would reach the vectorizer as a non-string — confirm the column has no blanks.
train_text = df_train_issues['Preprocesses_Issues'].tolist()
test_text = df_test_issues['Preprocesses_Issues'].tolist()

# Targets: the 'Decision' column.
# Later cells call sum() on these labels, so they presumably are 0/1 integers — TODO confirm.
train_labels = df_train_issues['Decision'].tolist()
test_labels = df_test_issues['Decision'].tolist()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import nltk
nltk.download('punkt')  # Download tokenizer models

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Tokenization function
def tokenize_text(texts):
    """
    Lowercase each sentence and split it into word tokens.

    Parameters:
    - texts: Iterable of raw sentences (strings)

    Returns:
    - List holding one list of tokens per input sentence, in input order
    """
    tokenized = []
    for sentence in texts:
        # Lowercase first so tokens are case-insensitive, then tokenize
        tokenized.append(word_tokenize(sentence.lower()))
    return tokenized

# Function to convert text data into TF-IDF features
def loadDataTfidf(X_train, X_test, max_features=75000):
    """
    Turn raw train/test sentences into dense TF-IDF feature arrays.

    The vectorizer is fitted on the training sentences only; the test
    sentences are transformed with that fitted vectorizer.

    Parameters:
    - X_train: List of raw training sentences
    - X_test: List of raw test sentences
    - max_features: Upper bound passed to TfidfVectorizer(max_features=...)

    Returns:
    - Tuple (train_array, test_array) of dense arrays produced by .toarray()
    """
    tfidf = TfidfVectorizer(max_features=max_features)

    # Fit on train and densify; the same fitted vectorizer then transforms test
    train_dense = tfidf.fit_transform(X_train).toarray()
    test_dense = tfidf.transform(X_test).toarray()

    n_features = train_dense.shape[1]
    print(f"TF-IDF with {n_features} features")

    return train_dense, test_dense


In [None]:
# Convert to numerical features using TF-IDF.
# max_features=100 overrides the function default (75000), capping the feature count at 100.
X_train_tfidf, X_test_tfidf = loadDataTfidf(train_text, test_text, max_features=100)

In [None]:
X_train_tfidf[0]

In [None]:
len(X_train_tfidf)

In [None]:
# Alias the label lists under the y_train / y_test names used by the modelling cells below.
y_train=train_labels
y_test=test_labels

In [None]:
from imblearn.metrics import specificity_score

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)


In [None]:
# create a code for calculating model performance metrics and deliver it as one single data frame
def model_metrics(model, predictors, targets):
  """
  Score a fitted classifier and return the metrics as a one-row DataFrame.

  Parameters:
  - model: fitted estimator exposing predict() and predict_proba()
  - predictors: feature matrix to score
  - targets: true labels for `predictors`

  Returns:
  - DataFrame with index ['Metrics'] and columns
    Accuracy, Recall, Specificity, Precision, F1, AUC
  """
  predicted_labels = model.predict(predictors)
  # Column 1 of predict_proba feeds the AUC (presumably the positive class — verify class order)
  positive_scores = model.predict_proba(predictors)[:, 1]

  scores = {
      'Accuracy': accuracy_score(targets, predicted_labels),
      'Recall': recall_score(targets, predicted_labels),
      'Specificity': specificity_score(targets, predicted_labels, average='binary'),
      'Precision': precision_score(targets, predicted_labels),
      'F1': f1_score(targets, predicted_labels),
      'AUC': roc_auc_score(targets, positive_scores),
  }

  return pd.DataFrame(scores, index=['Metrics'])

In [None]:
# Calculate and Display confusion matrix

def display_confusion_matrix(model, predictors, targets):
  """
  Plot the confusion matrix of `model`'s predictions on `predictors`.

  Parameters:
  - model: fitted estimator exposing predict()
  - predictors: feature matrix to predict on
  - targets: true labels for `predictors`

  Returns nothing; the figure is shown with plt.show().
  """
  matrix = confusion_matrix(targets, model.predict(predictors))
  # Axis tick labels are fixed to 0 and 1
  ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0, 1]).plot(cmap=plt.cm.Blues)
  plt.title("Confusion Matrix")
  plt.show()

## Training Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Baseline Random Forest (100 trees, fixed seed for reproducibility)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 2: Train the model on the TF-IDF training features
rf_classifier.fit(X_train_tfidf, y_train)

# Step 3: Score on the test set (model_metrics calls predict / predict_proba itself)
test_performance=model_metrics(rf_classifier, X_test_tfidf, y_test)
test_performance

In [None]:
display_confusion_matrix(rf_classifier, X_test_tfidf, y_test)

In [None]:
# Tune the Random Forest with a randomized hyper-parameter search
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
# Base estimator; class_weight='balanced' reweights the classes during training
Model =RandomForestClassifier(random_state=42, class_weight='balanced')

# Candidate values sampled by RandomizedSearchCV
param_grid = {
    # Range 50, 75, 100 — same start/stop/step form as the GB and XGB grids.
    # (The literal list [50, 110, 25] would have tried 50, 110 and 25 trees.)
    "n_estimators": np.arange(50, 110, 25),
    "min_samples_leaf": np.arange(1, 4),
    "max_features": [0.3, 0.4, 0.5, 'sqrt'],
    # Explicit values: np.arange with a float step yields inexact values
    # (e.g. 0.6000000000000001) and an endpoint-dependent length.
    "max_samples": [0.4, 0.5, 0.6],
}

# Parameter combinations are compared on recall only
scorer = metrics.make_scorer(metrics.recall_score)

# 10 sampled combinations, 5-fold CV, all cores; random_state fixes the sampled combinations
randomized_cv_RF = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

# Run the search on the training data
randomized_cv_RF.fit(X_train_tfidf, y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv_RF.best_params_,randomized_cv_RF.best_score_))

In [None]:
# Refit a Random Forest on the full training set with the best parameters from the search.
# Note: `best_params` is reassigned by the GB and XGB tuning cells further down.
best_params=randomized_cv_RF.best_params_
tuned_RF=RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)
tuned_RF.fit(X_train_tfidf, y_train)

In [None]:
test_performance_tunedRF=model_metrics(tuned_RF, X_test_tfidf, y_test)
test_performance_tunedRF.round(4)

## Training Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Baseline Gradient Boosting classifier (100 stages, depth-3 trees, fixed seed)
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Step 2: Train the model on the TF-IDF training features
gb_classifier.fit(X_train_tfidf, y_train)

# Step 3: Score on the test set; this overwrites the `test_performance` set by the RF baseline cell
test_performance=model_metrics(gb_classifier, X_test_tfidf, y_test)
test_performance

In [None]:
display_confusion_matrix(gb_classifier, X_test_tfidf, y_test)

In [None]:
# Tune Gradient Boosting with a randomized hyper-parameter search

# To help with model building
# (XGBClassifier imported here is also used by the XGBoost tuning cells further down)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


Model =GradientBoostingClassifier(random_state=42)

# Candidate values sampled by RandomizedSearchCV
param_grid = {
    "n_estimators": np.arange(50,110,25),
    "learning_rate": [0.01,0.1,0.05],
    "subsample":[0.7,0.9],
    # Fractions of the features considered per split. 1.0 (float) means all
    # features; the integer 1 would mean exactly ONE feature per split.
    "max_features":[0.5,0.7,1.0],
    "max_depth": [3, 5, 7],
}

# Parameter combinations are compared on recall only
scorer = metrics.make_scorer(metrics.recall_score)

# 10 sampled combinations, 5-fold CV, all cores; random_state fixes the sampled combinations
randomized_cv_GB = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

# Run the search on the training data
randomized_cv_GB.fit(X_train_tfidf, y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv_GB.best_params_,randomized_cv_GB.best_score_))

In [None]:
# Refit Gradient Boosting on the full training set with the best parameters from the search.
# Note: this overwrites the `best_params` set by the RF tuning cell.
best_params=randomized_cv_GB.best_params_
tuned_GB=GradientBoostingClassifier(random_state=42,  **best_params)
tuned_GB.fit(X_train_tfidf, y_train)

In [None]:
test_performance_tunedGB=model_metrics(tuned_GB, X_test_tfidf, y_test)
test_performance_tunedGB.round(4)

## Using XGBoost classifier

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Baseline XGBoost classifier (100 rounds, depth-3 trees, fixed seed)
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Step 2: Train the model on the TF-IDF training features
xgb_classifier.fit(X_train_tfidf, y_train)

# Step 3: Score on the test set; this overwrites the `test_performance` set by the earlier baseline cells
test_performance=model_metrics(xgb_classifier, X_test_tfidf, y_test)
test_performance

In [None]:
from sklearn import metrics

# Base estimator for the search.
# XGBClassifier is imported by name in the Gradient Boosting tuning cell; that cell must have run first.
Model = XGBClassifier(random_state=42)

# Candidate values sampled by RandomizedSearchCV.
# scale_pos_weight is part of the grid, so the winning parameter set will include it.
param_grid={'n_estimators':np.arange(50,110,25),
            'scale_pos_weight':[1,2,5],
            'learning_rate':[0.01,0.1,0.05],
            'gamma':[1,3],
            'subsample':[0.7,0.9]
}

# Parameter combinations are compared on recall only
scorer = metrics.make_scorer(metrics.recall_score)

# 10 sampled combinations, 5-fold CV, all cores; random_state fixes the sampled combinations
randomized_cv_XGB = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

# Run the search on the training data
randomized_cv_XGB.fit(X_train_tfidf, y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv_XGB.best_params_,randomized_cv_XGB.best_score_))

In [None]:
# Refit XGBoost on the full training set with the best parameters from the search.
# Note: this overwrites the `best_params` set by the RF and GB tuning cells;
# the resampling cells below read this same variable.
best_params=randomized_cv_XGB.best_params_
tuned_XGB=XGBClassifier(random_state=42,  **best_params)
tuned_XGB.fit(X_train_tfidf, y_train)

In [None]:
test_performance_tunedXGB=model_metrics(tuned_XGB, X_test_tfidf, y_test)
test_performance_tunedXGB.round(4)

## XGBoost - Using Oversampling and Undersampling Approaches

In [None]:
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Class counts in the original training labels.
# sum(y_train) counts label 1 only if labels are 0/1 integers — TODO confirm.
print("Before Oversampling, counts of label '1': {}".format(sum(y_train)))
print("Before Oversampling, counts of label '0': {} \n".format(len(y_train)-sum(y_train)))

In [None]:
#For oversampling
# Oversample the training set with SMOTE; the test set is left untouched.
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
# Resampling is applied to the TF-IDF training features and labels only
X_train_over, y_train_over = sm.fit_resample(
  X_train_tfidf, y_train)

In [None]:
# Undersample the training set with RandomUnderSampler (fixed seed); the test set is left untouched.

rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train_tfidf, y_train)

In [None]:
# Class counts after SMOTE (same 0/1-label assumption as the "before" counts — TODO confirm)
print("After Oversampling, counts of label '1': {}".format(sum(y_train_over)))
print("After Oversampling, counts of label '0': {} \n".format(len(y_train_over)-sum(y_train_over)))

In [None]:
# Class counts after random undersampling (same 0/1-label assumption as the "before" counts — TODO confirm)
print("After undersampling, counts of label '1': {}".format(sum(y_train_un)))
print("After undersampling, counts of label '0': {} \n".format(len(y_train_un)-sum(y_train_un)))

In [None]:
# Oversampling and XGB
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Initialize the XGBoost Classifier with the parameters found by the XGBoost search.
# They are read from randomized_cv_XGB directly: the shared `best_params` variable is
# reassigned by the RF, GB and XGB tuning cells, so its content depends on which cell ran last.
# NOTE(review): the searched grid includes scale_pos_weight, tuned on the original class
# balance — confirm it is still wanted on the rebalanced training set.
xgb_classifier = xgb.XGBClassifier(random_state=42,  **randomized_cv_XGB.best_params_)

# Step 2: Train the model on the SMOTE-oversampled training set
xgb_classifier.fit(X_train_over, y_train_over)

# Step 3: Score on the original (not resampled) test set
test_performance_xgbover=model_metrics(xgb_classifier, X_test_tfidf, y_test)
test_performance_xgbover.round(4)

In [None]:
# Undersampling and XGB
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Initialize the XGBoost Classifier with the parameters found by the XGBoost search.
# They are read from randomized_cv_XGB directly: the shared `best_params` variable is
# reassigned by the RF, GB and XGB tuning cells, so its content depends on which cell ran last.
# NOTE(review): the searched grid includes scale_pos_weight, tuned on the original class
# balance — confirm it is still wanted on the rebalanced training set.
xgb_classifier = xgb.XGBClassifier(random_state=42,  **randomized_cv_XGB.best_params_)

# Step 2: Train the model on the randomly undersampled training set
xgb_classifier.fit(X_train_un, y_train_un)

# Step 3: Score on the original (not resampled) test set
test_performance_xgbunder=model_metrics(xgb_classifier, X_test_tfidf, y_test)
test_performance_xgbunder.round(4)

In [None]:
# Side-by-side test-set metrics, in order: tuned RF, tuned GB, tuned XGB,
# XGB trained on oversampled data, XGB trained on undersampled data.
print(test_performance_tunedRF.round(4))
print(test_performance_tunedGB.round(4))
print(test_performance_tunedXGB.round(4))
print(test_performance_xgbover.round(4))
print(test_performance_xgbunder.round(4))