# Modelling: Gradient Boosting and LSTM


## Imports

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time
import spacy
import ast
import joblib


from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from ast import literal_eval



from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from gensim.models import Word2Vec

from sklearn.ensemble import GradientBoostingClassifier

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



In [2]:
# Load the spaCy pipeline named 'en_core_web_sm' (the model package has to be installed separately).
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Register tqdm with pandas so that the `progress_apply` family of methods
# shows a progress bar while an operation runs over a DataFrame or Series.
tqdm.pandas()

## Data Extraction

In [4]:
# Read the cleaned dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests an undersampled 60/40 class split — confirm.
df = pd.read_csv('data/data_usampl_60_40_cleaned.csv')


In [5]:
# List the column names of the loaded frame.
df.columns

Index(['raw', 'clean', 'clean_pp', 'clean_pp_lemma', 'clean_pp_lemma_stop',
       'toxic'],
      dtype='object')

## Helper function to collect evaluation results

In [6]:
# Accumulator for the per-model result rows; later cells append to it with pd.concat.
results_df = pd.DataFrame()


def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="", parameters='', comments=''):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model :
        Estimator exposing ``fit`` and ``predict`` plus either ``predict_proba``
        or ``decision_function`` (used to obtain the scores for AUC-ROC).
    X_train, y_train :
        Training features and labels, passed straight to ``model.fit``.
    X_test, y_test :
        Held-out features and labels used for every metric.
    model_name : str, optional
        Label stored under 'Name'; falls back to the estimator's class name when empty.
    parameters : str, optional
        Free-text description stored under 'Parameters'.
    comments : str, optional
        Free-text note stored under 'Comments'.

    Returns
    -------
    dict
        One result row with the keys 'Name', 'Parameters', 'F1-Score', 'AUC-ROC',
        'Precision', 'Recall', 'Accuracy', 'Confusion Matrix' (stringified),
        'Training Time' (formatted string) and 'Comments'.

    Notes
    -----
    The metrics are computed with scikit-learn's default (binary) settings and the
    positive-class score is taken from column 1 of ``predict_proba``, so the
    function is meant for binary classification.
    """
    # Time the fit call only, so that 'Training Time' is not inflated by inference.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    duration = time.perf_counter() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    predictions = model.predict(X_test)

    # AUC-ROC needs a continuous score: prefer the positive-class probability and
    # fall back to the decision function for estimators without predict_proba.
    if hasattr(model, "predict_proba"):
        predicted_scores = model.predict_proba(X_test)[:, 1]
    else:
        predicted_scores = model.decision_function(X_test)

    # Threshold-based metrics use the hard predictions; AUC-ROC uses the scores.
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_scores)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    # Stored as a string so the matrix fits in a single DataFrame cell.
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Assemble the result row
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## Gradient Boosting

In [7]:
# Feature: the `clean_pp_lemma` text column; target: the `toxic` label.
X, y = df['clean_pp_lemma'], df['toxic']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Learn the TF-IDF vocabulary and weights from the training split only,
# producing the training matrix in the same step.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [8]:
# Vectorise the test split with the vocabulary learned from the training split.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Seed the classifier so that the reported metrics are reproducible across runs
# (same seed as the train/test split).
gb_classifier = GradientBoostingClassifier(random_state=42)

# Fit and score the classifier; the result is a single row (dict) of metrics.
results_gb = evaluate_model(
    gb_classifier,
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    model_name="Gradient Boosting Classifier",
    parameters="TF-IDF",
)

# Convert the dictionary to a one-row DataFrame
results_df_gb = pd.DataFrame([results_gb])

# Append the new row. Dropping older rows with the same Name/Parameters pair keeps
# the cell idempotent: re-running it replaces the previous entry instead of
# stacking duplicates in the results table.
results_df = (
    pd.concat([results_df, results_df_gb], ignore_index=True)
    .drop_duplicates(subset=['Name', 'Parameters'], keep='last')
    .reset_index(drop=True)
)

# Display the results
print(results_df)


                           Name Parameters  F1-Score   AUC-ROC  Precision  \
0  Gradient Boosting Classifier     TF-IDF  0.576923  0.736484   0.882353   

     Recall  Accuracy     Confusion Matrix               Training Time  \
0  0.428571      0.78  [[63  2]\n [20 15]]  0 minutes and 1.57 seconds   

  Comments  
0           


In [9]:
# Rich display of the accumulated results table.
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Gradient Boosting Classifier,TF-IDF,0.576923,0.736484,0.882353,0.428571,0.78,[[63 2]\n [20 15]],0 minutes and 1.57 seconds,


## LSTM

In [10]:
# Pull the text feature and the label out of the frame as arrays.
X = df['clean_pp_lemma'].values
y = df['toxic'].values

# 80/20 train/test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Vocabulary cap handed to the tokenizer, and the target length of each sequence.
max_words = 10000
max_len = 100

# Build the word index from the training texts only, then encode both splits
# as sequences of integer word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [11]:
# Persist the fitted tokenizer so the same word index can be reloaded later
# (e.g. during model deployment).
tokenizer_file_path = 'data/tokenizer.pkl'
with open(tokenizer_file_path, mode='wb') as pkl_out:
    pickle.dump(tokenizer, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)


In [12]:
# Bring every encoded text to exactly max_len ids (shorter ones are padded,
# longer ones truncated).
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

# Network: embedding -> LSTM -> single sigmoid unit for binary classification.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(units=64),
    Dense(units=1, activation='sigmoid'),
])

In [13]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs in mini-batches of 32.
# NOTE: the held-out test split doubles as validation data here, so the per-epoch
# validation metrics are computed on the same rows used for the final evaluation.
model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    batch_size=32,
    epochs=5,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f126bb3850>

In [14]:
# Serialise the architecture (without weights) to a JSON file.
model_json = model.to_json()
with open('data/model5.json', mode='w') as arch_out:
    arch_out.write(model_json)

# The learned weights are written to a separate .h5 file.
model.save_weights('data/model_weights5.h5')

In [15]:
# Run inference once and reuse the predicted probabilities for both the hard
# labels and AUC-ROC (previously the model was evaluated twice on the same data).
y_pred_proba = model.predict(X_test_padded)

# Hard class labels at a 0.5 decision threshold
y_pred = (y_pred_proba > 0.5).astype(int)

# Threshold-based metrics use the hard labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# AUC-ROC is threshold-free and therefore uses the raw probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.65
Precision: 0.5
Recall: 0.7714285714285715
F1 Score: 0.6067415730337079
AUC-ROC: 0.7296703296703297
Confusion Matrix:
[[38 27]
 [ 8 27]]
