In [4]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [5]:
df_combine = pd.read_csv('../datasets/00_output_datasets/dataset_combined.csv')

## Pre-processing standard Workflow

### Text Cleaning (Removing special characters & numbers and handling contractions)

In [6]:
# Function to clean text: remove special characters, numbers, and expand contractions
def clean_text(text):
    # Dictionary of English Contractions
    contractions_dict = {
        "I'm": "I am",
        "you're": "you are",
        "he's": "he is",
        "she's": "she is",
        "it's": "it is",
        "we're": "we are",
        "they're": "they are",
        "don't": "do not",
        "can't": "cannot",
        "won't": "will not",
        "isn't": "is not",
        "aren't": "are not",
        "didn't": "did not",
        "haven't": "have not",
        "wouldn't": "would not",
        "shouldn't": "should not",
        "couldn't": "could not"
            # Add more contractions as needed
    }
    # Regular expression for finding contractions
    contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

    # Function for expanding contractions
    def expand_contractions(s, contractions_dict=contractions_dict):
        def replace(match):
            return contractions_dict[match.group(0)]
        return contractions_re.sub(replace, s)

    # Expand Contractions
    text = expand_contractions(text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    
    return text

# Apply the cleaning function to both DataFrames
df_combine['text'] = df_combine['text'].apply(clean_text)

# Display the head of the combined DataFrame to verify the changes
display(df_combine.head())

Unnamed: 0,label,text
0,hate,I hate women
1,hate,I hate trans people
2,hate,I hate gay people
3,hate,I hate black people
4,hate,I hate disabled people


### Normalization

In [7]:
# Download necessary NLTK resources
nltk.download('punkt')  # For tokenization
nltk.download('wordnet')  # For lemmatization

# Now, initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /Users/jlangela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jlangela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Download the 'omw-1.4' resource to fix an error that I encountered further down below
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jlangela/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
# Convert text to lowercase
df_combine['text'] = df_combine['text'].str.lower()

# Initialize the NLTK WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Lemmatize each word in the text
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Rejoin lemmatized tokens into a single string
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

# Apply the lemmatization function to the 'text' column
df_combine['text'] = df_combine['text'].apply(lemmatize_text)

# Display the head of the DataFrame to verify the changes
df_combine.head()

Unnamed: 0,label,text
0,hate,i hate woman
1,hate,i hate trans people
2,hate,i hate gay people
3,hate,i hate black people
4,hate,i hate disabled people


In [10]:
for index, row in df_combine.iterrows():
    if row['text'] == '':
        df_combine.drop(index, inplace=True)

In [11]:
## saving for BERT Model which tokenizes the text on its own
df_combine.to_csv('../datasets/01_preprocessed_datasets/dataset_preprocessed_no_transformation.csv', index=False)

### Tokenization

In [12]:
# Ensure you have the necessary NLTK resource downloaded
nltk.download('punkt')

# Assuming df_combine is your DataFrame and it has a column named 'text' with normalized text
# Define a function to tokenize text
def tokenize_text(text):
    # Use NLTK's word_tokenize function to split the text into tokens
    tokens = word_tokenize(text)
    return tokens

# Apply the tokenization function to each row in the 'text' column
df_combine['tokens'] = df_combine['text'].apply(tokenize_text)

# Display the first few rows to check the tokenized text
display(df_combine.head())

[nltk_data] Downloading package punkt to /Users/jlangela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,text,tokens
0,hate,i hate woman,"[i, hate, woman]"
1,hate,i hate trans people,"[i, hate, trans, people]"
2,hate,i hate gay people,"[i, hate, gay, people]"
3,hate,i hate black people,"[i, hate, black, people]"
4,hate,i hate disabled people,"[i, hate, disabled, people]"


### Removing Stop Words

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jlangela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens):
    """Remove stop words from a list of tokens"""
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [15]:
#Applying it to tokens column
df_combine['text'] = df_combine['tokens'].apply(remove_stop_words)
df_combine['text_processed'] = df_combine['text'].apply(lambda x: ' '.join(x).replace(',', ''))

df_combine.drop(columns=['tokens', 'text'], inplace=True)
df_combine = df_combine[df_combine['text_processed'] != ""]
df_combine.rename(columns={'text_processed': 'text'}, inplace=True)

# Display the DataFrame to verify stop words are removed
display(df_combine.head())

Unnamed: 0,label,text
0,hate,hate woman
1,hate,hate trans people
2,hate,hate gay people
3,hate,hate black people
4,hate,hate disabled people


In [16]:
## saving for BERT Model which tokenizes the text on its own
df_combine.to_csv('../datasets/01_preprocessed_datasets/dataset_preprocessed_stopwords.csv', index=False)

### Vectorization 

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

training_data_raw = pd.read_csv('../datasets/00_output_datasets/dataset_combined.csv')
training_data_preprocessed = pd.read_csv('../datasets/01_preprocessed_datasets/dataset_preprocessed_stopwords.csv')

training_data_raw.head()


Unnamed: 0,label,text
0,hate,I hate women.
1,hate,I hate trans people.
2,hate,I hate gay people.
3,hate,I hate black people.
4,hate,I hate disabled people.


In [29]:
chosen_dataset = training_data_preprocessed

In [30]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(chosen_dataset['text'], 
                                                    chosen_dataset['label'], 
                                                    test_size=0.3, 
                                                    random_state=42)

# Applying TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Training a Logistic Regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_tfidf, y_train)

# Predicting the labels for the test set
y_pred = logistic_regression_model.predict(X_test_tfidf)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.6861448780834577
Classification Report:
               precision    recall  f1-score   support

        hate       0.70      0.74      0.72      7752
    not_hate       0.66      0.62      0.64      6315

    accuracy                           0.69     14067
   macro avg       0.68      0.68      0.68     14067
weighted avg       0.68      0.69      0.68     14067



In [31]:
import numpy as np

glove_dimensions = 300
glove_billion_tokens = 840

def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rsplit(' ', glove_dimensions)
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

glove_path = f'../datasets/embeddings/glove/glove.{glove_billion_tokens}B.{glove_dimensions}d.txt'
glove_embeddings = load_glove_embeddings(glove_path)


In [32]:
def vectorize_text(text, embeddings_dict, dimensions=glove_dimensions):
    words = text.split()
    vectors = [embeddings_dict.get(word, np.zeros((dimensions,))) for word in words]
    return np.mean(vectors, axis=0) if vectors else np.zeros((dimensions,))

X_train_glove = np.array([vectorize_text(text, glove_embeddings) for text in X_train])
X_test_glove = np.array([vectorize_text(text, glove_embeddings) for text in X_test])


In [33]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

word2vec_path = '../datasets/embeddings/word2vec/GoogleNews-vectors-negative300.bin'

# Load the pre-trained Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Vectorize your text data using Word2Vec embeddings
def vectorize_text_with_word2vec(text, word2vec_model):
    vectors = [word2vec_model[word] for word in text if word in word2vec_model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec_model.vector_size)

# Assuming you have your text data stored in a list called text_data
X_train_word2vec = np.array([vectorize_text_with_word2vec(text, word2vec_model) for text in X_train])
X_test_word2vec = np.array([vectorize_text_with_word2vec(text, word2vec_model) for text in X_test])

In [34]:
logistic_regression_model_glove = LogisticRegression()
logistic_regression_model_glove.fit(X_train_glove, y_train)

# Predicting and evaluating
y_pred_glove = logistic_regression_model_glove.predict(X_test_glove)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
classification_rep_glove = classification_report(y_test, y_pred_glove)

print("Accuracy with GloVe:", accuracy_glove)
print("Classification Report with GloVe:\n", classification_rep_glove)


Accuracy with GloVe: 0.672709177507642
Classification Report with GloVe:
               precision    recall  f1-score   support

        hate       0.69      0.73      0.71      7752
    not_hate       0.65      0.60      0.62      6315

    accuracy                           0.67     14067
   macro avg       0.67      0.67      0.67     14067
weighted avg       0.67      0.67      0.67     14067



In [35]:
def benchmark_models_vectorizations(models, vectorized_data_sets, y_train, y_test):
    """
    Train and evaluate multiple models on multiple vectorized data sets.

    Args:
    - models (list of tuples): A list where each tuple contains a model instance and its name as a string.
    - vectorized_data_sets (list of tuples): A list where each tuple contains vectorized training and testing data (X_train_vec, X_test_vec) along with a descriptor string.
    - y_train (array-like): Training labels.
    - y_test (array-like): Testing labels.

    Returns:
    - results (dict): A dictionary containing accuracy and classification report for each model and vectorization approach.
    """
    results = {}
    for model_name, model in models:
        for vec_name, (X_train_vec, X_test_vec) in vectorized_data_sets:
            # Train the model
            model.fit(X_train_vec, y_train)

            # Predict and evaluate the model
            y_pred = model.predict(X_test_vec)
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, output_dict=True)

            # Store the results
            key_name = f"{model_name} with {vec_name}"
            results[key_name] = {
                'Accuracy': accuracy,
                'Classification_Report': report
            }

    return results


In [36]:
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
]

vectorized_data_sets = [
    ('TF-IDF', (X_train_tfidf, X_test_tfidf)),
    ('GloVe', (X_train_glove, X_test_glove)),
    ('Word2Vec', (X_train_word2vec, X_test_word2vec))
]

results = benchmark_models_vectorizations(models, vectorized_data_sets, y_train, y_test)

for setup, result in results.items():
    print(f"{setup}:")
    print(f"Accuracy: {result['Accuracy']}")
    for label, scores in result['Classification_Report'].items():
        if isinstance(scores, dict):
            print(f"  {label} - Precision: {scores['precision']:.2f}, Recall: {scores['recall']:.2f}, F1-Score: {scores['f1-score']:.2f}")
    print()


Logistic Regression with TF-IDF:
Accuracy: 0.6861448780834577
  hate - Precision: 0.70, Recall: 0.74, F1-Score: 0.72
  not_hate - Precision: 0.66, Recall: 0.62, F1-Score: 0.64
  macro avg - Precision: 0.68, Recall: 0.68, F1-Score: 0.68
  weighted avg - Precision: 0.68, Recall: 0.69, F1-Score: 0.68

Logistic Regression with GloVe:
Accuracy: 0.672709177507642
  hate - Precision: 0.69, Recall: 0.73, F1-Score: 0.71
  not_hate - Precision: 0.65, Recall: 0.60, F1-Score: 0.62
  macro avg - Precision: 0.67, Recall: 0.67, F1-Score: 0.67
  weighted avg - Precision: 0.67, Recall: 0.67, F1-Score: 0.67

Logistic Regression with Word2Vec:
Accuracy: 0.5709817302907514
  hate - Precision: 0.58, Recall: 0.82, F1-Score: 0.68
  not_hate - Precision: 0.55, Recall: 0.26, F1-Score: 0.36
  macro avg - Precision: 0.56, Recall: 0.54, F1-Score: 0.52
  weighted avg - Precision: 0.56, Recall: 0.57, F1-Score: 0.53

