In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
import itertools
import pickle
import csv
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import xgboost as xgb
import numpy as np

import os.path
from statistics import mode

In [2]:
emoji_dict = {}
with open('emoji.txt', 'r', encoding='latin-1') as emoji_file:
    for line in emoji_file:
        line = line.strip()
        if line:
            emoji, value = line.split('\t')
            emoji_dict[emoji] = int(value)

In [3]:
def replace_emojis(text, emoji_dict):
    for emoji, value in emoji_dict.items():
        if value == 1:
            text = re.sub(re.escape(emoji), 'happy', text)
        elif value == -1:
            text = re.sub(re.escape(emoji), 'sad', text)
    return text


def preprocess_text(text):
    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(filtered_words)

    return text


In [4]:
hatefulCorpus=[]
nHatefulCorpus=[]
neutralCorpus=[]
with open("firstIter.csv", "r", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        if not all(value == "" for value in row.values()):
            text = row["Comment Text"]
            newText = text.strip()
            newText = replace_emojis(newText, emoji_dict)
            newText = preprocess_text(newText)
            label = row["Label"]
            if label=='Neutral':
                neutralCorpus.append(newText)
            elif label=='Hateful':
                hatefulCorpus.append(newText)
            else:
                nHatefulCorpus.append(newText)

In [6]:
word_embedding_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [7]:
# Create labels
hateful_labels = np.ones(len(hatefulCorpus))
nHateful_labels = np.zeros(len(nHatefulCorpus))
neutral_labels = np.full(len(neutralCorpus), 2)

# Combine data and labels
data = nHatefulCorpus + hatefulCorpus + neutralCorpus
labels = np.concatenate([hateful_labels, nHateful_labels, neutral_labels])

# Convert text to word embeddings
data_embeddings = []
for text in data:
    words = word_tokenize(text)
    embeddings = []
    for word in words:
        if word in word_embedding_model:
            embeddings.append(word_embedding_model[word])
    if embeddings:
        text_embedding = np.mean(embeddings, axis=0)
        data_embeddings.append(text_embedding)

data_embeddings = np.array(data_embeddings)
#data_embeddings

In [8]:
len(data_embeddings[1])

300

In [9]:
labels = labels[:data_embeddings.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(data_embeddings, labels, test_size=0.2, random_state=42)

xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7834008097165992
