In [23]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
import itertools
import pickle
import csv
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import xgboost as xgb
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import os.path
from statistics import mode

In [24]:
emoji_dict = {}
with open('emoji.txt', 'r', encoding='latin-1') as emoji_file:
    for line in emoji_file:
        line = line.strip()
        if line:
            emoji, value = line.split('\t')
            emoji_dict[emoji] = int(value)

In [25]:
def replace_emojis(text, emoji_dict):
    for emoji, value in emoji_dict.items():
        if value == 1:
            text = re.sub(re.escape(emoji), 'happy', text)
        elif value == -1:
            text = re.sub(re.escape(emoji), 'sad', text)
    return text

def preprocess_text(text):
    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(filtered_words)

    return text

In [26]:
hatefulCorpus=[]
nHatefulCorpus=[]
neutralCorpus=[]
with open("firstIter.csv", "r", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        if not all(value == "" for value in row.values()):
            text = row["Comment Text"]
            newText = text.strip()
            newText = replace_emojis(newText, emoji_dict)
            newText = preprocess_text(newText)
            label = row["Label"]
            if label=='Neutral':
                neutralCorpus.append(newText)
            elif label=='Hateful':
                hatefulCorpus.append(newText)
            else:
                nHatefulCorpus.append(newText)

In [27]:
data = hatefulCorpus + nHatefulCorpus + neutralCorpus
labels = [0] * len(hatefulCorpus) + [1] * len(nHatefulCorpus) + [2] * len(neutralCorpus)
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 3: XGBoost Classifier Training
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Step 4: Evaluation
y_pred = xgb_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5566600397614314
              precision    recall  f1-score   support

           0       0.40      0.31      0.35       143
           1       0.62      0.77      0.69       306
           2       0.07      0.02      0.03        54

    accuracy                           0.56       503
   macro avg       0.36      0.36      0.35       503
weighted avg       0.50      0.56      0.52       503



In [28]:
type(X_test)

scipy.sparse.csr.csr_matrix

In [29]:
import os
import pickle
save_directory = 'savedModels'
os.makedirs(save_directory, exist_ok=True)
filename = os.path.join(save_directory, 'xgb_model.pkl')
filename2 = os.path.join(save_directory, 'xgb_tfidf.pkl')
pickle.dump(vectorizer, open(filename2, 'wb'))
pickle.dump(xgb_classifier, open(filename, 'wb'))
loaded_model = pickle.load(open(filename, 'rb'))
#new_text = ["Send all the criminal immigrants out of Europe...NOW...."]
loaded_vect = pickle.load(open(filename2, 'rb'))
new_features = loaded_vect.transform(data)
prediction = loaded_model.predict(new_features)

#count of zeros, ones, and twos
count_zeros = 0
count_ones = 0
count_twos = 0


for pred in prediction.flat:
    if pred == 0:
        count_zeros = count_zeros + 1
    elif pred == 1:
        count_ones = count_ones + 1
    elif pred == 2:
        count_twos = count_twos + 1
print("Number of Hateful comments: %d" % count_zeros)
print("Number of Non-Hateful comments: %d" % count_ones)
print("Number of Neutral comments: %d" % count_twos)

print(prediction.size)
print(count_zeros + count_ones + count_twos)

print("**************************************")
class_labels = ['Hateful', 'Non-Hateful', 'Neutral']
prediction_summary = {label: 0 for label in class_labels}

values, counts = np.unique(prediction, return_counts=True)

for val, cnt in np.nditer([values,counts]):
    label = class_labels[val]
    prediction_summary[label] = cnt
    
print(prediction_summary['Hateful'])
print(prediction_summary['Non-Hateful'])
print(prediction_summary['Neutral'])

#label_mapping = {0: 'hateful', 1: 'non-hateful', 2: 'neutral'}
#predicted_label = label_mapping[prediction[0]]
#print("Predicted label:", predicted_label)

Number of Hateful comments: 551
Number of Non-Hateful comments: 1821
Number of Neutral comments: 141
2513
2513
**************************************
551
1821
141


In [None]:
#word_embedding_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
arr = np.asarray

In [None]:
#Check later
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
corpus = neutralCorpus + hatefulCorpus + nHatefulCorpus
labels = ['Neutral'] * len(neutralCorpus) + ['Hateful'] * len(hatefulCorpus) + ['Non Hateful'] * len(nHatefulCorpus)

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(corpus, encoded_labels, test_size=0.2, random_state=42)

# Step 2: Extract features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 3: Define the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))

# Step 4: Train the classifier
xgb_classifier.fit(X_train_tfidf, y_train)

# Step 5: Make predictions on the test data
y_pred = xgb_classifier.predict(X_test_tfidf)

# Step 6: Decode the predicted labels
predicted_labels = label_encoder.inverse_transform(y_pred)

# Step 7: Evaluate the accuracy
accuracy = accuracy_score(label_encoder.inverse_transform(y_test), predicted_labels)
print("Accuracy:", accuracy)

In [None]:
# Create labels
hateful_labels = np.ones(len(hatefulCorpus))
nHateful_labels = np.zeros(len(nHatefulCorpus))
neutral_labels = np.full(len(neutralCorpus), 2)

# Combine data and labels
data = nHatefulCorpus + hatefulCorpus + neutralCorpus
labels = np.concatenate([hateful_labels, nHateful_labels, neutral_labels])

# Convert text to word embeddings
data_embeddings = []
for text in data:
    words = word_tokenize(text)
    embeddings = []
    for word in words:
        if word in word_embedding_model:
            embeddings.append(word_embedding_model[word])
    if embeddings:
        text_embedding = np.mean(embeddings, axis=0)
        data_embeddings.append(text_embedding)

data_embeddings = np.array(data_embeddings)
#data_embeddings

In [None]:
labels = labels[:data_embeddings.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(data_embeddings, labels, test_size=0.2, random_state=42)

params = {
    'objective': 'multi:softmax',
    'num_class': 3,
}

xgb_classifier = xgb.XGBClassifier(**params)
xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
import numpy as np
a = np.arange(15).reshape(3,5)
print(a)
print(a.ndim)
print(a.dtype.name)
print(a.itemsize)
print(a.size)
print(type(a))