In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder as BCF
import itertools
import pickle

import os.path
from statistics import mode
from nltk.classify import ClassifierI

In [2]:
emoji_dict = {}
with open('emoji.txt', 'r', encoding='latin-1') as emoji_file:
    for line in emoji_file:
        line = line.strip()
        if line:
            emoji, value = line.split('\t')
            emoji_dict[emoji] = int(value)

In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def replace_emojis(text, emoji_dict):
    for emoji, value in emoji_dict.items():
        if value == 1:
            text = re.sub(re.escape(emoji), 'happy', text)
        elif value == -1:
            text = re.sub(re.escape(emoji), 'sad', text)
    return text


def preprocess_text(text):
    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(filtered_words)

    return text


In [4]:
positive_preprocessed = []
with open('positive.txt', 'r', encoding='latin-1') as pos_file:
    for line in pos_file:
        comment = line.strip()
        comment = replace_emojis(comment, emoji_dict)
        comment = preprocess_text(comment)
        positive_preprocessed.append(comment)

# Preprocess negative text
negative_preprocessed = []
with open('negative.txt', 'r', encoding='latin-1') as neg_file:
    for line in neg_file:
        comment = line.strip()
        comment = replace_emojis(comment, emoji_dict)
        comment = preprocess_text(comment)
        negative_preprocessed.append(comment)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import xgboost as xgb
import numpy as np

In [6]:
word_embedding_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [7]:
# Create labels
positive_labels = np.ones(len(positive_preprocessed))
negative_labels = np.zeros(len(negative_preprocessed))

# Combine data and labels
data = positive_preprocessed + negative_preprocessed
labels = np.concatenate([positive_labels, negative_labels])

# Convert text to word embeddings
data_embeddings = []
for text in data:
    words = word_tokenize(text)
    embeddings = []
    for word in words:
        if word in word_embedding_model:
            embeddings.append(word_embedding_model[word])
    if embeddings:
        text_embedding = np.mean(embeddings, axis=0)
        data_embeddings.append(text_embedding)

data_embeddings = np.array(data_embeddings)
data_embeddings

array([[ 0.04384504,  0.06782313, -0.01133135, ..., -0.0765904 ,
        -0.01084682, -0.01331438],
       [-0.04973602,  0.00181389, -0.03222084, ..., -0.13389587,
         0.00722504, -0.00415039],
       [ 0.1281128 , -0.1151123 , -0.00805664, ...,  0.05335999,
        -0.07922363,  0.31835938],
       ...,
       [ 0.03812218,  0.10181274, -0.03788071, ..., -0.11651306,
        -0.00327454,  0.02767296],
       [ 0.05984497,  0.10308838, -0.07188034, ...,  0.03323364,
        -0.01623535,  0.06933594],
       [ 0.3063151 ,  0.22135417,  0.01839193, ...,  0.05924479,
        -0.052653  ,  0.03417969]], dtype=float32)

In [8]:
len(data_embeddings[1])

300

In [9]:
labels = labels[:data_embeddings.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(data_embeddings, labels, test_size=0.2, random_state=42)

xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7428437353355233
