In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder as BCF
import itertools
import pickle

import os.path
from statistics import mode
from nltk.classify import ClassifierI
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vincent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
emoji_dict = {}
with open('emoji.txt', 'r', encoding='latin-1') as emoji_file:
    for line in emoji_file:
        line = line.strip()
        if line:
            emoji, value = line.split('\t')
            emoji_dict[emoji] = int(value)

In [10]:
def replace_emojis(text, emoji_dict):
    for emoji, value in emoji_dict.items():
        if value == 1:
            text = re.sub(re.escape(emoji), 'happy', text)
        elif value == -1:
            text = re.sub(re.escape(emoji), 'sad', text)
    return text

In [11]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#vincent
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
# def preprocess_text(text):
#     # Remove special characters
#     text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

#     # Convert to lowercase
#     text = text.lower()

#     # Remove stopwords
#     stop_words = set(stopwords.words('english'))
#     words = word_tokenize(text)
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     text = ' '.join(filtered_words)

#     return text

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Tokenize the words
    words = word_tokenize(text)
    
    # Apply stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    tagged = pos_tag(words)
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos)) if get_wordnet_pos(pos) else word for word, pos in tagged]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    return ' '.join(filtered_words)


# Preprocess positive text
positive_preprocessed = []
with open('positive.txt', 'r', encoding='latin-1') as pos_file:
    for line in pos_file:
        comment = line.strip()
        comment = replace_emojis(comment, emoji_dict)
        comment = preprocess_text(comment)
        positive_preprocessed.append(comment)

# Preprocess negative text
negative_preprocessed = []
with open('negative.txt', 'r', encoding='latin-1') as neg_file:
    for line in neg_file:
        comment = line.strip()
        comment = replace_emojis(comment, emoji_dict)
        comment = preprocess_text(comment)
        negative_preprocessed.append(comment)


In [12]:
all_texts = positive_preprocessed + negative_preprocessed
from nltk.probability import FreqDist

# Create frequency distribution
word_freq = FreqDist()

for text in all_texts:
    words = text.split()
    word_freq.update(words)
known_words = {word for word, freq in word_freq.items() if freq > 3}
#len(known_words)

In [13]:
import math
total_words = 0
num_comments = len(all_texts)

for comment in all_texts:
    total_words += len(comment.split())

average_words = total_words / num_comments

variance = 0
for comment in all_texts:
    num_words = len(comment.split())
    variance += (num_words - average_words) ** 2

std_deviation = math.sqrt(variance / num_comments)
M = round(average_words + std_deviation)

def comment_to_vector(comment, known_words, M):
    words = comment.split()
    vector = []
    for i in range(M):
        if i < len(words):
            word = words[i]
            if word in known_words:
                vector.append(known_words.index(word) + 1)
            else:
                vector.append(0)
        else:
            vector.append(0)
    return vector

# Convert preprocessed comments to vector representation
positive_comment_vectors = []
for comment in positive_preprocessed:
    vector = comment_to_vector(comment, list(known_words), M)
    positive_comment_vectors.append(vector)
    

negative_comment_vectors = []
for comment in negative_preprocessed:
    vector = comment_to_vector(comment, list(known_words), M)
    negative_comment_vectors.append(vector)
positive_comment_vectors

[[1338,
  3780,
  106,
  1271,
  2638,
  0,
  1649,
  2772,
  979,
  3111,
  967,
  307,
  1916,
  0,
  249,
  0],
 [1706,
  389,
  3317,
  693,
  3560,
  2883,
  2474,
  0,
  1526,
  974,
  2549,
  3704,
  0,
  1288,
  3003,
  2838],
 [1908, 0, 282, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [790, 2426, 1649, 902, 1513, 0, 4034, 175, 2277, 0, 0, 0, 0, 0, 0, 0],
 [4230, 3482, 462, 3604, 902, 3521, 0, 3823, 37, 515, 2426, 1360, 0, 0, 0, 0],
 [80,
  3366,
  967,
  2799,
  198,
  1877,
  3548,
  3111,
  3045,
  3057,
  3145,
  2461,
  0,
  0,
  0,
  0],
 [1212, 462, 505, 3028, 451, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [2110,
  101,
  4460,
  2772,
  871,
  1461,
  1498,
  1678,
  179,
  0,
  4034,
  1749,
  0,
  0,
  0,
  0],
 [4322,
  3817,
  3690,
  1998,
  0,
  3838,
  1661,
  2114,
  1707,
  4026,
  2290,
  0,
  0,
  0,
  0,
  0],
 [3715, 3746, 41, 1212, 2862, 3357, 1114, 2566, 1552, 0, 0, 0, 0, 0, 0, 0],
 [2448, 80, 2302, 4324, 4485, 185, 4474, 760, 0, 0, 0, 0, 0, 0, 0, 0],
 [3349, 3421,

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

positive_comment_vectors = np.array(positive_comment_vectors)
negative_comment_vectors = np.array(negative_comment_vectors)

all_comment_vectors = np.concatenate((positive_comment_vectors, negative_comment_vectors))

positive_labels = np.ones(len(positive_comment_vectors))
negative_labels = np.zeros(len(negative_comment_vectors))

all_labels = np.concatenate((positive_labels, negative_labels))

X_train, X_test, y_train, y_test = train_test_split(all_comment_vectors, all_labels, test_size=0.2, random_state=42)

embedding_dim = 300

# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(known_words) + 1, output_dim=embedding_dim, input_length=M))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(Dropout(0.5)) # Dropout layer for regularization
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Dropout(0.5)) # Dropout layer for regularization
model.add(Dense(units=50, activation='relu'))
model.add(Dropout(0.5)) # Dropout layer for regularization
model.add(Flatten())
model.add(Dense(units=1, activation='tanh'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Add early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.src.callbacks.History at 0x2827231c0>