<a href="https://colab.research.google.com/github/epyyny/NLP_project_group_6/blob/main/NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#remember to install spacy and en_core_web_sm

In [1]:
#get the dataset with pandas
import pandas as pd

url = "https://raw.githubusercontent.com/abishekarun/Text-Emotion-Classification/master/text_emotion.csv"
df = pd.read_csv(url)
print(df.head())

     tweet_id   sentiment       author  \
0  1956967341       empty   xoshayzers   
1  1956967666     sadness    wannamama   
2  1956967696     sadness    coolfunky   
3  1956967789  enthusiasm  czareaquino   
4  1956968416     neutral    xkilljoyx   

                                             content  
0  @tiffanylue i know  i was listenin to bad habi...  
1  Layin n bed with a headache  ughhhh...waitin o...  
2                Funeral ceremony...gloomy friday...  
3               wants to hang out with friends SOON!  
4  @dannycastillo We want to trade with someone w...  


In [2]:


#split the data into DataFrames based on sentiment categories

#at this point we can remove the rows with "empty" tag depending on our desired resuls
df = df[df["sentiment"] != "empty"]

#get all sentiment categories
categories = df["sentiment"].unique()
#create a dictionary
category_df = {}
for category in categories:

    #filter using current category
    filtered_df = df[df['sentiment'] == category]

    #reset the index of the filtered DataFrame
    filtered_df = filtered_df.reset_index(drop=True)

    #add the filtered DataFrame to the dictionary
    category_df[category] = filtered_df


for category, data in category_df.items():
    print(f"Category: {category}")
    print(data.head())
    print("\n")

Category: sadness
     tweet_id sentiment       author  \
0  1956967666   sadness    wannamama   
1  1956967696   sadness    coolfunky   
2  1956968487   sadness     ShansBee   
3  1956969035   sadness  nic0lepaula   
4  1956969172   sadness   Ingenue_Em   

                                             content  
0  Layin n bed with a headache  ughhhh...waitin o...  
1                Funeral ceremony...gloomy friday...  
2  I should be sleep, but im not! thinking about ...  
3            @charviray Charlene my love. I miss you  
4         @kelcouch I'm sorry  at least it's Friday?  


Category: enthusiasm
     tweet_id   sentiment        author  \
0  1956967789  enthusiasm   czareaquino   
1  1956981427  enthusiasm       Caillie   
2  1957066701  enthusiasm     FinIsKing   
3  1957067779  enthusiasm  Maureen12683   
4  1957073668  enthusiasm       kort030   

                                             content  
0               wants to hang out with friends SOON!  
1  bed...sorta. tod

In [3]:
#we now have the dataframe category_df

#task 1:
#suggest a script that constructs vocabulary set for each dataframe
#begin by making functions needed in the script

import nltk
from nltk.tokenize import word_tokenize
import re

#tokenize function
def tokenize(text):
    return word_tokenize(text.lower())

#modern english personal pronouns list
pronouns = ["i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her", "his", "hers", "its", "theirs", "our", "your"]
def pronoun_counter(tokens):
    count=0
    for token in tokens:
        if token in pronouns:
            count+=1

    return count

#finds all characters that are not upper or lowercase letters found in english alphabet
def uncommon_char_counter(text):
    return len(re.findall(r'[^a-zA-Z\s]', text))

#function to count repetitions in text
#note: I was not sure what "average number of repetitions per post" meant.
# few ideas:
# 1) repeated words, such as "no no no"
# 2) letters repeating consecutively more than twice, such as "nooooo"
# after inspecting the file it seems that option 2 has a lot of occurances, so I went with this.

def repetition_counter(text):
     return len(re.findall(r'(.)\1{2,}', text))





In [4]:
#task 1
#script that constructs vocabulary set for each dataframe.
import numpy as np

def vocab_set(dataframe):

    #store vocabulary, using set instead of list to avoid duplicates
    vocabulary = set()

    #lengths of posts for calculating ii)
    post_lengths = []

    #pronouns for calculating iii)
    pronoun_counts = []

    #uncommon characters for calculating iv)
    uncommon_counts = []

    #repetition counts for calculating v)
    repetition_counts = []

    #iterate over the posts in the current dataframe
    for content in dataframe["content"]:

        #add content tokens into vocabulary
        tokens = tokenize(content)
        vocabulary.update(tokens)

        #add content length for ii)
        post_lengths.append(len(tokens))

        #count the pronouns found in tokens and add to list for iii)
        pronoun_counts.append(pronoun_counter(tokens))

        #count uncommon characters found in content for iv)
        uncommon_counts.append(uncommon_char_counter(content))

        #count repetitions found in content for v)
        repetition_counts.append(repetition_counter(content))


    stats = {
        'Vocabulary Size': len(vocabulary),
        'Min Length': np.min(post_lengths),
        'Max Length': np.max(post_lengths),
        'Avg Length': np.mean(post_lengths),
        'Std Length': np.std(post_lengths),
        'Avg Pronouns': np.mean(pronoun_counts),
        'Avg Uncommon Chars': np.mean(uncommon_counts),
        'Avg Repetitions': np.mean(repetition_counts),
        'vocabulary': vocabulary
    }

    return stats


In [6]:
#make the table for task 1 and also store the vocabularies in dictionary for task 2
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

summary = []
vocabularies = {}

for category, c_df in category_df.items():
    stats = vocab_set(c_df)
    stats['Category'] = category
    vocabulary = stats.get('vocabulary', set())
    vocabularies[category] = vocabulary
    summary.append(stats)

summary_df = pd.DataFrame(summary)
summary_df.set_index('Category', inplace=True)
print(summary_df)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


            Vocabulary Size  Min Length  Max Length  Avg Length  Std Length  \
Category                                                                      
sadness               10916           1          46   17.022846    8.507309   
enthusiasm             3002           2          38   16.774704    8.314667   
neutral               17430           1          91   14.135448    8.567085   
worry                 16115           1          49   17.320723    8.432810   
surprise               6657           2          57   17.229538    8.459343   
love                   8842           1          74   16.764966    8.531608   
fun                    6098           2          45   18.146396    8.155372   
hate                   4629           2          54   17.725624    8.624764   
happiness             11925           1          82   16.876752    8.421454   
boredom                1041           3          39   16.569832    8.345165   
relief                 4735           1          41 

In [7]:
#task 2
#https://www.nltk.org/book/ch10-extras.html
#matrix will be 13x13 or 12x12 depending on if we include the "empty" rows or not
categories = list(vocabularies.keys())
matrix = [[0 for _ in range(len(categories))] for _ in range(len(categories))]

for i, category1 in enumerate(categories):
    for j, category2 in enumerate(categories):

        vocab1 = vocabularies.get(category1, set())
        vocab2 = vocabularies.get(category2, set())
        #intersection -> get wordds  that are in vocab1 AND 2, see nltk link
        common_vocab = vocab1.intersection(vocab2)
        total_size = len(vocab1) + len(vocab2)

        proportion_common = len(common_vocab) / total_size
        #store into matrix
        matrix[i][j] = proportion_common

vocab_overlap_df = pd.DataFrame(matrix, index=categories, columns=categories)
print(vocab_overlap_df)



             sadness  enthusiasm   neutral     worry  surprise      love  \
sadness     0.500000    0.122647  0.164362  0.179868  0.169806  0.169805   
enthusiasm  0.122647    0.500000  0.092991  0.098289  0.149291  0.135427   
neutral     0.164362    0.092991  0.500000  0.175019  0.141321  0.152253   
worry       0.179868    0.098289  0.175019  0.500000  0.151019  0.158473   
surprise    0.169806    0.149291  0.141321  0.151019  0.500000  0.169301   
love        0.169805    0.135427  0.152253  0.158473  0.169301  0.500000   
fun         0.161220    0.152527  0.134861  0.141269  0.171854  0.167604   
hate        0.155934    0.164723  0.118727  0.131122  0.169059  0.150249   
happiness   0.172760    0.116835  0.163073  0.171148  0.160424  0.171233   
boredom     0.065066    0.140242  0.043203  0.047972  0.089634  0.072346   
relief      0.157242    0.170609  0.125152  0.132758  0.174070  0.163144   
anger       0.050952    0.113595  0.034282  0.037243  0.071869  0.056621   

           

In [8]:
#task 3, same as task 2 but for top 30 tokens instead of vocabularies
from collections import Counter
#function to get top 30 tokens
def top_tokens(dataframe):
    token_counter = Counter()
    for content in dataframe["content"]:
        tokens = word_tokenize(content.lower())
        token_counter.update(tokens)

    top_30 = token_counter.most_common(30)
    most_common_tokens = [item[0] for item in top_30]

    return most_common_tokens

#store top 30 tokens for each category
top_30_tokens = {}
for category, c_df in category_df.items():
    top_30_tokens[category] = top_tokens(c_df)

#same as in task 2 but modified for task 3
categories = list(top_30_tokens.keys())
matrix_top30 = [[0 for _ in range(len(categories))] for _ in range(len(categories))]

for i, category1 in enumerate(categories):
    for j, category2 in enumerate(categories):

        tokens1 = top_30_tokens[category1]
        tokens2 = top_30_tokens[category2]

        common_tokens = list(set(tokens1) & set(tokens2))
        total_size = 60

        proportion_common_tokens = len(common_tokens) / total_size
        matrix_top30[i][j] = proportion_common_tokens

top_30_tokens_overlap_df = pd.DataFrame(matrix_top30, index=categories, columns=categories)
print(top_30_tokens_overlap_df)

             sadness  enthusiasm   neutral     worry  surprise      love  \
sadness     0.500000    0.450000  0.433333  0.466667  0.450000  0.383333   
enthusiasm  0.450000    0.500000  0.466667  0.450000  0.450000  0.400000   
neutral     0.433333    0.466667  0.500000  0.433333  0.466667  0.400000   
worry       0.466667    0.450000  0.433333  0.500000  0.416667  0.366667   
surprise    0.450000    0.450000  0.466667  0.416667  0.500000  0.416667   
love        0.383333    0.400000  0.400000  0.366667  0.416667  0.500000   
fun         0.433333    0.450000  0.450000  0.416667  0.466667  0.433333   
hate        0.450000    0.416667  0.433333  0.433333  0.450000  0.383333   
happiness   0.433333    0.416667  0.416667  0.400000  0.450000  0.433333   
boredom     0.400000    0.366667  0.383333  0.400000  0.383333  0.316667   
relief      0.416667    0.416667  0.433333  0.400000  0.433333  0.416667   
anger       0.433333    0.400000  0.400000  0.400000  0.433333  0.400000   

           

In [9]:
# task 4
#this section preprocesses content text. Then we use the processed text in the wnaffect script to get the emotion tags

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

sample = "Wondering why I'm awake at 7am,writing a new song,plotting my evil secret plots muahahaha...oh damn it,not secret anymore"
lemmatizer = WordNetLemmatizer()

#get wordnet POS tags so that lemmatization works correctly
#https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

#preprocess content, tokenize and lemmatize -> return lemmas, ALSO ADD STOPWORD REMOVAL!!!!
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(token.lower(), get_wordnet_pos(pos)) for token, pos in pos_tags]
    return lemmas

print(preprocess_text(sample))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['wonder', 'why', 'i', "'m", 'awake', 'at', '7am', ',', 'write', 'a', 'new', 'song', ',', 'plot', 'my', 'evil', 'secret', 'plot', 'muahahaha', '...', 'oh', 'damn', 'it', ',', 'not', 'secret', 'anymore']


In [None]:
#task 4
#script for the WNAffect

from wnaffect import WNAffect
from emotion import Emotion # if needed
import nltk


nltk.download('wordnet')

wna = WNAffect('wordnet-1.6/', 'wn-domains-3.2/')

df['processed_content'] = df['content'].apply(preprocess_text)
#tokens = (preprocess_text(sample))
for index, row in df.iterrows():
    detected_emotions = []
    tokens = row['processed_content']
    for token in tokens:
        emo=wna.get_emotion(token, 'JJ')
        if emo!= None:
            detected_emotions.append(str(emo))

    df.at[index, 'emotions_detected'] = ", ".join(set(detected_emotions))

df.to_csv('processed_emotions_data.csv', index=False)

print("Processed emotions added to the DataFrame and saved to 'processed_emotions_data.csv'.")


#print(' -> '.join([emo.get_level(i).name for i in range(emo.level + 1)]))
#Emotion.printTree(Emotion.emotions["gloom"])


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed emotions added to the DataFrame and saved to 'processed_emotions_data.csv'.


In [None]:
# task 4
#get 5 most dominant emotions

from collections import Counter

#counter for the emotions
emotion_counter = Counter()

#emotions_detected column has the WNAffect emotions
for emotions in df['emotions_detected']:
    #store emotions into a list, account for commas if many emotions per cell, also ignore empty cells
    if pd.notna(emotions) and emotions.strip():
        emotion_list = [e.strip() for e in emotions.split(",")]
        emotion_counter.update(emotion_list)

#total number of emotions
total_emotions = sum(emotion_counter.values())

#proportion of each emotiom
emotion_proportions = {emotion: count / total_emotions for emotion, count in emotion_counter.items()}

top_five_emotions = Counter(emotion_proportions).most_common(5)


#print results
print("Top 5 Dominant Emotions and Their Proportions:")
for emotion, proportion in top_five_emotions:
    print(f"{emotion}: {proportion:.2%}")

Emotion.printTree(Emotion.emotions["benevolence"])



Top 5 Dominant Emotions and Their Proportions:
benevolence: 6.01%
happiness: 3.53%
lost-sorrow: 3.40%
placidity: 2.43%
eagerness: 2.08%
 benevolence┐
            └beneficence


In [None]:
#task 4
#train word2vec model using wiki-news-300d-1M.vec
from gensim.models import KeyedVectors

print("training model...")

model = KeyedVectors.load_word2vec_format(r'C:\Users\Hp\Documents\NLP_PROJECT-20241029T133214Z-002\NLP_PROJECT\wiki-news-300d-1M-001.vec', binary=False)

training model...


In [None]:
#task 4
#weighted average word2vec vector

def weighted_average_word2vec(emotions):
    #word2vec(hate) x 0.15 + word2vec(neutral)x 0.1 + word2vec(anger) x 0.05 + word2vec(happiness) x 0.06 + word2vec(surprise) x 0.04
    weighted_vector = 0
    for emotion, proportion in emotions:
        if emotion in model:
            weighted_vector += model[emotion] * proportion

    return weighted_vector

weighted_avg_vector = weighted_average_word2vec(top_five_emotions)
print(weighted_avg_vector)


[ 0.00453304 -0.01703335 -0.01173914 -0.0160204  -0.0014179  -0.010364
  0.00784905 -0.00451288  0.0134875  -0.00104138 -0.01049903  0.0092294
  0.00312414 -0.01275068  0.01795084 -0.00817655  0.00121392 -0.01076694
  0.01051664  0.00709103 -0.02236621  0.00361313 -0.01501228  0.01847637
 -0.0057675   0.00245502  0.00797408  0.00245166  0.00659872 -0.00128891
  0.01441482 -0.00157454 -0.0126182   0.00366171 -0.00367447 -0.0001765
  0.0098573   0.01483398 -0.00198814  0.01188629  0.00811419  0.02243211
 -0.01191161  0.01007307  0.00646167 -0.00811987  0.00215277 -0.01609861
 -0.02022503 -0.00686211  0.00557647  0.00539746 -0.10291036  0.00284963
 -0.00202729 -0.0038608  -0.00522976  0.01141731 -0.00397424 -0.01166905
 -0.01178165 -0.00161453 -0.01796527 -0.0070989   0.01379756 -0.0325169
  0.00383367 -0.00822778 -0.00388069  0.01970567 -0.00836656 -0.00049672
  0.00704074  0.00590158 -0.01163462  0.00222528  0.01078313  0.00956004
  0.00537248  0.03480566  0.00548286  0.02191873 -0.0058

In [None]:
#task 4
#cosine similarity
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt

key_emotions = ["sadness","enthusiasm", "worry", "surprise", "fun", "hate", "love", "happiness", "relief", "boredom", "anger", "neutral"]
cos_similarity_matrix = np.zeros((len(key_emotions) + 1, len(key_emotions) + 1))

#cosine similarity calculation
for i, emotion in enumerate(key_emotions):
    #first calculate cosine similarity between key_emotions
    for j, key in enumerate(key_emotions):
        if emotion in model and key in model:
            cos_similarity_matrix[i, j] = 1 - cosine(model[emotion], model[key])

    if emotion in model:
        #similaritty with itself is always 1
        cos_similarity_matrix[i, i] = 1.0

        #key_emotion similarity with weighted average vector, use scipy cosine distance, see link on top for example
        cos_similarity_matrix[i, -1] = 1 - cosine(weighted_avg_vector, model[emotion])
        #cosine sim matrix is also symmetrical https://fr.mathworks.com/help/textanalytics/ref/cosinesimilarity.html#
        #fills in the last row
        cos_similarity_matrix[-1, i] = cos_similarity_matrix[i, -1]

cos_similarity_matrix[-1, -1] = 1.0

#print matrix
similarity_df = pd.DataFrame(cos_similarity_matrix, index=key_emotions+ ['weighted_average'], columns=key_emotions+ ['weighted_average'])
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
print("Cosine Similarity Matrix:")
print(similarity_df)

Cosine Similarity Matrix:
                  sadness  enthusiasm  worry  surprise   fun  hate  love  happiness  relief  boredom  anger  neutral  weighted_average
sadness             1.000       0.526  0.409     0.516 0.410 0.473 0.538      0.650   0.453    0.551  0.684    0.258             0.621
enthusiasm          0.526       1.000  0.368     0.494 0.474 0.384 0.512      0.450   0.339    0.470  0.539    0.275             0.617
worry               0.409       0.368  1.000     0.463 0.413 0.451 0.387      0.306   0.284    0.334  0.436    0.233             0.385
surprise            0.516       0.494  0.463     1.000 0.457 0.414 0.425      0.414   0.454    0.372  0.497    0.280             0.455
fun                 0.410       0.474  0.413     0.457 1.000 0.469 0.521      0.398   0.278    0.470  0.364    0.348             0.402
hate                0.473       0.384  0.451     0.414 0.469 1.000 0.713      0.356   0.294    0.390  0.570    0.315             0.429
love                0.538    

In [None]:
!cp ~/Downloads/WNAffect/* .

'cp' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
!cp -r ~/Downloads/WNStuff/wordnet-1.6 .


'cp' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
!cp -r ~/Downloads/WNStuff/wn-domains-3.2 .


'cp' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# Task 5
# install: pip install NRCLex !
from nrclex import NRCLex

# Processing each post and identify emotions
df['nrc_emotions_detected'] = df['content'].apply(lambda text: [emotion for emotion, score in NRCLex(text).affect_frequencies.items() if score > 0])

emotion_counter = Counter()
for emotions in df['nrc_emotions_detected']:
    emotion_counter.update(emotions)

# Total num of emotions
total_emotions_nrc = sum(emotion_counter.values())

# Proportion of each emotion
emotion_proportions_nrc = {emotion: count / total_emotions for emotion, count in emotion_counter.items()}

# Get the top 5, the top result is the empty ones so we need to get top 6
top_five_emotions_nrc = Counter(emotion_proportions_nrc).most_common(6)
# Delete the empty emotion
del top_five_emotions_nrc[0]

# Display the top 5 emotions
print("\nTop 5 Dominant Emotions from NRC Lexicon and Their Proportions:")
for emotion, proportion in top_five_emotions_nrc:
    print(f"{emotion}: {proportion:.2%}")
Emotion.printTree(Emotion.emotions["benevolence"])

# - Model trained in task 4 -

# Weighted average word2vec vector for NRC
def weighted_average_word2vec_NRC(emotions):
    weighted_vector = np.zeros(model.vector_size)
    for emotion, proportion in emotions:
        if emotion in model:
            weighted_vector += model[emotion] * proportion
    return weighted_vector

weighted_avg_vector_nrc = weighted_average_word2vec_NRC(top_five_emotions)
print("\nWeighted average vector (NRC):", weighted_avg_vector_nrc)


# - Key emotions and cosine similarity matrix defined in task 4 -

# We can calculate the cosine similarities using the same code that in task 4
for i, emotion in enumerate(key_emotions):
    for j, key in enumerate(key_emotions):
        if emotion in model and key in model:
            cos_similarity_matrix[i, j] = 1 - cosine(model[emotion], model[key])

    if emotion in model:
        cos_similarity_matrix[i, i] = 1.0
        cos_similarity_matrix[i, -1] = 1 - cosine(weighted_avg_vector_nrc, model[emotion])
        cos_similarity_matrix[-1, i] = cos_similarity_matrix[i, -1]

cos_similarity_matrix[-1, -1] = 1.0

# Displaying the matrix
similarity_df = pd.DataFrame(cos_similarity_matrix, index=key_emotions + ['weighted_average'], columns=key_emotions + ['weighted_average'])
pd.set_option('display.float_format', '{:.3f}'.format)
print("\nCosine Similarity Matrix (NRC):")
print(similarity_df)


Top 5 Dominant Emotions from NRC Lexicon and Their Proportions:
benevolence: 6.01%
happiness: 3.53%
lost-sorrow: 3.40%
placidity: 2.43%
eagerness: 2.08%
 benevolence┐
            └beneficence

Weighted average vector (NRC): [ 0.00453304 -0.01703335 -0.01173914 -0.0160204  -0.0014179  -0.010364
  0.00784905 -0.00451288  0.0134875  -0.00104138 -0.01049903  0.0092294
  0.00312414 -0.01275068  0.01795084 -0.00817655  0.00121392 -0.01076694
  0.01051664  0.00709103 -0.02236621  0.00361313 -0.01501228  0.01847637
 -0.0057675   0.00245502  0.00797408  0.00245166  0.00659872 -0.00128891
  0.01441482 -0.00157454 -0.0126182   0.00366171 -0.00367447 -0.0001765
  0.0098573   0.01483398 -0.00198814  0.01188629  0.00811419  0.02243211
 -0.01191161  0.01007307  0.00646167 -0.00811987  0.00215277 -0.01609861
 -0.02022503 -0.00686211  0.00557647  0.00539746 -0.10291036  0.00284963
 -0.00202729 -0.0038608  -0.00522976  0.01141731 -0.00397424 -0.01166905
 -0.01178165 -0.00161453 -0.01796527 -0.0070989  

In [None]:
# Task 6
from gensim.models import KeyedVectors, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

# Pairs for Circumplex model testing
circumplex_pairs = {
    'close_pairs': [("sadness", "boredom"), ("hate", "anger"), ("fun", "love"), ("love", "happiness"), ("fun", "happiness")],
    'distant_pairs': [("sadness", "fun"), ("anger", "happiness"), ("boredom", "love")]  # Testing against unrelated pairs
}

# Tagging data for doc2vec training
tagged_data = []
for category, c_df in category_df.items():
    for i, row in c_df.iterrows():
        words = preprocess_text(row["content"]) # Using function from task 4
        tagged_data.append(TaggedDocument(words=words, tags=[category]))

# Training the model
print("training model...")
doc2vec_model = Doc2Vec(vector_size=300, min_count=1, epochs=30)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Getting vector of an emotion
def get_emotion_vector(emotion):
    try:
        return doc2vec_model.dv[emotion]
    except KeyError:
        print(f"Emotion '{emotion}' not found in Doc2Vec model.")
        return np.zeros(doc2vec_model.vector_size)

# Calculating the cosine similarity
def cosine_similarity_between_emotions(pair):
    vec1, vec2 = get_emotion_vector(pair[0]), get_emotion_vector(pair[1])
    return cosine_similarity([vec1], [vec2])[0, 0]

# Table for results
results = {"Emotion Pair": [], "Cosine Similarity": [], "Relation Type": []}

for relation_type, pairs in circumplex_pairs.items():
    for pair in pairs:
        similarity = cosine_similarity_between_emotions(pair)
        results["Emotion Pair"].append(f"{pair[0]} - {pair[1]}")
        results["Cosine Similarity"].append(similarity)
        results["Relation Type"].append("Close" if relation_type == "close_pairs" else "Distant")

# Displaying results in a dataframe
circumplex_df = pd.DataFrame(results)
circumplex_df.sort_values(by="Cosine Similarity", ascending=False, inplace=True)
print("\nCircumplex Model Closeness Test Results:")
print(circumplex_df)
# fun-love pair is the only close pair that has lower similarity than few distant pairs, other close pairs the hypothesis is correct

training model...

Circumplex Model Closeness Test Results:
        Emotion Pair  Cosine Similarity Relation Type
1       hate - anger              0.620         Close
0  sadness - boredom              0.452         Close
4    fun - happiness              0.323         Close
3   love - happiness              0.298         Close
6  anger - happiness              0.295       Distant
7     boredom - love              0.174       Distant
2         fun - love              0.156         Close
5      sadness - fun              0.137       Distant


In [None]:
# Task 7
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Prepare data
X = df['content']
y = df['sentiment']

# Splitting the training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function for evaluating the models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    #print(classification_report(y_test, y_pred))
    return precision, recall, f1, conf_matrix

# Tf-idf without stopword removal
tfidf_no_stop = TfidfVectorizer(max_features=None)
X_train_tfidf_no_stop = tfidf_no_stop.fit_transform(X_train)
X_test_tfidf_no_stop = tfidf_no_stop.transform(X_test)


# Machine learning models that are used
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': MultinomialNB()
}

# Evaluate each model without stopwords
results_no_stop = {}
for name, model in models.items():
    #print(f"Results for {name} without stopwords:")
    precision, recall, f1, conf_matrix = evaluate_model(model, X_train_tfidf_no_stop, X_test_tfidf_no_stop, y_train, y_test)
    results_no_stop[name] = {'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'Confusion Matrix': conf_matrix}

# Displaying results
print("\nSummary of Task 7 Results:")
for model_name, metrics in results_no_stop.items():
    print(f"\n{model_name}")
    print(f"Precision: {metrics['Precision']:.2f}")
    print(f"Recall: {metrics['Recall']:.2f}")
    print(f"F1 Score: {metrics['F1 Score']:.2f}")
    print("Confusion Matrix:\n", metrics['Confusion Matrix'])


Summary of Task 7 Results:

Logistic Regression
Precision: 0.34
Recall: 0.35
F1 Score: 0.32
Confusion Matrix:
 [[  0   0   0   0   0   1   0   0   6   0   1   0  11]
 [  0   0   0   0   0   3   0   1   7   0   8   0  12]
 [  0   1   1   0   1  15   1   4  79   0   9   0  51]
 [  0   0   0   0   0  30   1   2  73   1  13   3  40]
 [  0   0   0   0   5  86   0  26 121   1  19   3  77]
 [  0   0   0   0  15 396   0  99 302   4  38   7 167]
 [  0   0   0   0   0   5  34   3  73   0  43   1 109]
 [  0   0   0   0   4 167   1 296 141   1  34   4 114]
 [  0   0   2   0   5 156   6  64 900   6 109  10 482]
 [  0   0   0   0   0  76   0  23 112  10  26   3 102]
 [  0   0   0   0   3  42  13  27 199   0 277   2 483]
 [  0   0   0   0   1  67   2  26 127   1  35  22 144]
 [  0   1   0   0   8 109  13  32 410   2 223   9 859]]

Decision Tree
Precision: 0.23
Recall: 0.24
F1 Score: 0.23
Confusion Matrix:
 [[  0   0   0   1   1   2   0   1   5   0   1   1   7]
 [  0   0   0   0   0   5   0   0  10  

In [None]:
# Task 8

# Different feature thresholds
feature_thresholds = [1000, 500, 100]

# Stopwords removal and evaluation at different thresholds
results_with_stop = {}
for threshold in feature_thresholds:
    #print(f"\nEvaluating models with stopword removal and max_features={threshold}\n")
    tfidf_with_stop = TfidfVectorizer(stop_words='english', max_features=threshold)
    X_train_tfidf_with_stop = tfidf_with_stop.fit_transform(X_train)
    X_test_tfidf_with_stop = tfidf_with_stop.transform(X_test)

    threshold_results = {}
    for name, model in models.items():
        #print(f"Results for {name} with stopwords removed and max_features={threshold}:")
        precision, recall, f1, conf_matrix = evaluate_model(model, X_train_tfidf_with_stop, X_test_tfidf_with_stop, y_train, y_test) # function defined in task 7
        threshold_results[name] = {'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'Confusion Matrix': conf_matrix}

    results_with_stop[threshold] = threshold_results

# Displaying results
print("\nSummary of Task 8 Results:")
for threshold, models_results in results_with_stop.items():
    print(f"\nThreshold: {threshold} features")
    for model_name, metrics in models_results.items():
        print(f"\n{model_name}")
        print(f"Precision: {metrics['Precision']:.2f}")
        print(f"Recall: {metrics['Recall']:.2f}")
        print(f"F1 Score: {metrics['F1 Score']:.2f}")
        print("Confusion Matrix:\n", metrics['Confusion Matrix'])


Summary of Task 8 Results:

Threshold: 1000 features

Logistic Regression
Precision: 0.34
Recall: 0.35
F1 Score: 0.31
Confusion Matrix:
 [[  0   0   0   0   0   1   0   0   9   0   1   0   8]
 [  0   0   0   0   0   1   0   1   8   0   7   0  14]
 [  0   0   3   0   1  11   2   6  86   0  14   1  38]
 [  0   0   0   0   0  26   2   3  78   1   8   1  44]
 [  0   0   0   0   7  91   2  26 122   1  12   7  70]
 [  0   0   0   0  21 376   2 102 318   4  30  11 164]
 [  0   0   0   0   1   8  49   2  77   0  38   1  92]
 [  0   0   0   0   6 178   2 294 152   3  33   3  91]
 [  0   0   2   0   7 150   7  56 992   6  90  10 420]
 [  0   0   0   0   2  80   0  26 135  11  18   2  78]
 [  0   0   0   0   5  51  19  33 273   2 247   6 410]
 [  0   0   0   0   1  79   3  21 150   2  25  16 128]
 [  0   0   0   0   6 110  17  46 519   2 184   7 775]]

Decision Tree
Precision: 0.24
Recall: 0.26
F1 Score: 0.25
Confusion Matrix:
 [[  0   0   0   0   1   2   0   0  10   0   2   0   4]
 [  0   0   1

In [None]:
#Task 9: Task 8 with Bigrams

# Different feature thresholds
feature_thresholds = [1000, 500, 100]

# Stopwords removal and evaluation at different thresholds using bigrams
results_with_stop_bigrams = {}
for threshold in feature_thresholds:
    #print(f"\nEvaluating models with stopword removal, bigrams, and max_features={threshold}\n")
    tfidf_with_stop_bigrams = TfidfVectorizer(stop_words='english', max_features=threshold, ngram_range=(2, 2))
    X_train_tfidf_with_stop_bigrams = tfidf_with_stop_bigrams.fit_transform(X_train)
    X_test_tfidf_with_stop_bigrams = tfidf_with_stop_bigrams.transform(X_test)

    threshold_results_bigrams = {}
    for name, model in models.items():
        #print(f"Results for {name} with stopwords removed, bigrams, and max_features={threshold}:")
        precision, recall, f1, conf_matrix = evaluate_model(model, X_train_tfidf_with_stop_bigrams, X_test_tfidf_with_stop_bigrams, y_train, y_test) # function defined in task 7
        threshold_results_bigrams[name] = {'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'Confusion Matrix': conf_matrix}

    results_with_stop_bigrams[threshold] = threshold_results_bigrams

# Displaying results
print("\nSummary of Task 9 Results with Bigrams:")
for threshold, models_results in results_with_stop_bigrams.items():
    print(f"\nThreshold: {threshold} features (Bigrams)")
    for model_name, metrics in models_results.items():
        print(f"\n{model_name}")
        print(f"Precision: {metrics['Precision']:.2f}")
        print(f"Recall: {metrics['Recall']:.2f}")
        print(f"F1 Score: {metrics['F1 Score']:.2f}")
        print("Confusion Matrix:\n", metrics['Confusion Matrix'])



Summary of Task 9 Results with Bigrams:

Threshold: 1000 features (Bigrams)

Logistic Regression
Precision: 0.29
Recall: 0.26
F1 Score: 0.19
Confusion Matrix:
 [[   0    0    0    0    0    0    0    0   16    0    0    0    3]
 [   0    0    0    0    0    0    0    0   27    0    0    0    4]
 [   0    0    0    0    0    8    0    4  136    0    3    1   10]
 [   0    0    0    0    1   16    0    4  126    0    4    0   12]
 [   0    0    0    0    0   36    0    7  249    0   13    0   33]
 [   0    0    0    0    3  160    0   68  699    2   16    0   80]
 [   0    0    0    0    0    5    2    2  205    0   16    1   37]
 [   0    0    0    0    0   77    0  125  495    1    5    1   58]
 [   0    0    0    0    0   65    1   32 1477    0   24    0  141]
 [   0    0    0    0    0   29    0   11  251    9    9    0   43]
 [   0    0    0    0    0   31    0   12  777    3   46    1  176]
 [   0    0    0    0    0   22    0   12  342    0    8    2   39]
 [   0    0    0    0  

In [None]:
#Task 10
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

class EmotionCNNClassifier:
    def __init__(self, max_words=10000, max_len=100, embedding_dim=100):
        self.max_words = max_words
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.tokenizer = None
        self.label_encoder = None
        self.model = None

    def preprocess_data(self, X_train, X_test, y_train, y_test):
        """Preprocess text data and labels for CNN."""
        # Convert to numpy arrays if needed
        if hasattr(X_train, 'values'):
            X_train = X_train.values
        if hasattr(X_test, 'values'):
            X_test = X_test.values
        if hasattr(y_train, 'values'):
            y_train = y_train.values
        if hasattr(y_test, 'values'):
            y_test = y_test.values

        # Process text data
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.tokenizer.fit_on_texts(X_train)

        X_train_seq = self.tokenizer.texts_to_sequences(X_train)
        X_test_seq = self.tokenizer.texts_to_sequences(X_test)

        X_train_pad = pad_sequences(X_train_seq, maxlen=self.max_len)
        X_test_pad = pad_sequences(X_test_seq, maxlen=self.max_len)

        # Process labels
        self.label_encoder = LabelEncoder()
        y_train_encoded = self.label_encoder.fit_transform(y_train)
        y_test_encoded = self.label_encoder.transform(y_test)

        return X_train_pad, X_test_pad, y_train_encoded, y_test_encoded

    def create_model(self, num_classes):
        """Create CNN model with improved architecture."""
        vocab_size = min(self.max_words, len(self.tokenizer.word_index) + 1)

        model = Sequential([
            # Embedding layer
            Embedding(vocab_size, self.embedding_dim, input_length=self.max_len),

            # Multiple Conv1D layers with different kernel sizes
            Conv1D(128, 3, activation='relu', padding='same'),
            BatchNormalization(),
            Dropout(0.2),

            Conv1D(128, 4, activation='relu', padding='same'),
            BatchNormalization(),
            Dropout(0.2),

            Conv1D(128, 5, activation='relu', padding='same'),
            GlobalMaxPooling1D(),

            # Dense layers with dropout and batch normalization
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),

            Dense(128, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            Dense(num_classes, activation='softmax')
        ])

        optimizer = Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer,
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

        self.model = model
        return model

def calculate_metrics(y_true, y_pred, all_labels):
    return {
        'precision': precision_score(y_true, y_pred,
                                   average='weighted',
                                   zero_division=0,
                                   labels=all_labels),
        'recall': recall_score(y_true, y_pred,
                             average='weighted',
                             zero_division=0,
                             labels=all_labels),
        'f1': f1_score(y_true, y_pred,
                      average='weighted',
                      zero_division=0,
                      labels=all_labels),
        'confusion_matrix': confusion_matrix(y_true, y_pred,
                                           labels=all_labels)
    }

def train_and_evaluate_cnn(X_train, X_test, y_train, y_test):
    """Train and evaluate CNN model with proper preprocessing and metric calculation."""
    # Initialize classifier
    classifier = EmotionCNNClassifier(max_words=10000, max_len=100, embedding_dim=100)

    # Get unique labels before encoding
    all_labels = np.unique(np.concatenate([y_train, y_test]))

    # Preprocess data
    X_train_pad, X_test_pad, y_train_encoded, y_test_encoded = classifier.preprocess_data(
        X_train, X_test, y_train, y_test
    )

    # Create model
    num_classes = len(all_labels)
    classifier.create_model(num_classes)

    # Print model summary
    print("\nModel Architecture:")
    classifier.model.summary()

    # Callbacks
    callbacks = [
        EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        ModelCheckpoint(
            'best_emotion_model.h5',
            monitor='val_loss',
            save_best_only=True
        )
    ]

    # Train model
    history = classifier.model.fit(
        X_train_pad, y_train_encoded,
        epochs=15,
        batch_size=32,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=1
    )

    # Get predictions
    y_pred_probs = classifier.model.predict(X_test_pad)
    y_pred_encoded = np.argmax(y_pred_probs, axis=1)

    # Convert predictions back to original labels
    y_pred = classifier.label_encoder.inverse_transform(y_pred_encoded)

    # Calculate overall metrics
    overall_metrics = calculate_metrics(y_test, y_pred, all_labels)

    # Calculate per-class metrics
    print("\nPer-class Performance:")
    class_metrics = {}
    for label in all_labels:
        true_mask = y_test == label
        pred_mask = y_pred == label

        if np.any(true_mask) or np.any(pred_mask):
            precision = precision_score(y_test == label, y_pred == label,
                                     zero_division=0)
            recall = recall_score(y_test == label, y_pred == label,
                                zero_division=0)
            f1 = f1_score(y_test == label, y_pred == label,
                         zero_division=0)

            class_metrics[label] = {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'support': np.sum(true_mask)
            }

            print(f"\nClass: {label}")
            print(f"Precision: {precision:.2f}")
            print(f"Recall: {recall:.2f}")
            print(f"F1 Score: {f1:.2f}")
            print(f"Support: {np.sum(true_mask)}")

    return overall_metrics, class_metrics, history, classifier

# Example usage:

# Using the same train-test split as previous tasks
overall_metrics, class_metrics, history, model = train_and_evaluate_cnn(X_train, X_test, y_train, y_test)

# Print overall results
print("\nOverall CNN Model Results:")
print(f"Precision: {overall_metrics['precision']:.2f}")
print(f"Recall: {overall_metrics['recall']:.2f}")
print(f"F1 Score: {overall_metrics['f1']:.2f}")
print("Confusion Matrix:\n", overall_metrics['confusion_matrix'])



Model Architecture:
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 conv1d_10 (Conv1D)          (None, 100, 128)          38528     
                                                                 
 batch_normalization_12 (Bat  (None, 100, 128)         512       
 chNormalization)                                                
                                                                 
 dropout_10 (Dropout)        (None, 100, 128)          0         
                                                                 
 conv1d_11 (Conv1D)          (None, 100, 128)          65664     
                                                                 
 batch_normalization_13 (Bat  (None, 100, 128)         512       
 chNormalization)                