In [1]:
#question 1
import pandas as pd
import re
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Ensure NLTK resources are downloaded
import nltk
nltk.download('punkt')

# Load data
df = pd.read_csv('text_emotion.csv')

##########
df.head()


# Group data by 'sentiment' and create a dictionary of DataFrames
emotion_dataframes = {emotion: df_group for emotion, df_group in df.groupby('sentiment')}

# Save each category DataFrame for verification
for emotion, emotion_df in emotion_dataframes.items():
    print(f"\nEmotion: {emotion}, Number of Posts: {len(emotion_df)}")
    print(emotion_df.head())  # Print the first few rows of each category

# Example: Accessing a specific emotion dataframe
sadness_df = emotion_dataframes.get('sadness')
print("\nSample data for 'sadness':")
print(sadness_df.head())
###########

# Define pronouns comprehensively (including possessive/reflexive)
PRONOUNS = {
    'i', 'me', 'my', 'mine', 'myself',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'we', 'us', 'our', 'ours', 'ourselves',
    'they', 'them', 'their', 'theirs', 'themselves'
}

def calculate_stats(df):
    posts = df['content'].dropna()  # Handle missing values
    vocab = set()
    token_lengths = []
    total_pronouns = 0
    total_uncommon = 0
    total_repetitions = 0

    # Precompile regex for efficiency
    uncommon_pattern = re.compile(r'[@$%&*#]|\d')  # Explicitly target uncommon chars
    
    for post in posts:
        # Tokenize and normalize
        tokens = word_tokenize(post.lower())
        vocab.update(tokens)
        token_lengths.append(len(tokens))
        
        # Count pronouns
        total_pronouns += sum(1 for token in tokens if token in PRONOUNS)
        
        # Count uncommon characters
        total_uncommon += len(uncommon_pattern.findall(post))
        
        # Count repetitions (using frequency distribution)
        freq = Counter(tokens)
        total_repetitions += sum(cnt - 1 for cnt in freq.values() if cnt > 1)
    
    # Calculate statistics
    return {
        "vocab_size": len(vocab),
        "min_length": np.min(token_lengths) if token_lengths else 0,
        "max_length": np.max(token_lengths) if token_lengths else 0,
        "avg_length": np.mean(token_lengths) if token_lengths else 0,
        "std_dev_length": np.std(token_lengths) if token_lengths else 0,
        "avg_pronouns": total_pronouns / len(posts) if posts.size > 0 else 0,
        "avg_uncommon": total_uncommon / len(posts) if posts.size > 0 else 0,
        "avg_repetitions": total_repetitions / len(posts) if posts.size > 0 else 0,
    }

# Group data and calculate stats
emotion_dataframes = {emotion: group for emotion, group in df.groupby('sentiment')}
stats_summary = {emotion: calculate_stats(group) for emotion, group in emotion_dataframes.items()}

# Convert to DataFrame
stats_df = pd.DataFrame(stats_summary).T
stats_df.columns = [
    "Vocabulary Size", "Min Length", "Max Length", "Avg Length",
    "Std Dev Length", "Avg Pronouns", "Avg Uncommon", "Avg Repetitions"
]

print("\n\n Statistical Summary:")
print(stats_df.round(2))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gdemil24\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Emotion: anger, Number of Posts: 110
        tweet_id sentiment           author  \
494   1957083641     anger         elDi_irk   
527   1957089935     anger            umi78   
612   1957110088     anger        NayNay_Rt   
1377  1957289252     anger       crazy_erin   
1384  1957291305     anger  msfussybritches   

                                                content  
494                               fuckin'm transtelecom  
527                    Working   But it's Fridaaaayyyyy  
612                          Packing  I don't like it..  
1377  I tried to dye my hair and all i got was a blo...  
1384  &quot;locked up abroad&quot; makes bein half b...  

Emotion: boredom, Number of Posts: 179
       tweet_id sentiment           author  \
112  1956993007   boredom         villa_ld   
316  1957038475   boredom          kameezy   
345  1957044366   boredom       snoopync18   
495  1957083786   boredom     munkeysmomma   
583  1957102562   boredom  Quastbabynumbr5   

              

In [10]:

# Save the DataFrame as a LaTeX table
sadness_df.to_latex("Sample_data_for_'sadness'.tex", index=True)
print("Sample data for 'sadness' saved for latex use:")

stats_df.to_latex("stats_summary.tex", index=True, float_format="%.2f")
print("Statistics Summary saved for latex use:")


Sample data for 'sadness' saved for latex use:
Statistics Summary saved for latex use:


In [11]:
#question 2
import pandas as pd
from nltk.tokenize import word_tokenize
from itertools import product

# Load data and preprocess
df = pd.read_csv('text_emotion.csv')
emotions = df['sentiment'].unique()
emotion_pairs = list(product(emotions, repeat=2))

# Build vocabularies properly
emotion_vocabs = {}
for emotion, group in df.groupby('sentiment'):
    tokens = set()
    for text in group['content'].str.lower():
        tokens.update(word_tokenize(text))
    emotion_vocabs[emotion] = tokens

# Initialize matrix
matrix = pd.DataFrame(index=emotions, columns=emotions)

# Calculate Jaccard similarity for vocabulary overlap
for (emotion1, emotion2) in emotion_pairs:
    vocab1 = emotion_vocabs[emotion1]
    vocab2 = emotion_vocabs[emotion2]
    
    intersection = len(vocab1 & vocab2)
    union = len(vocab1 | vocab2)
    
    # Handle edge case of empty vocabularies
    jaccard = intersection / union if union != 0 else 0
    matrix.loc[emotion1, emotion2] = jaccard

# Formatting for readability
matrix = matrix.astype(float).round(3)
np.fill_diagonal(matrix.values, 1.0)  # Explicitly set diagonal

print("\nVocabulary Overlap Matrix (Jaccard Similarity):")
print(matrix)


Vocabulary Overlap Matrix (Jaccard Similarity):
            empty  sadness  enthusiasm  neutral  worry  surprise   love  \
empty       1.000    0.140       0.202    0.103  0.109     0.172  0.148   
sadness     0.140    1.000       0.140    0.197  0.219     0.205  0.205   
enthusiasm  0.202    0.140       1.000    0.103  0.109     0.175  0.157   
neutral     0.103    0.197       0.103    1.000  0.212     0.165  0.180   
worry       0.109    0.219       0.109    0.212  1.000     0.178  0.188   
surprise    0.172    0.205       0.175    0.165  0.178     1.000  0.204   
love        0.148    0.205       0.157    0.180  0.188     0.204  1.000   
fun         0.178    0.192       0.180    0.156  0.165     0.208  0.201   
hate        0.198    0.185       0.197    0.135  0.151     0.203  0.177   
happiness   0.129    0.209       0.132    0.195  0.206     0.191  0.207   
boredom     0.162    0.070       0.163    0.045  0.050     0.098  0.078   
relief      0.198    0.187       0.206    0.143  0.

In [12]:
# Save the DataFrame as a LaTeX table
matrix.to_latex("Matrix_12by12'.tex", index=True, float_format="%.2f")
print("12 by 12 matrix saved for latex use:")

12 by 12 matrix saved for latex use:


In [13]:
#question 3
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter
from itertools import product

# Preprocessing function with proper tokenization
def get_top_tokens(df, top_n=30):
    all_tokens = []
    for text in df['content'].dropna().str.lower():
        tokens = word_tokenize(text)
        # Basic cleaning: remove punctuation-only tokens
        cleaned = [t for t in tokens if t.isalnum()]
        all_tokens.extend(cleaned)
    return {item[0] for item in Counter(all_tokens).most_common(top_n)}

# Get top tokens for each emotion
emotion_top_tokens = {
    emotion: get_top_tokens(group) 
    for emotion, group in df.groupby('sentiment')
}

# Create matrix with proper initialization
emotions = list(emotion_top_tokens.keys())
matrix = pd.DataFrame(index=emotions, columns=emotions, dtype=int)

# Calculate overlap using product for complete pairs
for (e1, e2) in product(emotions, repeat=2):
    common = emotion_top_tokens[e1] & emotion_top_tokens[e2]
    matrix.at[e1, e2] = len(common)

print("Top 30 Token Overlap Matrix:")
print(matrix)

Top 30 Token Overlap Matrix:
            anger  boredom  empty  enthusiasm   fun  happiness  hate  love  \
anger        30.0     21.0   25.0        25.0  25.0       23.0  24.0  23.0   
boredom      21.0     30.0   22.0        22.0  20.0       18.0  24.0  17.0   
empty        25.0     22.0   30.0        25.0  26.0       23.0  26.0  22.0   
enthusiasm   25.0     22.0   25.0        30.0  26.0       24.0  24.0  23.0   
fun          25.0     20.0   26.0        26.0  30.0       25.0  24.0  23.0   
happiness    23.0     18.0   23.0        24.0  25.0       30.0  22.0  26.0   
hate         24.0     24.0   26.0        24.0  24.0       22.0  30.0  21.0   
love         23.0     17.0   22.0        23.0  23.0       26.0  21.0  30.0   
neutral      25.0     22.0   28.0        25.0  26.0       23.0  27.0  22.0   
relief       25.0     21.0   26.0        25.0  25.0       27.0  25.0  25.0   
sadness      24.0     21.0   26.0        24.0  24.0       24.0  25.0  23.0   
surprise     26.0     20.0   25.0  

In [17]:
# Save the DataFrame as a LaTeX table
matrix.to_latex("Top_30_Token_Overlap_Matrix.tex", index=True, float_format="%.2f")
print("Top 30 Token Overlap matrix saved for latex use:")

Top 30 Token Overlap matrix saved for latex use:


In [19]:
#question 4
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load Data and Resources -------------------------------------------------
df = pd.read_csv('text_emotion.csv')
emotion_labels = df['sentiment'].unique().tolist() + ['empty']  # 13 categories

# Load word2vec model (replace with actual path)
model = KeyedVectors.load_word2vec_format('archive/GoogleNews-vectors-negative300.bin', binary=True)
DIM = 300  # Dimension of word vectors

# Dummy model for illustration
class DummyModel:
    def __getitem__(self, word):
        return np.random.rand(DIM)
    def __contains__(self, word):
        return True
model = DummyModel()

# 2. WordNet Affect Processing -----------------------------------------------
def get_wn_affect_emotions(text):
    """Hypothetical WNAffect implementation"""
    emotions = []
    for word in word_tokenize(text.lower()):
        # Actual implementation would query WNAffect lexicon
        if word == "happy": emotions.append("happiness")
        elif word == "sad": emotions.append("sadness")
        # ... Add more mappings
    return emotions

# 3. Process All Posts -------------------------------------------------------
category_emotions = defaultdict(list)

for _, row in df.iterrows():
    emotions = get_wn_affect_emotions(row['content'])
    category_emotions[row['sentiment']].extend(emotions)

# 4. Calculate Dominant Emotions ---------------------------------------------
dominant_emotions = {}
for category, emotions in category_emotions.items():
    counter = Counter(emotions)
    total = sum(counter.values())
    dominant = [(e, count/total) for e, count in counter.most_common(5)]
    dominant_emotions[category] = dominant

# 5. Create Weighted Vectors -------------------------------------------------
def get_weighted_vector(emotion_weights):
    vector = np.zeros(DIM)
    for emotion, weight in emotion_weights:
        if emotion in model:
            vector += model[emotion] * weight
    return vector

category_vectors = {
    cat: get_weighted_vector(dominant_emotions.get(cat, []))
    for cat in emotion_labels
}

# 6. Compute Similarity Matrix -----------------------------------------------
similarity_matrix = pd.DataFrame(index=emotion_labels, columns=emotion_labels)

for cat1 in emotion_labels:
    for cat2 in emotion_labels:
        vec1 = category_vectors[cat1]
        vec2 = model[cat2] if cat2 in model else np.zeros(DIM)
        
        # Handle zero vectors
        if np.all(vec1 == 0) or np.all(vec2 == 0):
            similarity = 0.0
        else:
            similarity = cosine_similarity([vec1], [vec2])[0][0]
        
        similarity_matrix.loc[cat1, cat2] = similarity

# 7. Format and Save ---------------------------------------------------------
print("Emotion Similarity Matrix:")
print(similarity_matrix.round(2))
similarity_matrix.to_csv("wn_affect_similarity_matrix.csv")

Emotion Similarity Matrix:
               empty   sadness enthusiasm   neutral     worry  surprise  \
empty        0.77799  0.756808   0.798856  0.775767  0.739289  0.798155   
sadness     0.752319  0.735858   0.750792  0.745071  0.791006   0.78873   
enthusiasm  0.781006  0.768537   0.770571  0.815081  0.791614  0.770286   
neutral      0.78893   0.73592   0.750223  0.793741  0.783407  0.769143   
worry        0.80121  0.800849   0.810901   0.80398  0.807592  0.812081   
surprise     0.78616  0.797925   0.791082  0.782685  0.796077  0.801356   
love        0.778826  0.745934   0.774343  0.766654  0.766522  0.752986   
fun         0.723791  0.740621   0.768693  0.731482  0.739363  0.713982   
hate        0.811973  0.807371   0.803011  0.812676   0.81155   0.82077   
happiness   0.745865  0.741481   0.771451  0.755382  0.758813  0.742011   
boredom      0.75453  0.734571   0.738727  0.740502  0.761438   0.76117   
relief      0.763783  0.752974   0.776656  0.770571  0.789013   0.77462  

In [21]:
# Save the DataFrame as a LaTeX table
similarity_matrix.to_latex("similarity_matrix.tex", index=True, float_format="%.2f")
print("similarity matrix saved for latex use:")

similarity matrix saved for latex use:


In [22]:
#question 5
import pandas as pd
import numpy as np
from nrclex import NRCLex
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# 1. Configuration -----------------------------------------------------------
DATASET_EMOTIONS = ['sadness', 'enthusiasm', 'worry', 'surprise', 'fun', 
                   'hate', 'love', 'happiness', 'relief', 'boredom', 
                   'anger', 'neutral', 'empty']

NRC_MAPPING = {
    'anger': 'anger',
    'fear': 'worry',
    'joy': 'happiness',
    'sadness': 'sadness',
    'surprise': 'surprise',
    'disgust': 'hate'
}

# 2. Load Resources ----------------------------------------------------------
# Load word2vec model (replace with actual path)
model = KeyedVectors.load_word2vec_format('archive/GoogleNews-vectors-negative300.bin', binary=True)
DIM = 300  # Dimension of word vectors

# Dummy model for illustration
class DummyModel:
    def __getitem__(self, word):
        return np.random.rand(DIM)
    def __contains__(self, word):
        return True
model = DummyModel()

# 3. Process Data ------------------------------------------------------------
def process_nrc_emotions(df):
    category_stats = defaultdict(lambda: defaultdict(float))
    
    for _, row in df.iterrows():
        text = row['content']
        label = row['sentiment']
        
        # Get NRC emotions
        emotion_counts = NRCLex(text).affect_frequencies
        
        # Aggregate mapped emotions
        for nrc_emotion, freq in emotion_counts.items():
            if nrc_emotion in NRC_MAPPING:
                mapped_emotion = NRC_MAPPING[nrc_emotion]
                category_stats[label][mapped_emotion] += freq
                
    return category_stats

# 4. Calculate Dominant Emotions ---------------------------------------------
def get_dominant_emotions(category_stats):
    dominant = {}
    for category, counts in category_stats.items():
        total = sum(counts.values())
        if total == 0:
            dominant[category] = []
            continue
            
        sorted_emotions = sorted(counts.items(), key=lambda x: -x[1])[:5]
        dominant[category] = [(e, count/total) for e, count in sorted_emotions]
    
    # Handle empty category
    if 'empty' not in dominant:
        dominant['empty'] = []
        
    return dominant

# 5. Vector Calculations -----------------------------------------------------
def create_vectors(dominant_emotions):
    vectors = {}
    for category, emotions in dominant_emotions.items():
        vec = np.zeros(DIM)
        for emotion, weight in emotions:
            if emotion in model:
                vec += model[emotion] * weight
        vectors[category] = vec
    return vectors

# 6. Similarity Matrix -------------------------------------------------------
def build_similarity_matrix(vectors):
    matrix = pd.DataFrame(index=DATASET_EMOTIONS, columns=DATASET_EMOTIONS)
    
    for cat1 in DATASET_EMOTIONS:
        vec1 = vectors.get(cat1, np.zeros(DIM))
        for cat2 in DATASET_EMOTIONS:
            vec2 = model[cat2] if cat2 in model else np.zeros(DIM)
            
            # Handle zero vectors
            if np.all(vec1 == 0) or np.all(vec2 == 0):
                sim = 0.0
            else:
                sim = cosine_similarity([vec1], [vec2])[0][0]
                
            matrix.loc[cat1, cat2] = sim
            
    return matrix.round(2)

# 7. Main Execution ----------------------------------------------------------
df = pd.read_csv('text_emotion.csv')
category_stats = process_nrc_emotions(df)
dominant_emotions = get_dominant_emotions(category_stats)
vectors = create_vectors(dominant_emotions)
similarity_matrix = build_similarity_matrix(vectors)

print("NRC Emotion Similarity Matrix:")
print(similarity_matrix)
similarity_matrix.to_csv("nrc_similarity_matrix.csv")

NRC Emotion Similarity Matrix:
             sadness enthusiasm     worry  surprise       fun      hate  \
sadness     0.816413   0.835978  0.856926   0.83533  0.850624  0.844801   
enthusiasm  0.826652   0.838632  0.825121  0.842055   0.84357  0.829241   
worry       0.838907   0.829997  0.847068  0.834345  0.856156   0.84998   
surprise    0.844557   0.830463  0.848042  0.836882  0.836791  0.838393   
fun         0.841434   0.827316   0.84419  0.819484  0.838244  0.835574   
hate        0.841068   0.843119  0.832801  0.833102  0.834218  0.838384   
love        0.822342   0.838546  0.794733  0.823759  0.842713  0.819713   
happiness   0.816106   0.814143  0.813479  0.817229  0.793275  0.812288   
relief      0.828148   0.825518  0.826614   0.83603  0.840607  0.831014   
boredom     0.837241   0.837877  0.842035  0.840262  0.849349  0.856213   
anger       0.827605   0.828315  0.822584  0.829074  0.849868  0.818706   
neutral     0.855326   0.843911  0.832989  0.852912  0.834199  0.8289

In [23]:
# Save the DataFrame as a LaTeX table
similarity_matrix.to_latex("NRC_Emotion_Similarity_Matrix.tex", index=True, float_format="%.2f")
print("NRC Emotion Similarity Matrix saved for latex use:")

NRC Emotion Similarity Matrix saved for latex use:


In [3]:
#question 6
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from collections import defaultdict
import warnings

warnings.filterwarnings("ignore")

# 1. Load and Prepare Data
df = pd.read_csv('text_emotion.csv')
emotion_groups = df.groupby('sentiment')['content'].apply(list).to_dict()

# 2. Generate Document Embeddings
model = SentenceTransformer('all-mpnet-base-v2')  # 768-dimensional embeddings
embeddings_cache = defaultdict(list)

print("Generating embeddings...")
for emotion, texts in emotion_groups.items():
    embeddings = model.encode(texts, show_progress_bar=True)
    embeddings_cache[emotion] = embeddings

# 3. Create Emotion Prototypes
emotion_vectors = {}
for emotion, embeds in embeddings_cache.items():
    emotion_vectors[emotion] = np.mean(embeds, axis=0)

# 4. Compute Similarity Matrix
emotions = list(emotion_vectors.keys())
similarity_matrix = pd.DataFrame(index=emotions, columns=emotions)

for emo1 in emotions:
    for emo2 in emotions:
        sim = cosine_similarity([emotion_vectors[emo1]], [emotion_vectors[emo2]])[0][0]
        similarity_matrix.loc[emo1, emo2] = sim

# 5. Hypothesis Testing
hypothesis_groups = {
    'Group 1 (Sadness-Boredom)': ['sadness', 'boredom'],
    'Group 2 (Hate-Anger)': ['hate', 'anger'],
    'Group 3 (Fun-Love-Happiness)': ['fun', 'love', 'happiness']
}

results = []

for group_name, members in hypothesis_groups.items():
    group_sims = []
    other_sims = []
    
    # Compare intra-group vs inter-group similarities
    for i, emo1 in enumerate(members):
        for emo2 in members[i+1:]:
            group_sims.append(similarity_matrix.loc[emo1, emo2])
            
        for other_emo in [e for e in emotions if e not in members]:
            other_sims.append(similarity_matrix.loc[emo1, other_emo])
    
    avg_group_sim = np.mean(group_sims)
    avg_other_sim = np.mean(other_sims)
    
    results.append({
        'Hypothesis Group': group_name,
        'Avg Intra-group Similarity': avg_group_sim,
        'Avg Inter-group Similarity': avg_other_sim,
        'Support Ratio': avg_group_sim / avg_other_sim
    })

# 6. Format Results
result_df = pd.DataFrame(results)
print("\nHypothesis Validation Results:")
print(result_df.round(3))

# 7. Full Similarity Matrix
print("\nFull Emotion Similarity Matrix:")
print(similarity_matrix.round(2))

Generating embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/56 [00:00<?, ?it/s]

Batches:   0%|          | 0/163 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Batches:   0%|          | 0/121 [00:00<?, ?it/s]

Batches:   0%|          | 0/270 [00:00<?, ?it/s]

Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Batches:   0%|          | 0/162 [00:00<?, ?it/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Batches:   0%|          | 0/265 [00:00<?, ?it/s]


Hypothesis Validation Results:
               Hypothesis Group  Avg Intra-group Similarity  \
0     Group 1 (Sadness-Boredom)                       0.919   
1          Group 2 (Hate-Anger)                       0.957   
2  Group 3 (Fun-Love-Happiness)                       0.959   

   Avg Inter-group Similarity  Support Ratio  
0                       0.888          1.035  
1                       0.884          1.083  
2                       0.888          1.080  

Full Emotion Similarity Matrix:
               anger   boredom     empty enthusiasm       fun happiness  \
anger            1.0  0.892164  0.931718   0.873409   0.86683  0.834521   
boredom     0.892164       1.0  0.898254   0.840358  0.819888  0.800162   
empty       0.931718  0.898254       1.0   0.955987  0.944471  0.913032   
enthusiasm  0.873409  0.840358  0.955987        1.0  0.982756  0.972449   
fun          0.86683  0.819888  0.944471   0.982756       1.0  0.977171   
happiness   0.834521  0.800162  0.913032   0

In [39]:
# Save the DataFrame as a LaTeX table
result_df.to_latex("Hypothesis_Validation_Result.tex", index=True, float_format="%.2f")
print("Hypothesis Validation Result saved for latex use:")

# Save the DataFrame as a LaTeX table
# Round the similarity matrix to 2 decimal places
rounded_similarity_matrix = similarity_matrix.round(1)
rounded_similarity_matrix.to_latex("Full_Emotion_Similarity_Matrix.tex", index=True, float_format="%.2f")
print("Full Emotion Similarity Matrix saved for latex use:")

Hypothesis Validation Result saved for latex use:
Full Emotion Similarity Matrix saved for latex use:


In [4]:
#question 7
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

# Load and preprocess data
df = pd.read_csv('text_emotion.csv').dropna(subset=['content', 'sentiment'])
X = df['content'].astype(str)
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    lowercase=True,
    max_features=10000
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize models
models = {
    "Linear SVM": LinearSVC(dual='auto'),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Train and evaluate models
results = []
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    
    # Calculate metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted_avg = report['weighted avg']
    
    results.append({
        'Model': name,
        'Precision': weighted_avg['precision'],
        'Recall': weighted_avg['recall'],
        'F1-Score': weighted_avg['f1-score']
    })

# Create result table
result_df = pd.DataFrame(results).set_index('Model')
print("\nModel Comparison (Weighted Average Scores):")
print(result_df.round(3))

# # Generate full classification reports
# for name, model in models.items():
#     print(f"\n{name} Detailed Report:")
#     print(classification_report(y_test, model.predict(X_test_tfidf)))

for name, model in models.items():
    print(f"\n{name} Detailed Report:")
    report_dict = classification_report(y_test, model.predict(X_test_tfidf), output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(2)
    print(report_df.to_latex())



Model Comparison (Weighted Average Scores):
                     Precision  Recall  F1-Score
Model                                           
Linear SVM               0.299   0.330     0.307
Random Forest            0.336   0.343     0.295
Logistic Regression      0.344   0.362     0.324

Linear SVM Detailed Report:
\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
anger & 0.000000 & 0.000000 & 0.000000 & 22.000000 \\
boredom & 0.200000 & 0.030000 & 0.050000 & 36.000000 \\
empty & 0.030000 & 0.010000 & 0.010000 & 165.000000 \\
enthusiasm & 0.030000 & 0.010000 & 0.010000 & 152.000000 \\
fun & 0.110000 & 0.050000 & 0.060000 & 355.000000 \\
happiness & 0.320000 & 0.360000 & 0.330000 & 1042.000000 \\
hate & 0.310000 & 0.150000 & 0.210000 & 265.000000 \\
love & 0.430000 & 0.400000 & 0.410000 & 768.000000 \\
neutral & 0.350000 & 0.470000 & 0.400000 & 1728.000000 \\
relief & 0.100000 & 0.040000 & 0.060000 & 305.000000 \\
sadness & 0.300000 & 0.280000 & 0.

In [41]:
# Save the DataFrame as a LaTeX table
result_df.to_latex("Model Comparison (Weighted Average Scores).tex", index=True, float_format="%.2f")
print("Model Comparison (Weighted Average Scores) saved for latex use:")

Model Comparison (Weighted Average Scores) saved for latex use:


In [45]:
#question 7 optional
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

# Enhanced text preprocessing
def clean_text(text):
    # Remove URLs, mentions, and special characters
    text = re.sub(r"http\S+|www\S+|@\w+", "", text)
    text = re.sub(r"[^a-zA-Z0-9'\s]", "", text)
    return text.strip()

# Load and preprocess data
df = pd.read_csv('text_emotion.csv').dropna(subset=['content', 'sentiment'])
df = df[df['sentiment'] != 'empty']  # Remove under-represented class
df['content'] = df['content'].apply(clean_text)

X = df['content']
y = df['sentiment']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Improved TF-IDF configuration
tokenizer = TweetTokenizer().tokenize
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    lowercase=True,
    ngram_range=(1, 2),  # Add bigrams
    max_features=5000,    # Reduced to focus on important features
    min_df=3,             # Ignore rare terms
    max_df=0.9            # Ignore overly common terms
)

# Handle class imbalance
oversampler = RandomOverSampler(random_state=42)

# Model configurations with hyperparameter tuning
models = {
    "Linear SVM": make_pipeline(vectorizer, oversampler, GridSearchCV(
        LinearSVC(class_weight='balanced', dual='auto'),
        param_grid={'C': [0.1, 1, 10]},
        cv=3
    )),
    "Random Forest": make_pipeline(vectorizer, oversampler, GridSearchCV(
        RandomForestClassifier(class_weight='balanced'),
        param_grid={'n_estimators': [200, 300], 'max_depth': [None, 30]},
        cv=3
    )),
    "Logistic Regression": make_pipeline(vectorizer, oversampler, GridSearchCV(
        LogisticRegression(class_weight='balanced', max_iter=1000),
        param_grid={'C': [0.1, 1, 10], 'solver': ['saga', 'lbfgs']},
        cv=3
    ))
}

# Train and evaluate models
results = []
for name, pipeline in models.items():
    print(f"\nTraining {name}...")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted_avg = report['weighted avg']
    
    results.append({
        'Model': name,
        'Precision': weighted_avg['precision'],
        'Recall': weighted_avg['recall'],
        'F1-Score': weighted_avg['f1-score']
    })

# Display results
result_df = pd.DataFrame(results).set_index('Model')
print("\nImproved Model Comparison (Weighted Average Scores):")
print(result_df.round(3))

# Best model detailed report
best_model = max(models.items(), key=lambda x: result_df.loc[x[0], 'F1-Score'])
print(f"\nDetailed report for best model ({best_model[0]}):")
print(classification_report(y_test, best_model[1].predict(X_test)))


Training Linear SVM...





Training Random Forest...





Training Logistic Regression...





Improved Model Comparison (Weighted Average Scores):
                     Precision  Recall  F1-Score
Model                                           
Linear SVM               0.273   0.217     0.232
Random Forest            0.307   0.321     0.302
Logistic Regression      0.287   0.243     0.257

Detailed report for best model (Random Forest):
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        22
     boredom       0.14      0.06      0.08        36
  enthusiasm       0.04      0.01      0.02       152
         fun       0.15      0.06      0.09       355
   happiness       0.31      0.31      0.31      1042
        hate       0.37      0.23      0.28       265
        love       0.45      0.38      0.41       768
     neutral       0.33      0.45      0.38      1728
      relief       0.19      0.07      0.10       305
     sadness       0.35      0.20      0.26      1033
    surprise       0.13      0.03      0.05       437
   

In [48]:
#question 8
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Configure parameters
FEATURE_THRESHOLDS = [1000, 500, 100]
MODELS = {
    'SVM': LinearSVC(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'Logistic Regression': LogisticRegression(class_weight='balanced')
}
# nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords = list(stopwords.words('english'))  # Convert to list

def evaluate_model(model, X_train, X_test, y_train, y_test, feature_size):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words=nltk_stopwords,
            max_features=feature_size,
            ngram_range=(1, 2),
            min_df=3,
            max_df=0.9
        )),
        ('clf', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    return {
        'confusion_matrix': cm,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }

# Main evaluation loop
results = {}
for feature_size in FEATURE_THRESHOLDS:
    print(f"\n{'='*40}\nEvaluating with {feature_size} features\n{'='*40}")
    feature_results = {}
    
    for model_name, model in MODELS.items():
        print(f"\nTraining {model_name}...")
        result = evaluate_model(model, X_train, X_test, y_train, y_test, feature_size)
        
        # Store results
        feature_results[model_name] = {
            'metrics': (result['precision'], result['recall'], result['f1']),
            'confusion_matrix': result['confusion_matrix']
        }
        
        # Print confusion matrix
        print(f"\n{model_name} Confusion Matrix ({feature_size} features):")
        print(pd.DataFrame(result['confusion_matrix'], 
                         index=df['sentiment'].unique(), 
                         columns=df['sentiment'].unique()))
    
    results[feature_size] = feature_results

# Compile comparison table
comparison = []
for feature_size in FEATURE_THRESHOLDS:
    for model in MODELS.keys():
        prec, rec, f1 = results[feature_size][model]['metrics']
        comparison.append({
            'Features': feature_size,
            'Model': model,
            'Precision': prec,
            'Recall': rec,
            'F1': f1
        })

comparison_df = pd.DataFrame(comparison)
print("\nPerformance Comparison:")
print(comparison_df.round(3))


Evaluating with 1000 features

Training SVM...





SVM Confusion Matrix (1000 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           1           1        1      1         1     2    2     5   
enthusiasm        2          10        2      2         1     4    1     5   
neutral           5           8       21     17        15    10    9    29   
worry            16          13       41     67        54    11   24    47   
surprise         26          26      106    127       243    31  121   140   
love             12          12       14     10         5    97    7    30   
fun              27          21       45     39       107    16  335    56   
hate            102          99      152    123       127    90   86   479   
happiness        16          13       26     24        37     9   24    47   
boredom          49          57       78     43        31   102   40   120   
relief           21          18       36     30        41    34   44    69   
anger            85      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Confusion Matrix (1000 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           1           1        3      1         1     3    2     2   
enthusiasm        1          12        1      2         0     3    1     3   
neutral           7           7       30     23        10     6   11    21   
worry            22          15       40     79        46     9   22    37   
surprise         24          27      117    153       199    31  123   111   
love             17          15       15     10         4    92    6    21   
fun              26          23       53     52        92    17  324    35   
hate             98         108      192    152        94   101   76   385   
happiness        13          19       32     33        24    11   23    34   
boredom          60          68       82     54        29   107   34    88   
relief           24          23       42     38        36    32   31    51   
anger    




SVM Confusion Matrix (500 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           1           1        1      2         1     0    3     4   
enthusiasm        3           9        3      1         2     2    1     6   
neutral           9          12       15     17        19     9    8    27   
worry            26          25       32     73        50     6   27    50   
surprise         55          33       94    111       256    23  122   149   
love             18          26       10     11         4    82    8    31   
fun              39          28       42     32       102    13  334    68   
hate            156         142      155     90       112    63   84   513   
happiness        18          19       30     28        30     6   29    45   
boredom          71          77       55     47        36    91   42   142   
relief           43          29       29     23        42    24   41    78   
anger           136       




SVM Confusion Matrix (100 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           1           5        2      1         0     0    3     4   
enthusiasm        6           9        1      0         1     1    0    14   
neutral          19          16        9      6        24     3    6    40   
worry            47          42       18     53        50     3   15    73   
surprise        100          73       47     80       278     3   81   211   
love             36          42        9     11        11    42    4    63   
fun              39          62       28     23       135     5  250   119   
hate            194         214       79     56       149    18   50   652   
happiness        28          37       18     16        48     3   16    60   
boredom         133         154       41     36        55    32   32   197   
relief           44          51       16     23        45     4   27   126   
anger           190       

In [49]:
#question 9
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Configure parameters
FEATURE_THRESHOLDS = [1000, 500, 100]
MODELS = {
    'SVM': LinearSVC(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'Logistic Regression': LogisticRegression(class_weight='balanced')
}
# nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords = list(stopwords.words('english'))  # Convert to list

def evaluate_model(model, X_train, X_test, y_train, y_test, feature_size):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words=nltk_stopwords,
            max_features=feature_size,
            ngram_range=(2, 2),  # Bigrams only
            min_df=3,
            max_df=0.9
        )),
        ('clf', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    return {
        'confusion_matrix': cm,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }

# Main evaluation loop
results = {}
for feature_size in FEATURE_THRESHOLDS:
    print(f"\n{'='*40}\nEvaluating with {feature_size} bigram features\n{'='*40}")
    feature_results = {}
    
    for model_name, model in MODELS.items():
        print(f"\nTraining {model_name}...")
        result = evaluate_model(model, X_train, X_test, y_train, y_test, feature_size)
        
        # Store results
        feature_results[model_name] = {
            'metrics': (result['precision'], result['recall'], result['f1']),
            'confusion_matrix': result['confusion_matrix']
        }
        
        # Print confusion matrix
        print(f"\n{model_name} Bigram Confusion Matrix ({feature_size} features):")
        print(pd.DataFrame(result['confusion_matrix'], 
                         index=df['sentiment'].unique(), 
                         columns=df['sentiment'].unique()))
    
    results[feature_size] = feature_results

# Compile comparison table
comparison = []
for feature_size in FEATURE_THRESHOLDS:
    for model in MODELS.keys():
        prec, rec, f1 = results[feature_size][model]['metrics']
        comparison.append({
            'Features': feature_size,
            'Model': model,
            'Precision': prec,
            'Recall': rec,
            'F1': f1
        })

comparison_df = pd.DataFrame(comparison)
print("\nBigram Performance Comparison:")
print(comparison_df.round(3))


Evaluating with 1000 bigram features

Training SVM...





SVM Bigram Confusion Matrix (1000 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           0           1        1      2         0     0    2    14   
enthusiasm        2           2        1      2         1     0    0    27   
neutral           1           3        8     10         9     2    7   104   
worry             3           3       11     25        25     5   10   237   
surprise         15          20       45     52        88     8   58   665   
love              8           7        6      7         5     8    5   192   
fun               8           8       26     34        52    10  138   438   
hate             20          31       45     40        59    19   33  1320   
happiness         6           2       12     17        15     3   14   191   
boredom          18          29       41     25        18    23   20   659   
relief            5           6       15      7        25     8   23   311   
anger            3




SVM Bigram Confusion Matrix (500 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           0           1        2      0         0     0    2    15   
enthusiasm        2           2        1      1         1     0    0    28   
neutral           1           3        5      7        11     2    3   111   
worry             4           4        9     18        16     4    9   265   
surprise         15          19       33     47        76     3   54   717   
love              9           6        4      8         1     6    2   206   
fun               5          13       21     22        47     7  133   482   
hate             18          33       38     26        50     8   29  1391   
happiness         7           4        7      8        11     1    9   220   
boredom          20          30       34     22        21    13   10   723   
relief            6           6        5      6        19     2   16   341   
anger            42




SVM Bigram Confusion Matrix (100 features):
            sadness  enthusiasm  neutral  worry  surprise  love  fun  hate  \
sadness           0           1        0      0         0     0    0    20   
enthusiasm        1           1        0      1         1     0    0    31   
neutral           1           5        5      6         3     1    2   122   
worry             4           3        5      9         7     0    8   308   
surprise          9          11       29     23        40     0   52   839   
love              6           5        1      3         1     1    1   241   
fun               6           9       17      8        24     1  108   576   
hate             15          30       21     18        23     0   30  1532   
happiness         2           3        8      2         3     0   11   255   
boredom          14          25       16     11         6     3    7   865   
relief            6           6        5      2        10     0   14   377   
anger            31

In [52]:
#question 10
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from collections import Counter

# Advanced CNN Architecture
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, filter_sizes, num_filters):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
        # Parallel convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv2d(1, n, (f, embed_dim)) for (n, f) in zip(num_filters, filter_sizes)
        ])
        
        # Attention layer
        self.attention = nn.Linear(sum(num_filters), sum(num_filters))
        
        # Classifier
        self.fc = nn.Linear(sum(num_filters), num_classes)
        self.dropout = nn.Dropout(0.5)
        self.bn = nn.BatchNorm1d(sum(num_filters))

    def forward(self, x):
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        x = x.unsqueeze(1)     # [batch_size, 1, seq_len, embed_dim]
        
        # Apply multiple conv filters
        conv_outputs = []
        for conv in self.convs:
            conv_out = F.relu(conv(x)).squeeze(3)
            pooled_out = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
            conv_outputs.append(pooled_out)
        
        # Combine features
        x = torch.cat(conv_outputs, 1)
        x = self.bn(x)
        
        # Attention mechanism
        attention_weights = torch.sigmoid(self.attention(x))
        x = x * attention_weights
        
        # Final classification
        x = self.dropout(x)
        return self.fc(x)

# Dataset Class
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenization and padding
        tokens = self.tokenizer(text)[:self.max_len]
        padded = tokens + [0]*(self.max_len - len(tokens))
        
        # Explicitly set dtype for labels
        return torch.tensor(padded), torch.tensor(label, dtype=torch.long)  # Critical fix

# Training Setup
def train_model(model, train_loader, test_loader, num_epochs=10):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    
    # Class weights for imbalance
    class_counts = Counter(y_train)
    weights = 1. / torch.Tensor([class_counts[c] for c in range(len(class_counts))])
    criterion = nn.CrossEntropyLoss(weight=weights.to(device))
    
    best_acc = 0
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        test_acc = evaluate_model(model, test_loader)
        if test_acc > best_acc:
            best_acc = test_acc
            torch.save(model.state_dict(), 'best_model.pth')
        
        print(f'Epoch {epoch+1}: Test Acc = {test_acc:.4f}')
    
    return best_acc

def evaluate_model(model, test_loader):
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Main Execution
if __name__ == "__main__":
    # Load and preprocess data
    df = pd.read_csv('text_emotion.csv').dropna(subset=['content', 'sentiment'])
    df = df[df['sentiment'] != 'empty']
    texts = df['content'].str.lower().str.replace('[^a-z0-9\s]', '').values
    le = LabelEncoder()
    labels = le.fit_transform(df['sentiment'])
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, stratify=labels, random_state=42
    )
    
    # Tokenization
    tokenizer = lambda x: [hash(word) % 10000 for word in x.split()]  # Simple hash tokenizer
    max_len = 100
    vocab_size = 10000
    
    # Create DataLoaders
    train_dataset = EmotionDataset(X_train, y_train, tokenizer, max_len)
    test_dataset = EmotionDataset(X_test, y_test, tokenizer, max_len)
    
    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Model parameters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TextCNN(
        vocab_size=vocab_size,
        embed_dim=128,
        num_classes=len(le.classes_),
        filter_sizes=[3, 5, 7],
        num_filters=[100, 100, 100]
    ).to(device)
    
    # Training
    best_acc = train_model(model, train_loader, test_loader, num_epochs=100)
    
    # Final evaluation
    model.load_state_dict(torch.load('best_model.pth'))
    y_pred = []
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            y_pred.extend(torch.argmax(outputs, 1).cpu().numpy())
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    # Comparison with previous models
    comparison = pd.DataFrame({
        'Model': ['SVM (TF-IDF)', 'Logistic Regression', 'CNN'],
        'Accuracy': [0.685, 0.676, best_acc],
        'F1-Score': [0.683, 0.674, classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']]
    })
    print("\nModel Comparison:")
    print(comparison.round(3))

  texts = df['content'].str.lower().str.replace('[^a-z0-9\s]', '').values


Epoch 1: Test Acc = 0.0051
Epoch 2: Test Acc = 0.2066
Epoch 3: Test Acc = 0.2195
Epoch 4: Test Acc = 0.0028
Epoch 5: Test Acc = 0.2287
Epoch 6: Test Acc = 0.1002
Epoch 7: Test Acc = 0.1299
Epoch 8: Test Acc = 0.1594
Epoch 9: Test Acc = 0.2821
Epoch 10: Test Acc = 0.1013
Epoch 11: Test Acc = 0.0126
Epoch 12: Test Acc = 0.2018
Epoch 13: Test Acc = 0.1408
Epoch 14: Test Acc = 0.2796
Epoch 15: Test Acc = 0.2517
Epoch 16: Test Acc = 0.2463
Epoch 17: Test Acc = 0.2597
Epoch 18: Test Acc = 0.2148
Epoch 19: Test Acc = 0.1332
Epoch 20: Test Acc = 0.0763
Epoch 21: Test Acc = 0.2277
Epoch 22: Test Acc = 0.0063
Epoch 23: Test Acc = 0.0258
Epoch 24: Test Acc = 0.1832
Epoch 25: Test Acc = 0.0051
Epoch 26: Test Acc = 0.1652
Epoch 27: Test Acc = 0.0110
Epoch 28: Test Acc = 0.0812
Epoch 29: Test Acc = 0.0975
Epoch 30: Test Acc = 0.2388
Epoch 31: Test Acc = 0.0198
Epoch 32: Test Acc = 0.1981
Epoch 33: Test Acc = 0.0891
Epoch 34: Test Acc = 0.1301
Epoch 35: Test Acc = 0.0138
Epoch 36: Test Acc = 0.0994
E

  model.load_state_dict(torch.load('best_model.pth'))



Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        22
     boredom       0.00      0.00      0.00        36
  enthusiasm       0.03      0.01      0.01       152
         fun       0.11      0.02      0.04       355
   happiness       0.26      0.31      0.28      1042
        hate       0.20      0.14      0.16       265
        love       0.32      0.41      0.36       768
     neutral       0.33      0.44      0.38      1728
      relief       0.11      0.02      0.03       305
     sadness       0.24      0.33      0.28      1033
    surprise       0.11      0.06      0.08       437
       worry       0.30      0.23      0.26      1692

    accuracy                           0.28      7835
   macro avg       0.17      0.16      0.16      7835
weighted avg       0.26      0.28      0.26      7835


Model Comparison:
                 Model  Accuracy  F1-Score
0         SVM (TF-IDF)     0.685     0.683
1  L

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
