In [1]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time


In [2]:
# Load data_identification.csv
data_identification = pd.read_csv('data_identification.csv')

# Load emotion.csv
emotion = pd.read_csv('emotion.csv')

# Merge emotion labels with data_identification
data_identification = data_identification.merge(emotion, on='tweet_id', how='left')


In [3]:
# Initialize lists to hold tweet data
tweet_ids = []
tweet_texts = []

# Read tweets_DM.json
with open('tweets_DM.json', 'r', encoding='utf-8') as f:
    for line in f:
        try:
            tweet_json = json.loads(line)
            tweet_id = tweet_json['_source']['tweet']['tweet_id']
            tweet_text = tweet_json['_source']['tweet']['text']
            tweet_ids.append(tweet_id)
            tweet_texts.append(tweet_text)
        except json.JSONDecodeError:
            continue

# Create a DataFrame
tweets_df = pd.DataFrame({'tweet_id': tweet_ids, 'text': tweet_texts})


In [4]:
# Merge tweets with data_identification
data = data_identification.merge(tweets_df, on='tweet_id', how='left')

# Drop rows with missing tweets
data = data.dropna(subset=['text'])


In [5]:
def preprocess_tweet(tweet_text):
    # Replace <LH> with [EMOTION]
    tweet_text = tweet_text.replace('<LH>', '')
    # Minimal cleaning: remove URLs and mentions
    tweet_text = re.sub(r'http\S+|www\S+|https\S+', '', tweet_text)
    tweet_text = re.sub(r'\@\w+', '', tweet_text)
    return tweet_text

# Apply preprocessing
data['clean_text'] = data['text'].apply(preprocess_tweet)


In [6]:
# Split data into training and test sets
train_data = data[data['identification'] == 'train']
test_data = data[data['identification'] == 'test']

# Encode labels
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['emotion'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['label'] = label_encoder.fit_transform(train_data['emotion'])


In [7]:
# Use TfidfVectorizer or HashingVectorizer
vectorizer = TfidfVectorizer(
    max_features=50000,  # Limit features to speed up
    ngram_range=(1, 2),  # Use unigrams and bigrams
    stop_words='english'
)

# Fit and transform training data
X_train = vectorizer.fit_transform(train_data['clean_text'])

# Transform test data
X_test = vectorizer.transform(test_data['clean_text'])

# Labels
y_train = train_data['label']


In [8]:
# You can choose between MultinomialNB or LogisticRegression
# Multinomial Naive Bayes
# clf = MultinomialNB()

# Or Logistic Regression
clf = LogisticRegression(max_iter=2000, n_jobs=-1)


In [9]:
start_time = time.time()

clf.fit(X_train, y_train)

training_time = time.time() - start_time
print(f"Training time: {training_time:.2f} seconds")


Training time: 230.32 seconds


In [10]:
# Split a small validation set if needed
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=10, stratify=y_train
)

# Re-train on subset if necessary
clf.fit(X_train_sub, y_train_sub)

# Predict on validation set
y_val_pred = clf.predict(X_val)

# Evaluate
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

       anger       0.63      0.22      0.33     11960
anticipation       0.62      0.55      0.58     74681
     disgust       0.47      0.38      0.42     41730
        fear       0.69      0.36      0.47     19200
         joy       0.54      0.80      0.65    154805
     sadness       0.48      0.45      0.47     58031
    surprise       0.65      0.21      0.32     14619
       trust       0.55      0.31      0.40     61643

    accuracy                           0.55    436669
   macro avg       0.58      0.41      0.45    436669
weighted avg       0.55      0.55      0.53    436669



In [11]:
# Predict on the test data
test_pred = clf.predict(X_test)

# Map numeric labels back to emotion strings
test_pred_emotions = label_encoder.inverse_transform(test_pred)


In [12]:
submission_df = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': test_pred_emotions
})

# Save to CSV
submission_df.to_csv('change.csv', index=False)
