In [1]:
# import the following libraries into the Python environment
import numpy as np
import pandas as pd

In [2]:
# load data from a tweet_emotions.csv.csv
data=pd.read_csv('/content/tweet_emotions.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
# Set the option to display the full text content of DataFrame columns

pd.set_option('display.max_colwidth',None)

In [5]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


In [6]:
data.shape

(40000, 3)

In [7]:
# Drop the 'tweet_id' column from the DataFrame

data = data.drop(columns=['tweet_id'])

In [8]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


In [9]:
data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [10]:
# Define a mapping to convert original emotions to a simplified set (joy, sadness, anger, fear, surprise)

emotion_mapping = {
    'empty': 'neutral',
    'sadness': 'sadness',
    'enthusiasm': 'joy',
    'neutral': 'neutral',
    'worry': 'fear',
    'surprise': 'surprise',
    'love': 'joy',
    'fun': 'joy',
    'hate': 'anger',
    'happiness': 'joy',
    'boredom': 'neutral',
    'relief': 'joy',
    'anger': 'anger'
}

In [11]:
# Replace the original sentiment labels with the mapped labels

data['sentiment'] = data['sentiment'].map(emotion_mapping)

In [12]:
data.head()

Unnamed: 0,sentiment,content
0,neutral,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,sadness,Funeral ceremony...gloomy friday...
3,joy,wants to hang out with friends SOON!
4,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


In [13]:
data['sentiment'].value_counts()

joy         13112
neutral      9644
fear         8459
sadness      5165
surprise     2187
anger        1433
Name: sentiment, dtype: int64

In [44]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [68]:
# Encode the target variable
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])
#y_encoded = label_encoder.fit_transform(data['sentiment'])
y = to_categorical(y)


In [69]:
data.head()

Unnamed: 0,sentiment,content,sentiment_encoded
0,neutral,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,3
1,sadness,Layin n bed with a headache ughhhh...waitin on your call...,4
2,sadness,Funeral ceremony...gloomy friday...,4
3,joy,wants to hang out with friends SOON!,2
4,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",3


### Text Preprocessing

In [14]:
#remove puctuations

import string
import re
def remove_punct(text):
    # Remove website addresses
    text = re.sub(r'http\S+|www\S+|\S+\.com\S+', '', text, flags=re.MULTILINE)

    # Remove words starting with '@'
    text = re.sub(r'@\w+', '', text)

    # Remove other punctuation
    return re.sub(r'[^\w\s]', '', text)

In [15]:
# Tokenize the given text into words

import nltk
nltk.download('punkt')
def tokenization(text):
  words = nltk.word_tokenize(text)
  return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Remove stopwords from the given text

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
  output = [i for i in text if i not in stopwords]
  return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Lemmatize each word in the given text

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemm = WordNetLemmatizer()
def lemm(text):
  lemm_text = [wordnet_lemm.lemmatize(word) for word in text]
  return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Preprocess the given data column

def preprocess(data_col):
  corpus = []
  for item in data_col:
    new_item = remove_punct(item)
    new_item = new_item.lower()
    new_item = tokenization(new_item)
    new_tem = remove_stopwords(new_item)
    new_item = lemm(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [19]:
corpus=preprocess(data['content'])

In [20]:
from sklearn.model_selection import train_test_split

In [70]:
train_data, test_data, train_labels, test_labels = train_test_split(
    corpus, data['sentiment_encoded'], test_size=0.2, random_state=42)

## Feature Extraction

####**Bag-of-Words**

In [71]:
# Transform training and test data into feature vectors

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
train_vectors = cv.fit_transform(train_data)
test_vectors =cv.transform(test_data)
x = train_vectors
y = train_labels


In [72]:
x.shape

(32000, 195509)

In [73]:
y.shape

(32000,)

## Model Building and Evaluation

In [28]:
from sklearn.metrics import accuracy_score, classification_report

### RandomForest

In [90]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x,y)

In [91]:
predictions = clf.predict(test_vectors)

In [92]:
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.44


### Logistic regression

In [74]:
# Train a Logistic Regression model on the features 'x' and target variable 'y' using scikit-learn's LogisticRegression.

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
predictions = lr.predict(test_vectors)

In [76]:
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.46


### XGBoost

In [77]:
import xgboost as xgb

In [78]:
xgb_clf=xgb.XGBClassifier(n_estimators = 100,max_depth=4,random_state=42)


In [79]:
xgb_clf.fit(x,y)

In [81]:
y_pred_xgb=xgb_clf.predict(test_vectors)

In [82]:
accuracy = accuracy_score(test_labels, y_pred_xgb)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.45


Since Logistic Regression achieved the highest accuracy, let's use it for classification

In [93]:
new_tweets = [
    "The weather is absolutely wonderful today!",
    "I can't believe how frustrating this traffic is.",
    "Just finished watching a fantastic movie. Highly recommend it!",
    "Feeling a bit under the weather today.",
    "Excited to try out the new restaurant in town."]


# Assuming 'cv' is your CountVectorizer and 'lr' is your Logistic Regression model
new_features = cv.transform(new_tweets)
new_predictions = lr.predict(new_features)

# Display predictions
for tweet, prediction in zip(new_tweets, new_predictions):
    print(f"Tweet: {tweet}\nPredicted Sentiment: {prediction}\n")

Tweet: The weather is absolutely wonderful today!
Predicted Sentiment: 2

Tweet: I can't believe how frustrating this traffic is.
Predicted Sentiment: 3

Tweet: Just finished watching a fantastic movie. Highly recommend it!
Predicted Sentiment: 2

Tweet: Feeling a bit under the weather today.
Predicted Sentiment: 3

Tweet: Excited to try out the new restaurant in town.
Predicted Sentiment: 2

