## DATA UNDERSTANDING

In [27]:
# Import the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.collections import *
from nltk import FreqDist, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_validate
import string
import re
import warnings
warnings.filterwarnings('ignore')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import Normalizer

In [12]:
# Load and read the dataset
data = pd.read_csv('../data_file/tweet_sentiments.csv', encoding='ISO-8859-1')
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


The dataset contains tweets along with their associated emotions and whether the emotion is directed at a brand or product. Here are the columns present in the dataset:

1. **tweet_text**: The text of the tweet.
2. **emotion_in_tweet_is_directed_at**: The brand or product the emotion is directed toward.
3. **is_there_an_emotion_directed_at_a_brand_or_product**: The sentiment of the tweet, indicating whether it's a positive or negative emotion.


In [13]:
# Examine the shape of the dataset
data.shape

(9093, 3)

In [14]:
# Describe the data
data.describe

<bound method NDFrame.describe of                                              tweet_text  \
0     .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1     @jessedee Know about @fludapp ? Awesome iPad/i...   
2     @swonderlin Can not wait for #iPad 2 also. The...   
3     @sxsw I hope this year's festival isn't as cra...   
4     @sxtxstate great stuff on Fri #SXSW: Marissa M...   
...                                                 ...   
9088                      Ipad everywhere. #SXSW {link}   
9089  Wave, buzz... RT @mention We interrupt your re...   
9090  Google's Zeiger, a physician never reported po...   
9091  Some Verizon iPhone customers complained their...   
9092  Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...   

     emotion_in_tweet_is_directed_at  \
0                             iPhone   
1                 iPad or iPhone App   
2                               iPad   
3                 iPad or iPhone App   
4                             Google   
...              

In [15]:
# Check for missing values
data.isnull().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [16]:
# Drop rows with missing values
data.dropna(inplace=True)

In [17]:
# cross checking there are no missing values
data.isnull().sum()

tweet_text                                            0
emotion_in_tweet_is_directed_at                       0
is_there_an_emotion_directed_at_a_brand_or_product    0
dtype: int64

In [20]:
# Check the distribution of sentiment classes
sentiment_distribution = data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()


#### Initial Data Cleaning

In [22]:
# Most Common Words: Identify the most frequent words used in the tweets.
# Remove punctuation and split words
from collections import Counter

all_words = ' '.join(data['tweet_text']).lower()
all_words = re.findall(r'\b\w+\b', all_words)
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(10)
most_common_words

[('sxsw', 3468),
 ('mention', 2278),
 ('the', 1806),
 ('to', 1324),
 ('link', 1210),
 ('ipad', 1160),
 ('at', 1088),
 ('apple', 1013),
 ('rt', 980),
 ('for', 962)]

In [33]:
# Function for text preprocessing
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    words = word_tokenize(text)
    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

## Modeling

### a. Binary classification

#### Data Preprocessing

In [34]:
# Encode the target variable
data['label'] = data['is_there_an_emotion_directed_at_a_brand_or_product'].apply(lambda x: 1 if x == 'Positive emotion' else 0)

# Select features and target
X = data['tweet_text']
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


#### Model selection

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train_vec, y_train)

# Make predictions
y_pred = model.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)


Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.87      0.10      0.19       124
           1       0.83      1.00      0.90       535

    accuracy                           0.83       659
   macro avg       0.85      0.55      0.55       659
weighted avg       0.83      0.83      0.77       659



i. The model achieved a good overall accuracy (83%), but there is a significant imbalance in performance between the two classes.
ii. The recall for the negative class (0) is very low (0.10), indicating the model struggles to identify negative sentiments.
iii. The model performs well on the positive class (1) with a high recall (1.00) and a decent F1-score (0.90).

### b. Random Forest Model

In [36]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train_vec, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.93      0.21      0.34       124
           1       0.84      1.00      0.91       535

    accuracy                           0.85       659
   macro avg       0.89      0.60      0.63       659
weighted avg       0.86      0.85      0.81       659



#### Hyperparameter Tuning

In [41]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           verbose=1,
                           n_jobs=-1)

# Fit the model
grid_search.fit(X_train_vec, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Use the best estimator to make predictions
best_rf_model = grid_search.best_estimator_
y_pred_tuned = best_rf_model.predict(X_test_vec)

# Evaluate the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
report_tuned = classification_report(y_test, y_pred_tuned)

print(f"Tuned Model Accuracy: {accuracy_tuned:.2f}")
print(report_tuned)


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Tuned Model Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.93      0.21      0.34       124
           1       0.84      1.00      0.91       535

    accuracy                           0.85       659
   macro avg       0.89      0.60      0.63       659
weighted avg       0.86      0.85      0.81       659



The tuned model shows improved accuracy compared to the Logistic Regression model(85%).
Precision for the negative class (0) is high (0.93), but recall is low (0.21), indicating that while the model is good at identifying true negatives, it struggles to detect them in general.
The positive class (1) has excellent recall (1.00) and a strong F1-score (0.91), showing that it identifies positive sentiments effectively.
Overall performance shows a significant imbalance, suggesting potential areas for further improvement, such as addressing class imbalance or using advanced techniques like SMOTE for oversampling.