<a href="https://colab.research.google.com/github/engige/NLP_Twitter_Sentiment_Analysis/blob/main/josepha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import pandas as pd

# Load the dataset to inspect its structure
tweet_data = pd.read_csv('tweet_sentiments.csv', encoding='ISO-8859-1')

# Display the first few rows of the dataset
tweet_data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [None]:
# Check for missing values and get a summary of the dataset
missing_values = tweet_data.isnull().sum()

# Check the distribution of sentiment classes
sentiment_distribution = tweet_data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

# Display the missing values and sentiment distribution
print("Missing Values:")
print(missing_values)

print("\nSentiment Distribution:")
print(sentiment_distribution)

Missing Values:
tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

Sentiment Distribution:
is_there_an_emotion_directed_at_a_brand_or_product
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64


In [None]:
# Drop rows with missing tweet_text and drop the column 'emotion_in_tweet_is_directed_at' as it is not necessary for sentiment analysis
cleaned_tweet_data = tweet_data.dropna(subset=['tweet_text']).drop(columns=['emotion_in_tweet_is_directed_at'])

# Display the cleaned dataset for further inspection
cleaned_tweet_data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [None]:
# Cross-checking that there are no missing values
missing_values = cleaned_tweet_data.isnull().sum()

missing_values

Unnamed: 0,0
tweet_text,0
is_there_an_emotion_directed_at_a_brand_or_product,0


In [None]:
# Import necessary libraries for text preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    words = word_tokenize(text)
    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply the preprocessing function to the 'tweet_text' column
cleaned_tweet_data['cleaned_text'] = cleaned_tweet_data['tweet_text'].apply(preprocess_text)

# Display the first few rows of the preprocessed data
cleaned_tweet_data[['tweet_text', 'cleaned_text']].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,tweet_text,cleaned_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,wesley g iphone hr tweeting riseaustin dead ne...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,jessedee know fludapp awesome ipadiphone app y...
2,@swonderlin Can not wait for #iPad 2 also. The...,swonderlin wait ipad also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,sxsw hope year festival isnt crashy year iphon...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,sxtxstate great stuff fri sxsw marissa mayer g...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the TF-IDF Vectorizer with a maximum of 5000 features and stop words removed
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_tweet_data['cleaned_text'])

# Display the shape of the resulting TF-IDF matrix
X_tfidf.shape

(9092, 5000)

In [None]:
# Convert the sparse TF-IDF matrix to a dense array
tfidf_sample = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the first 5 rows and first 10 columns of the TF-IDF matrix
print(tfidf_sample.iloc[:5, :10])

   aapl   ab  abc  ability  able  absolutely  absolutley  abt  abuzz  academy
0   0.0  0.0  0.0      0.0   0.0         0.0         0.0  0.0    0.0      0.0
1   0.0  0.0  0.0      0.0   0.0         0.0         0.0  0.0    0.0      0.0
2   0.0  0.0  0.0      0.0   0.0         0.0         0.0  0.0    0.0      0.0
3   0.0  0.0  0.0      0.0   0.0         0.0         0.0  0.0    0.0      0.0
4   0.0  0.0  0.0      0.0   0.0         0.0         0.0  0.0    0.0      0.0


In [None]:
from sklearn.model_selection import train_test_split

# Filter relevant rows and map sentiment labels to numerical categories
multiclass_data = cleaned_tweet_data[cleaned_tweet_data['is_there_an_emotion_directed_at_a_brand_or_product'].isin(['Positive emotion', 'Negative emotion', 'No emotion toward brand or product'])]

# Re-apply the TF-IDF transformation on the filtered data
X_tfidf_multiclass = tfidf_vectorizer.transform(multiclass_data['cleaned_text'])

# Define the target variable (sentiment)
y_multiclass = multiclass_data['is_there_an_emotion_directed_at_a_brand_or_product'].map({
    'Positive emotion': 1,
    'Negative emotion': 0,
    'No emotion toward brand or product': 2
})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_multiclass, y_multiclass, test_size=0.2, random_state=42, stratify=y_multiclass)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (7148, 5000) (7148,)
Testing set shape: (1788, 5000) (1788,)


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Check the new class distribution after SMOTE balancing
y_train_smote_series = pd.Series(y_train_smote)
print(y_train_smote_series.value_counts())

is_there_an_emotion_directed_at_a_brand_or_product
2    4310
1    4310
0    4310
Name: count, dtype: int64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# Set up Random Forest with hyperparameter tuning for multiclass classification
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_smote, y_train_smote)

# Best parameters and model evaluation
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluate the model
print("Best Parameters:", grid_search.best_params_)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 3 folds for each of 24 candidates, totalling 72 fits


  pid = os.fork()
  pid = os.fork()


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.6929530201342282
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.27      0.39       114
           1       0.63      0.52      0.57       596
           2       0.72      0.83      0.77      1078

    accuracy                           0.69      1788
   macro avg       0.67      0.54      0.58      1788
weighted avg       0.69      0.69      0.68      1788



In [None]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Multinomial Naive Bayes model
nb = MultinomialNB()

# Fit the model to the training data
nb.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = nb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['Negative', 'Positive', 'Neutral'])

# Display the evaluation metrics
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 0.610738255033557
Classification Report:
               precision    recall  f1-score   support

    Negative       0.28      0.59      0.38       114
    Positive       0.52      0.69      0.59       596
     Neutral       0.80      0.57      0.67      1078

    accuracy                           0.61      1788
   macro avg       0.54      0.62      0.55      1788
weighted avg       0.68      0.61      0.62      1788



In [None]:
# Define the hyperparameter grid for alpha (smoothing parameter)
param_grid = {'alpha': [0.01, 0.1, 0.5, 1, 10]}

# Initialize the Multinomial Naive Bayes model
nb = MultinomialNB()

# Set up GridSearchCV to find the best alpha value
grid_search = GridSearchCV(nb, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_smote, y_train_smote)

# Get the best model from GridSearchCV
best_nb = grid_search.best_estimator_

# Make predictions using the best model
y_pred = best_nb.predict(X_test)

# Evaluate the tuned model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['Negative', 'Positive', 'Neutral'])

# Display the best parameters and evaluation metrics
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'alpha': 0.01}
Accuracy: 0.6286353467561522
Classification Report:
               precision    recall  f1-score   support

    Negative       0.35      0.44      0.39       114
    Positive       0.52      0.64      0.57       596
     Neutral       0.76      0.64      0.70      1078

    accuracy                           0.63      1788
   macro avg       0.54      0.57      0.55      1788
weighted avg       0.65      0.63      0.64      1788



In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# Initialize the Naive Bayes and Random Forest models
nb = MultinomialNB(alpha=0.01)  # Tuned Naive Bayes with best alpha
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)  # Tuned Random Forest

# Create a Voting Classifier (soft voting)
voting_clf = VotingClassifier(estimators=[('rf', rf), ('nb', nb)], voting='soft')

# Fit the Voting Classifier on the SMOTE-balanced training data
voting_clf.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = voting_clf.predict(X_test)

# Evaluate the ensemble model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['Negative', 'Positive', 'Neutral'])

# Display the evaluation metrics
print(f"Ensemble Model Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Ensemble Model Accuracy: 0.685682326621924
Classification Report:
               precision    recall  f1-score   support

    Negative       0.53      0.41      0.46       114
    Positive       0.59      0.60      0.60       596
     Neutral       0.75      0.76      0.75      1078

    accuracy                           0.69      1788
   macro avg       0.62      0.59      0.61      1788
weighted avg       0.68      0.69      0.68      1788



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the hyperparameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200],   # Number of trees
    'max_depth': [3, 6, 10],      # Maximum depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'subsample': [0.7, 1.0],      # Subsample ratio of the training instance
    'colsample_bytree': [0.7, 1.0] # Subsample ratio of features for each tree
}

# Initialize the XGBoost model
xgb = XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)

# Set up GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(xgb, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_smote, y_train_smote)

# Best parameters and model evaluation
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['Negative', 'Positive', 'Neutral'])

# Display the best parameters and evaluation metrics
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200, 'subsample': 1.0}
Accuracy: 0.6812080536912751
Classification Report:
               precision    recall  f1-score   support

    Negative       0.52      0.33      0.41       114
    Positive       0.61      0.50      0.55       596
     Neutral       0.72      0.82      0.76      1078

    accuracy                           0.68      1788
   macro avg       0.62      0.55      0.57      1788
weighted avg       0.67      0.68      0.67      1788

