# Loading and Sampling the Tweets Dataset

Dataset by my Instructor Amusa Abdulahi Tomisin from Nigeria

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

df = pd.read_csv('../data/Tweets.csv')
df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment
13604,055fe824e5,ended up face to face with a bear on 181 driving home this evening. Sadly he took off into the woods before i hauled out my cameraphone,Sadly,negative
23163,d3f2f25252,Aural goodness,goodness,positive
25002,9121ad6c21,yay back at home,yay back at home,neutral
8983,812e066eed,_Souljaa I couldn`t eat 2,_Souljaa I couldn`t eat 2,neutral
5171,3e1f7acd3c,i don`t know what lender it was? yea these people definitely sucked butt toes. NOT friendly or helpful.,i don`t know what lender it was? yea these people definitely sucked butt toes. NOT friendly or helpful.,negative
569,03f9f6f798,"I don`t think I`ve ever been so tierd in my life.Ugh,goodnight.So sleeping in tomorrow",I don`t think I`ve ever been so tierd in my life.U,negative
11835,d454289ce9,"thx! i became IBM Master Inventor in 2008. really enjoyed your blog, particularly with the vegan tag",thx!,positive
847,e3d78da8b9,"Okay, make sure he`s alright kk? Cuidalo. Let him know he`s a got friend in us aha..","Okay, make sure he`s alright kk? Cuidalo. Let him know he`s a got friend in us aha..",neutral
12231,94495f1901,HAPPY MOTHER`S DAY!!!!!,HAPPY,positive
18311,479141acc5,"actually, web works fine. tweetdeck keeps crashing i`m in NJ today avoiding nascar","actually, web works fine. tweetdeck keeps crashing i`m in NJ today avoiding nascar",neutral


# Text Cleaning and Preprocessing

- Convert to lowercase
- Remove URLs
- Remove usernames starting with '@'
- Remove hashtags starting with '#'
- Remove non-alphabetic characters
- Lemmatize words
- remove 1 character
- Remove stopwords
- Remove duplicate words while preserving the order

In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    if text and isinstance(text, str):
        text = re.sub(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]', ' ', text.lower())
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if len(word) > 1 and word not in stopwords.words('english')])
        text = ' '.join(list(dict.fromkeys(text.split())))
    else:
        text = ''
    return text

df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']].sample(10)

Unnamed: 0,text,clean_text
26915,it`s my fave!,fave
27443,Yes! I love him. I have seen the eps so many time that I quote his lines with him.,yes love seen eps many time quote line
21964,"great. my mom is pissed at me, so she sent me to the asbestos filled backroom",great mom pissed sent asbestos filled backroom
7564,"mmmmmm, late night Brusters ice cream! om nom nom nom",mmmmmm late night brusters ice cream om nom
12724,Go the bubble bath!!! Always relaxing.......,go bubble bath always relaxing
22505,nothing right now to do,nothing right
4120,I was going to hidden-file it and you`re thinking about youtube? SO NOTT! Lol. Hey put the phone down and more swimming,going hidden file thinking youtube nott lol hey put phone swimming
22067,"I like the idea of eliminating bludgers and beaters just tackling ppl, makes it nearly as dangerous as book quidditch",like idea eliminating bludgers beater tackling ppl make nearly dangerous book quidditch
5390,"Will be going to Indiana Baptist Sunday, Pray for summer missionaries...",going indiana baptist sunday pray summer missionary
12249,"don`t frown my lil aussie, I still love you! *muah*",frown lil aussie still love muah


# Sentiment Labeling with TextBlob Analysis

In [4]:
from textblob import TextBlob

def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

df['textblob_polarity'] = df['clean_text'].apply(get_sentiment).round(2)

def categorize_sentiment(score):
    if score >= 0.5:
        return 'Positive'
    elif score >= 0.05 and score < 0.5:
        return 'Moderately Positive'
    elif score > -0.05 and score < 0.05:
        return 'Neutral'
    elif score > -0.5 and score <= -0.05:
        return 'Moderately Negative'
    else:
        return 'Negative'

df['sentiment_textblob'] = df['textblob_polarity'].apply(categorize_sentiment)

df[['clean_text', 'textblob_polarity', 'sentiment_textblob']].sample(10)

Unnamed: 0,clean_text,textblob_polarity,sentiment_textblob
10161,good monday morning everyone hope week successful start,0.72,Positive
12022,feel owe listen new album everything released chain since nice twitter,0.37,Moderately Positive
3370,see used organ shop let know sign ejamming com rock,0.0,Neutral
2341,im cleaning listening fiona apple birthday party cant believe already,0.0,Neutral
489,thats another sponsor,0.0,Neutral
3336,sleep wake soooo early,0.1,Moderately Positive
23348,good luck tomorrow,0.7,Positive
20575,fight night demo load keep crashing first screen time delete download think,0.25,Moderately Positive
22225,sweet something new show,0.24,Moderately Positive
22474,okay,0.5,Positive


# Sentiment Distribution Analysis

In [5]:
sentiment_counts = df['sentiment_textblob'].value_counts()
sentiment_counts

Neutral                10418
Moderately Positive     7694
Positive                4551
Moderately Negative     3139
Negative                1679
Name: sentiment_textblob, dtype: int64

# Balancing Sentiment Classes using Random Oversampling

In [6]:
from imblearn.over_sampling import RandomOverSampler

labels = df['sentiment_textblob']
features = df['clean_text']

oversampler = RandomOverSampler(random_state=42)
features_balanced, labels_balanced = oversampler.fit_resample(features.values.reshape(-1, 1), labels)
balanced_df = pd.DataFrame({'clean_text': features_balanced.flatten(), 'sentiment_textblob': labels_balanced})
balanced_sentiment_counts = balanced_df['sentiment_textblob'].value_counts()

print(balanced_sentiment_counts)

Neutral                10418
Negative               10418
Positive               10418
Moderately Positive    10418
Moderately Negative    10418
Name: sentiment_textblob, dtype: int64


# Text Feature Extraction with CountVectorizer

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

features = balanced_df['clean_text']
labels = balanced_df['sentiment_textblob']

vectorizer = CountVectorizer(ngram_range=(1, 3))
features_transformed = vectorizer.fit_transform(features)
features_sparse = csr_matrix(features_transformed)
features_df = pd.DataFrame.sparse.from_spmatrix(features_sparse, columns=vectorizer.get_feature_names_out())
data = pd.concat([features_df, labels], axis=1)

data.sample(10)

Unnamed: 0,aa,aa working,aa working double,aaa,aaa new,aaa new follower,aaaa,aaaa cant,aaaa cant go,aaaa need,...,zzzz taking mom,zzzzy,zzzzy office,zzzzy office alone,zzzzzzz,zzzzzzz goodnight,zzzzzzz goodnight tweet,zzzzzzzzzzzzzzz,zzzzzzzzzzzzzzz boring,sentiment_textblob
16016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Positive
20788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neutral
22044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Moderately Positive
17449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neutral
26722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Positive
30788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Moderately Negative
21742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Moderately Positive
34853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Moderately Positive
44617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Negative
18969,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neutral


# Train-Test Split for Sentiment Classification

In [8]:
from sklearn.model_selection import train_test_split

features = data.drop('sentiment_textblob', axis=1) 
labels = data['sentiment_textblob']
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

print("Train Features Shape:", train_features.shape)
print("Train Labels Shape:", train_labels.shape)
print("Test Features Shape:", test_features.shape)
print("Test Labels Shape:", test_labels.shape)

Train Features Shape: (41672, 268683)
Train Labels Shape: (41672,)
Test Features Shape: (10418, 268683)
Test Labels Shape: (10418,)


# Sentiment Classification with Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr_classifier = LogisticRegression()
lr_classifier.fit(train_features, train_labels)
predictions = lr_classifier.predict(test_features)
report = classification_report(test_labels, predictions)

print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                     precision    recall  f1-score   support

Moderately Negative       0.96      0.97      0.96      2139
Moderately Positive       0.95      0.85      0.90      2033
           Negative       0.97      1.00      0.99      2054
            Neutral       0.93      0.93      0.93      2097
           Positive       0.92      0.98      0.95      2095

           accuracy                           0.95     10418
          macro avg       0.95      0.95      0.95     10418
       weighted avg       0.95      0.95      0.95     10418



# Sentiment Prediction using Logistic Regression Model

In [23]:
new_sentence = input("Enter the text for sentiment prediction: ")

cleaned_sentence = clean_text(new_sentence)
new_sentence_features = vectorizer.transform([cleaned_sentence])
probabilities = lr_classifier.predict_proba(new_sentence_features)[0]

sentiment_categories = ['Negative', 'Moderately Negative', 'Neutral', 'Moderately Positive', 'Positive']
sorted_sentiments = sorted(zip(sentiment_categories, probabilities), key=lambda x: x[1], reverse=True)

print("Your sentence:", new_sentence)
print("Predicted sentiment:", sorted_sentiments[0][0])

print("\nSentiment Probabilities:")
for sentiment, probability in sorted_sentiments:
    print(sentiment + ":", f"{probability:.2f}%")

Enter the text for sentiment prediction: do you love me?
Your sentence: do you love me?
Predicted sentiment: Positive

Sentiment Probabilities:
Positive: 0.87%
Moderately Negative: 0.09%
Moderately Positive: 0.04%
Negative: 0.00%
Neutral: 0.00%


# Saving and Loading Vectorizer and Classifier Models

In [None]:
import pickle

#Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the classifier
with open('classifier.pkl', 'wb') as f:
    pickle.dump(lr_classifier, f)

# Load the vectorizer
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Load the classifier
with open('classifier.pkl', 'rb') as f:
    lr_classifier = pickle.load(f)

# Fine-tuning and Optimization

To improve the accuracy, you can consider the following steps in your fine-tuning and optimization process:

- <b>Hyperparameter Tuning:</b> Experiment with different hyperparameter values of your chosen model. Utilize techniques like grid search or random search to find the optimal combination of hyperparameters that yield the best performance.

- <b>Feature Engineering:</b> Explore additional features or transform existing features to enhance the predictive power of your model. This could involve creating new features, scaling or normalizing features, or using domain-specific knowledge to extract meaningful information from the data.

- <b>Model Selection:</b> Consider trying out different models or algorithms to find the one that best fits your sentiment classification task. Evaluate and compare the performance of various models such as logistic regression, decision trees, random forests, support vector machines, or neural networks.

- <b>Data Augmentation:</b> Increase the diversity and size of your training data by applying data augmentation techniques. This could involve techniques like oversampling, undersampling, synthetic data generation, or utilizing pre-trained language models for text augmentation.

By carefully tuning hyperparameters, engineering relevant features, exploring different models, and augmenting your data, you can enhance the accuracy and performance of your sentiment classification model.

# GridSearchCV for Hyperparameter Tuning in Logistic Regression

In [24]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression

# param_grid = {'C': [0.1, 0.5, 1.0, 2.0, 5.0]}

# logreg_classifier = LogisticRegression()

# grid_search = GridSearchCV(estimator=logreg_classifier, param_grid=param_grid, cv=5)
# grid_search.fit(train_features, train_labels)

# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

In [26]:
# best_C = best_params['C']

# logreg_classifier_best = LogisticRegression(C=best_C)
# logreg_classifier_best.fit(train_features, train_labels)

# Evaluation of Logistic Regression Model with Best Hyperparameters

In [27]:
# from sklearn.metrics import classification_report, confusion_matrix

# predictions_best = logreg_classifier_best.predict(test_features)
# classification_report = classification_report(test_labels, predictions_best)
# confusion_mat = confusion_matrix(test_labels, predictions_best)

# print("Evaluation Report:")
# print(classification_report)

# print("Confusion Matrix:")
# print(confusion_mat)

# Sentiment Prediction using Logistic Regression Model

In [28]:
# new_sentence = input("Enter the text for sentiment prediction: ")

# cleaned_sentence = clean_text(new_sentence)
# new_sentence_features = vectorizer.transform([cleaned_sentence])
# probabilities_best = logreg_classifier_best.predict_proba(new_sentence_features)[0]

# sentiment_categories = ['Negative', 'Moderately Negative', 'Neutral', 'Moderately Positive', 'Positive']
# sorted_sentiments = sorted(zip(sentiment_categories, probabilities_best), key=lambda x: x[1], reverse=True)

# print("Your sentence:", new_sentence)
# print("Predicted sentiment:", sorted_sentiments[0][0])

# print("\nSentiment Probabilities:")
# for sentiment, probabilities_best in sorted_sentiments:
#     print(sentiment + ":", f"{probabilities_best:.2f}%")

# Saving and Loading Classifier Models

In [24]:
# import pickle

# # Save the classifier
# with open('logreg_classifier_best.pkl', 'wb') as f:
#     pickle.dump(logreg_classifier_best, f)

# # Load the classifier
# with open('logreg_classifier_best.pkl', 'rb') as f:
#     logreg_classifier_best = pickle.load(f)