Load the CSV

In [20]:
import pandas as pd

train_df = pd.read_csv('/Users/gnanreddybobba/Desktop/twitter_sentiment_analysis/twitter_training.csv', header=None)

train_df.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [21]:
train_df.columns = ['ID', 'Entity', 'Sentiment', 'Tweet']

In [22]:
train_df.head()

Unnamed: 0,ID,Entity,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [23]:
train_df.shape

(74682, 4)

In [24]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74682 non-null  int64 
 1   Entity     74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   Tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [25]:
print("\nSentiment class distribution:")
print(train_df['Sentiment'].value_counts())


Sentiment class distribution:
Sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [26]:
train_df.sample(10)

Unnamed: 0,ID,Entity,Sentiment,Tweet
60062,3493,Facebook,Irrelevant,She is a. Want More? dirtibook.com.
72268,11181,TomClancysGhostRecon,Neutral,"This punk thought he just was sneaky, he trapp..."
33483,6548,Fortnite,Neutral,"""Fortnite BUT its best<unk> NOOBS"" youtube.com..."
12569,8559,NBA2K,Positive,you would be the best addition 2k has ever rec...
56420,11282,TomClancysRainbowSix,Negative,"@ INTERRO Ik you probably don't care, but you ..."
71959,11126,TomClancysGhostRecon,Neutral,It is not the first time that the EU Commissio...
67488,7158,johnson&johnson,Negative,Many boys and men who used the medication deve...
50862,6330,FIFA,Negative,@EAHelp you rats really removed the option to ...
39856,1242,Battlefield,Neutral,Fair play I ’ m doing pretty shit on
54836,2217,CallOfDuty,Negative,People don't care.


In [27]:
train_df['Sentiment'] = train_df['Sentiment'].replace('Irrelevant', 'Neutral')

train_df['Sentiment'].value_counts()

Sentiment
Neutral     31308
Negative    22542
Positive    20832
Name: count, dtype: int64

In [28]:
train_df.isnull().sum()

ID             0
Entity         0
Sentiment      0
Tweet        686
dtype: int64

remove the missing values

In [29]:
train_df = train_df.dropna(subset=['Tweet'])
train_df.isnull().sum()

ID           0
Entity       0
Sentiment    0
Tweet        0
dtype: int64

In [30]:
import re
import string

In [31]:
def clean_tweet(text):
    text = text.lower()                                      # Lowercase
    text = re.sub(r'@[\w_]+', '', text)                      # Remove @mentions
    text = re.sub(r'#\w+', '', text)                         # Remove hashtags
    text = re.sub(r'http\S+|www.\S+', '', text)              # Remove URLs
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)                          # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()                 # Remove extra spaces
    return text

In [32]:
train_df['Clean_Tweet'] = train_df['Tweet'].apply(clean_tweet)

# Optional: Preview a few cleaned tweets
train_df[['Tweet', 'Clean_Tweet']].sample(5)


Unnamed: 0,Tweet,Clean_Tweet
18809,Months later I reinstalled version 3 to Check ...,months later i reinstalled version to check if...
12503,Ayy or choc! Represent... @AEClan2K<unk> R,ayy or choc represent unk r
10319,"Xbox Series with HD look: Fast, powerful and u...",xbox series with hd look fast powerful and unp...
50721,Dawggggg bernado silva beat my defender in the...,dawggggg bernado silva beat my defender in the...
70824,@GhostRecon,


In [33]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gnanreddybobba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

train_df['Clean_Tweet'] = train_df['Clean_Tweet'].apply(remove_stopwords)
# Optional: Preview a few cleaned tweets after stopword removal
train_df[['Tweet', 'Clean_Tweet']].sample(5)

Unnamed: 0,Tweet,Clean_Tweet
9334,I might have to get an Xbox exclusive Series F...,might get xbox exclusive series five x cheap f...
52084,And I finally have finished playing the origin...,finally finished playing original red dead red...
11218,Thank you.. Now focus on me making the one gre...,thank focus making one greatest game please
54528,@Treyarch Call of Duty Black Ops 3 is the wors...,call duty black ops worst fucking game ever ga...
53361,wrong. this good red dead redemption,wrong good red dead redemption


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Step 1: Define X and y
X = train_df['Clean_Tweet']
y = train_df['Sentiment_Label']

# Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Step 3: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)


KeyError: 'Sentiment_Label'

In [36]:
# Map sentiment labels to numeric values
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
train_df['Sentiment_Label'] = train_df['Sentiment'].map(label_map)

# Verify the new column
train_df[['Sentiment', 'Sentiment_Label']].head()

Unnamed: 0,Sentiment,Sentiment_Label
0,Positive,2
1,Positive,2
2,Positive,2
3,Positive,2
4,Positive,2


In [37]:
X = train_df['Clean_Tweet']
y = train_df['Sentiment_Label']

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Split into train/test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)
from sklearn.naive_bayes import MultinomialNB

# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)


In [38]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Predict on test data
y_pred = model.predict(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6893918918918919

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.68      0.70      4380
           1       0.67      0.75      0.71      6301
           2       0.70      0.61      0.65      4119

    accuracy                           0.69     14800
   macro avg       0.69      0.68      0.68     14800
weighted avg       0.69      0.69      0.69     14800


Confusion Matrix:
 [[2984 1081  315]
 [ 816 4720  765]
 [ 398 1222 2499]]


In [None]:
# using logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the model
lr_model = LogisticRegression(max_iter=200)  # Increased max_iter just in case

# Train the model
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.7205405405405405

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73      4380
           1       0.71      0.76      0.73      6301
           2       0.71      0.66      0.69      4119

    accuracy                           0.72     14800
   macro avg       0.72      0.71      0.72     14800
weighted avg       0.72      0.72      0.72     14800


Confusion Matrix:
 [[3151  925  304]
 [ 719 4795  787]
 [ 350 1051 2718]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# using svm

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the SVM model
svm_model = LinearSVC()

# Train the model
svm_model.fit(X_train, y_train)

# Predict on test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

SVM Accuracy: 0.7333783783783784

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.75      0.75      4380
           1       0.73      0.75      0.74      6301
           2       0.72      0.69      0.70      4119

    accuracy                           0.73     14800
   macro avg       0.73      0.73      0.73     14800
weighted avg       0.73      0.73      0.73     14800


Confusion Matrix:
 [[3278  792  310]
 [ 739 4743  819]
 [ 331  955 2833]]


In [40]:
# save the model

import joblib

# Save the trained SVM model
joblib.dump(svm_model, 'svm_sentiment_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [41]:
# To load later:
model = joblib.load('svm_sentiment_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')


In [42]:
# example usage
def predict_sentiment(tweet):
    # Clean the tweet
    cleaned_tweet = clean_tweet(tweet)
    
    # Vectorize the tweet
    vectorized_tweet = vectorizer.transform([cleaned_tweet])
    
    # Predict sentiment
    prediction = model.predict(vectorized_tweet)
    
    # Map back to original sentiment labels
    sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    
    return sentiment_map[prediction[0]]
# Test the function
print(predict_sentiment("I love this product! It's amazing."))
print(predict_sentiment("This is the worst experience I've ever had."))
print(predict_sentiment("The service was okay, nothing special."))
# Test with a tweet that contains a URL and mentions
print(predict_sentiment("Check out this link: http://example.com @user"))
# Test with a tweet that contains numbers and punctuation
print(predict_sentiment("I bought 2 tickets for the concert!!! Can't wait!"))
# Test with a tweet that contains emojis
print(predict_sentiment("😊 This is so exciting! 🎉"))
# Test with a tweet that contains a hashtag
print(predict_sentiment("I love #Python programming!"))
# Test with an empty tweet
print(predict_sentiment(""))
# Test with a tweet that contains only stopwords
print(predict_sentiment("is the in of to"))
# Test with a tweet that contains slang or abbreviations
print(predict_sentiment("This is lit! Can't even!"))
# Test with a tweet that contains mixed languages
print(predict_sentiment("Me encanta este producto! It's amazing."))
# Test with a tweet that contains special characters
print(predict_sentiment("I love this product! @#$%^&*()"))
# Test with a tweet that contains a mix of positive and negative sentiments
print(predict_sentiment("I love this product, but the service was terrible."))
# Test with a tweet that contains sarcasm
print(predict_sentiment("Oh great, another product that doesn't work."))
# Test with a tweet that contains a question
print(predict_sentiment("Is this product really as good as they say?"))
# Test with a tweet that contains a quote
print(predict_sentiment("As Shakespeare said, 'All the world's a stage.'"))
# Test with a tweet that contains a call to action
print(predict_sentiment("Check out this amazing product! You won't regret it."))
# Test with a tweet that contains a review
print(predict_sentiment("This product is fantastic! I highly recommend it."))
# Test with a tweet that contains a complaint
print(predict_sentiment("I'm really disappointed with this product."))
# Test with a tweet that contains a compliment
print(predict_sentiment("This is the best product I've ever used!"))
# Test with a tweet that contains a suggestion
print(predict_sentiment("I think they should improve the packaging."))

Positive
Neutral
Neutral
Neutral
Neutral
Positive
Neutral
Neutral
Neutral
Positive
Positive
Neutral
Neutral
Neutral
Positive
Neutral
Neutral
Positive
Neutral
Neutral
Positive
