In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from string import punctuation

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report


In [9]:
df = pd.read_csv(r"data\training.1600000.processed.noemoticon.csv",
                 encoding="ISO-8859-1",
                 header=None)

df.columns = ['sentiment','id','date','query','user','text']


  df = pd.read_csv(r"data\training.1600000.processed.noemoticon.csv",


In [12]:
df.sample(10)


Unnamed: 0,sentiment,id,date,query,user,text
48517,0,1677893254,Sat May 02 04:32:27 PDT 2009,NO_QUERY,charleypearson,@angelashushan just broked
93867,0,1770833960,Mon May 11 22:22:43 PDT 2009,NO_QUERY,cedupre,Hates this french homework.
978375,1,1833830631,Mon May 18 00:36:33 PDT 2009,NO_QUERY,TamiLNorman,i hate season finales. they are so mean! goodn...
127060,0,1834648535,Mon May 18 03:43:35 PDT 2009,NO_QUERY,leishae,@michalabanas i'm so disappointed. guess that...
328970,0,2011026382,Tue Jun 02 18:21:29 PDT 2009,NO_QUERY,cheshireccatt,Just found out I'm going to need two surgeries
368365,0,2049486590,Fri Jun 05 16:58:02 PDT 2009,NO_QUERY,cherrydarlingg,"I've been fucked at handball, my leg hurts an..."
1020140,1,1882320445,Fri May 22 06:39:26 PDT 2009,NO_QUERY,gez_g,"@richard_hughes Thanks, Rich - so funny..I had..."
282992,0,1992784944,Mon Jun 01 09:19:03 PDT 2009,NO_QUERY,jesskontur,does NOT need a cast! And no splint.. Just can...
700914,0,2254909887,Sat Jun 20 10:28:21 PDT 2009,NO_QUERY,heatherfoxxx,well i've slept away half of my day. more sum...
39635,0,1573579538,Tue Apr 21 01:50:52 PDT 2009,NO_QUERY,sarahutabarat,Storm in Jakarta.... Gonna be MASSIVE traffic ...


In [13]:
df.sentiment.value_counts()

sentiment
0                     668925
1                     248576
0                     131071
polarity of tweet          1
Name: count, dtype: int64

In [14]:
df['sentiment'].unique()


array(['polarity of tweet\xa0', '0', 0, 1], dtype=object)

In [15]:
df.columns


Index(['sentiment', 'id', 'date', 'query', 'user', 'text'], dtype='str')

In [16]:
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [17]:
df = df.iloc[1:]


In [18]:
df = df.reset_index(drop=True)


In [19]:
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [20]:
df['sentiment'] = df['sentiment'].astype(int)


In [21]:
df.sentiment.unique()

array([0, 1])

In [22]:
df = df[['sentiment', 'text']]
df = df.drop_duplicates('text')


In [23]:
stemmer = SnowballStemmer("english")

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r"https?\S+|www\S+", " ", text)
    text = re.sub(r'\@\w+|\#', " ", text)
    text = re.sub(r"[^\w\s']", " ", text)
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

df['text'] = df['text'].apply(preprocess)


In [24]:
vectorizer = TfidfVectorizer(max_features=80000, ngram_range=(1,2))

X = vectorizer.fit_transform(df['text'])
y = df['sentiment']


In [25]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [26]:
import joblib

In [27]:
svm = LinearSVC(dual=False, class_weight='balanced')

classifier = CalibratedClassifierCV(svm)
classifier.fit(X_train, y_train)
# Save trained model
joblib.dump(classifier, "sentiment_model.pkl")

# Save TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and Vectorizer saved successfully!")


Model and Vectorizer saved successfully!


In [28]:
y_pred_test = classifier.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))


Test Accuracy: 0.8521747522841333
              precision    recall  f1-score   support

           0       0.87      0.94      0.91    118429
           1       0.75      0.57      0.65     36991

    accuracy                           0.85    155420
   macro avg       0.81      0.75      0.78    155420
weighted avg       0.84      0.85      0.84    155420



In [29]:
new_text = "I am very happy today"

new_vector = vectorizer.transform([new_text])
prediction = classifier.predict(new_vector)

if prediction[0] == 1:
    print("Positive Sentiment")
else:
    print("Negative Sentiment")


Positive Sentiment


In [31]:
new_text = "I should be much better than this"

new_vector = vectorizer.transform([new_text])
prediction = classifier.predict(new_vector)

if prediction[0] == 1:
    print("Positive Sentiment")
else:
    print("Negative Sentiment")


Negative Sentiment
