In [1]:
import zipfile
import pandas as pd
from nltk.corpus import stopwords
import re
# from nltk.stem import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# Read Training Data
Data source: https://www.kaggle.com/datasets/kazanova/sentiment140  

Columns:
1. target:   The polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
2. ids:      The id of the tweet
3. date:     The date of the tweet (Sat May 16 23:58:44 UTC 2009)
4. flag:     The query (lyx). If there is no query, then this value is NO_QUERY.
5. user:     The user that tweeted
6. text:     The text of the tweet

In [2]:
archive = zipfile.ZipFile("data.zip", "r")
data = pd.read_csv(archive.open("train_binary.csv"), header=None, encoding_errors="replace")
# renaming columns for easier access
data.columns = ["label", "id", "date", "flag", "user", "raw_text"]
data

Unnamed: 0,label,id,date,flag,user,raw_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


Number of tweets per label:

In [3]:
data["label"].value_counts()

0    800000
4    800000
Name: label, dtype: int64

There are no neutral labelled tweets.
So we change the labels to binary form.
(0 = negative
1 = positive)

In [4]:
data.loc[data["label"]==4, "label"] = 1

For model training we only need the label and text columns

In [5]:
# data = data.drop([1,2,3,4], axis=1)
# data = data.drop(["textID","selected_text"], axis=1)
# data.columns = ["raw_text","label"]

# data.columns = ["label","raw_text"]
# data.head()

# Data Preprocessing

Turning everything into lowercase characters

In [5]:
data["text"] = [entry.lower() for entry in data["raw_text"]]
data.head()

Unnamed: 0,label,id,date,flag,user,raw_text,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@kenichan i dived many times for the ball. man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all...."


Removing Stopwords (Common words like "my", "he", "is", ...)

In [6]:
stop = stopwords.words('english')
# we think 'no' and 'not' might be important words for the sentiment and don't want them to be removed
stop.remove("no")
stop.remove("not")

# data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

Removing links, tags and several punctuations from tweets using regular expressions

In [7]:
data["text"] = data["text"].apply(lambda x: re.sub("http[s]?://\S+", "", x))
data["text"] = data["text"].apply(lambda x: re.sub("@\S+", "", x))
data["text"] = data["text"].apply(lambda x: re.sub("-|\.|,|'|\?|\!|`|\*", "", x))

data.head()

Unnamed: 0,label,id,date,flag,user,raw_text,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats a bummer you shoulda got david ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sa...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i he...


Word-Stemming

In [8]:
# sstemmer = SnowballStemmer("english")
# pstemmer = PorterStemmer()

# data['text'] = data['text'].apply(lambda x: ' '.join([sstemmer.stem(word) for word in x.split()]))

Tokenize Words

In [9]:
# This process takes a few minutes. So you only need to do it once, save it to a csv-file and read that file
### comment these lines after running once:
# data["text"] = [str(word_tokenize(entry)) for entry in data["text"]]
# data.to_csv('tokenized_data.csv', index=False)
###

data = pd.read_csv('tokenized_data.csv')

Our finished data:

In [10]:
data.head()

Unnamed: 0,label,id,date,flag,user,raw_text,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","['awww', 'thats', 'a', 'bummer', 'you', 'shoul..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"['is', 'upset', 'that', 'he', 'cant', 'update'..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"['i', 'dived', 'many', 'times', 'for', 'the', ..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"['my', 'whole', 'body', 'feels', 'itchy', 'and..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","['no', 'its', 'not', 'behaving', 'at', 'all', ..."


# Model Training

In [11]:
stop = stopwords.words('english')
# we think 'no' and 'not' might be important words for the sentiment and don't want them to be removed
stop.remove("no")
stop.remove("not")

In [12]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data["text"], data["label"], test_size=0.2, random_state=42)

In [13]:
Tfidf_vect = TfidfVectorizer(analyzer="word", strip_accents="unicode", stop_words=stop, min_df=10)
Data_Tfidf = Tfidf_vect.fit_transform(data["text"])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

In [None]:
print("# of Features:", len(Tfidf_vect.get_feature_names_out()))

# of Features: 35019


In [16]:
model_selection.cross_validate(naive_bayes.MultinomialNB(), Data_Tfidf, data["label"], cv=10)

{'fit_time': array([1.16970515, 1.03002501, 0.86024332, 0.78069043, 0.90279222,
        0.95077801, 1.25592923, 0.99169946, 0.99595642, 0.9945848 ]),
 'score_time': array([0.06500053, 0.07131934, 0.03577995, 0.0343821 , 0.07000065,
        0.0598042 , 0.07800317, 0.0742836 , 0.07038307, 0.06745005]),
 'test_score': array([0.762425  , 0.7641    , 0.752925  , 0.764     , 0.76579375,
        0.7563625 , 0.76333125, 0.77474375, 0.7667125 , 0.759575  ])}

In [17]:
# Naive Bayes Classifier
NB = naive_bayes.MultinomialNB()
NB.fit(X_train_Tfidf, y_train)

predictions_NB = NB.predict(X_test_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes Accuracy Score ->  76.90249999999999


In [18]:
predictions_NB

array([1, 0, 1, ..., 1, 0, 0], dtype=int64)

In [19]:
# Support Vector Machine Classifier
SVM = svm.SVC(C=1, kernel='poly', degree=3, gamma='auto', cache_size= 2000, max_iter=10000, decision_function_shape="ovo")
SVM.fit(X_train_Tfidf, y_train)

predictions_SVM = SVM.predict(X_test_Tfidf)
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, y_test)*100)

# Test on self-labelled game related tweets

In [None]:
testdata = pd.read_excel("testdata_games.xlsx")
testdata = testdata[["text", "multiclass_label"]]
testdata = testdata.loc[testdata["multiclass_label"] != 1]
testdata = testdata.dropna()
testdata["multiclass_label"] = testdata["multiclass_label"].astype(int)
testdata.loc[testdata["multiclass_label"]==2, "multiclass_label"] = 1

testdata

Unnamed: 0,text,multiclass_label
0,"Okay, someone explain to me why Fall Guys and ...",0
4,@ItsCsteph @JAAY_ROCK_ Sea of Thieves let's yo...,1
6,Multiplayer pirate adventure Sea of Thieves at...,1
7,Sea of Thieves Stream Friday probably for the ...,1
9,"1,000 FOLLOWERS! 😲\n\nThank you all so much!\n...",1
...,...,...
489,Swarlos318 played Grand Theft Auto IV: The Com...,1
495,FastDropYT played Grand Theft Auto V (Steam) i...,1
500,luisbarrelin played Grand Theft Auto V (Xbox S...,1
502,evo13 played Grand Theft Auto V (Xbox One) in ...,1


In [None]:
testdata["multiclass_label"].value_counts()


NameError: name 'testdata' is not defined

In [None]:
testdata_tfidf = Tfidf_vect.transform(testdata["text"])

In [None]:
predictions_NB = NB.predict(testdata_tfidf)
# predictions_NB = pd.Series(predictions_NB)

In [None]:
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, testdata["multiclass_label"])*100)

Naive Bayes Accuracy Score ->  56.97674418604651
