In [2]:
import zipfile
import pandas as pd
# import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Read Training Data
Data source: https://www.kaggle.com/datasets/kazanova/sentiment140

- target:   The polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

- ids:      The id of the tweet

- date:     The date of the tweet (Sat May 16 23:58:44 UTC 2009)

- flag:     The query (lyx). If there is no query, then this value is NO_QUERY.

- user:     The user that tweeted

- text:     The text of the tweet

In [None]:
archive = zipfile.ZipFile("training_data.zip", "r")
data = pd.read_csv(archive.open("training.1600000.processed.noemoticon.csv"), header=None, encoding_errors="replace")

There are no neutral labelled tweets.
So we change the labels to binary form.
(0 = negative
1 = positive)

In [None]:
data.loc[data[0]==4, 0] = 1
data.head()

For model training we only need the label and text columns

In [None]:
data = data.drop([1,2,3,4], axis=1)
data.columns = ["label","raw_text"]
data.head()

# Data Preprocessing

Turning everything into lowercase characters

In [None]:
data["text"] = [entry.lower() for entry in data["raw_text"]]
# data.head()
# data["text"] = data["raw_text"]

Removing Stopwords and Links and User-Tags

In [None]:
stop = stopwords.words('english')
# we think 'no' and 'not' might be important words for the sentiment and don't want them to be removed
stop.remove("no")
stop.remove("not")

# data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [None]:
data["text"] = data["text"].apply(lambda x: re.sub("http[s]?://\S+", "", x))
data["text"] = data["text"].apply(lambda x: re.sub("@\S+", "", x))
data["text"] = data["text"].apply(lambda x: re.sub("-|\.|,|'|\?|\!", "", x))

data.head()

Word-Stemming

In [None]:
# from nltk.stem import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer

# sstemmer = SnowballStemmer("english")
# pstemmer = PorterStemmer()

# data['text'] = data['text'].apply(lambda x: ' '.join([sstemmer.stem(word) for word in x.split()]))

Tokenize Words

In [3]:
# This process takes a few minutes. So you only need to do it once, save it to a csv-file and read that file
### comment these lines after running once:
# data["text"] = [word_tokenize(entry) for entry in data["text"]]
# data.to_csv('tokenized_data.csv', index=False)
###

data = pd.read_csv('tokenized_data.csv')

Our finished data:

In [3]:
data.head()

Unnamed: 0,label,raw_text,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","['awww', 'thats', 'a', 'bummer', 'you', 'shoul..."
1,0,is upset that he can't update his Facebook by ...,"['is', 'upset', 'that', 'he', 'cant', 'update'..."
2,0,@Kenichan I dived many times for the ball. Man...,"['i', 'dived', 'many', 'times', 'for', 'the', ..."
3,0,my whole body feels itchy and like its on fire,"['my', 'whole', 'body', 'feels', 'itchy', 'and..."
4,0,"@nationwideclass no, it's not behaving at all....","['no', 'its', 'not', 'behaving', 'at', 'all', ..."


# Model Training

In [4]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data["text"], data["label"],test_size=0.20, random_state=11)

In [5]:
stop = stopwords.words('english')
# we think 'no' and 'not' might be important words for the sentiment and don't want them to be removed
stop.remove("no")
stop.remove("not")

In [8]:
Tfidf_vect = TfidfVectorizer(analyzer="word", strip_accents="unicode", stop_words=stop, min_df=10)
Data_Tfidf = Tfidf_vect.fit_transform(data["text"])

In [18]:
# Data_Tfidf = Tfidf_vect.transform(data["text"])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [9]:
print("# of Features:", len(Tfidf_vect.get_feature_names_out()))

# of Features: 35016


In [20]:
model_selection.cross_validate(naive_bayes.MultinomialNB(), Data_Tfidf, data["label"], cv=10)

{'fit_time': array([0.8331542 , 0.97276592, 0.87712479, 0.78345704, 0.78779507,
        0.74288034, 0.81502581, 1.02199936, 0.75476074, 0.74301982]),
 'score_time': array([0.0649817 , 0.07127166, 0.05353975, 0.05253625, 0.04892564,
        0.04993916, 0.05719161, 0.08049512, 0.0522306 , 0.04152632]),
 'test_score': array([0.76240625, 0.7641125 , 0.7528875 , 0.76394375, 0.76575625,
        0.756375  , 0.76336875, 0.774675  , 0.7667    , 0.75955625])}

In [21]:
# Naive Bayes Classifier
NB = naive_bayes.MultinomialNB()
NB.fit(Data_Tfidf, data["label"])

# predictions_NB = NB.predict(Test_X_Tfidf)
# print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

MultinomialNB()

In [None]:
# Support Vector Machine Classifier
# SVM = svm.SVC(C=0.9, kernel='rbf', degree=3, gamma='auto', cache_size= 1000, max_iter=2500, decision_function_shape="ovo", random_state=10)
# SVM.fit(Train_X_Tfidf,Train_Y)

# predictions_SVM = SVM.predict(Test_X_Tfidf)
# print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y)*100)

In [106]:
testdata = pd.read_excel("api_responses\data_20220615-111428.json.xlsx")
testdata = testdata["text"]

In [108]:
review = testdata
review

0     I stayed up all night playing stardew valley. ...
1     Stardew Valley 1.5 Random Bundles  E157 Enteri...
2                   bacc at my stardew valley grind era
3     It's 2am... I need to go to sleep... I've been...
4         I started Stardew Valley for the first time 😅
                            ...                        
95    @hauntmallows POV: You're Clint from Stardew V...
96    @Draconic_mer Awesome!!! Have you seen dangero...
97    i wanna play stardew valley on stream again ju...
98    @thepoggingman I still need to play stardew va...
99    There's a stardew valley game trial on switch ...
Name: text, Length: 100, dtype: object

In [109]:
testdata = Tfidf_vect.transform(testdata)
print(testdata)

  (0, 32559)	0.3699750847461673
  (0, 28847)	0.3182368695627071
  (0, 23193)	0.248664455035521
  (0, 20884)	0.1825565584874412
  (0, 14857)	0.48829586301554434
  (0, 11115)	0.4654761767209046
  (0, 10593)	0.2959136441702954
  (0, 6535)	0.3521604996789864
  (1, 34555)	0.221162166512661
  (1, 32559)	0.34805875169901856
  (1, 24576)	0.2835148217445709
  (1, 23160)	0.38497687254888713
  (1, 15022)	0.36964242423237065
  (1, 14857)	0.4593705239843071
  (1, 10278)	0.379392179656457
  (1, 6535)	0.3312994549350945
  (2, 32559)	0.46074122896168623
  (2, 13322)	0.46213820581891196
  (2, 10353)	0.4958079199554629
  (2, 2963)	0.5729924128352741
  (3, 32559)	0.4143467096065845
  (3, 29167)	0.29515060694291984
  (3, 27711)	0.2226255197048213
  (3, 20638)	0.20995866457336002
  (3, 14857)	0.5468578628580049
  :	:
  (96, 26695)	0.23958176025290603
  (96, 14857)	0.4562891731384883
  (96, 12257)	0.2319486710033482
  (96, 8041)	0.4266022567024747
  (96, 6535)	0.32907717509260626
  (96, 2810)	0.199902869457

In [110]:
predictions_NB = NB.predict(testdata)
predictions_NB = pd.Series(predictions_NB)

In [111]:
print(predictions_NB)

0     0
1     1
2     0
3     0
4     1
     ..
95    1
96    1
97    1
98    1
99    1
Length: 100, dtype: int64


In [114]:
frame = pd.concat([predictions_NB, review], axis=1)
frame

Unnamed: 0,0,text
0,0,I stayed up all night playing stardew valley. ...
1,1,Stardew Valley 1.5 Random Bundles E157 Enteri...
2,0,bacc at my stardew valley grind era
3,0,It's 2am... I need to go to sleep... I've been...
4,1,I started Stardew Valley for the first time 😅
...,...,...
95,1,@hauntmallows POV: You're Clint from Stardew V...
96,1,@Draconic_mer Awesome!!! Have you seen dangero...
97,1,i wanna play stardew valley on stream again ju...
98,1,@thepoggingman I still need to play stardew va...


In [115]:
frame.to_excel("test.xlsx")