In [33]:
import pandas as pd
import re
re.compile('<title>(.*)</title>')
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cathe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
train = pd.read_csv('Kaggle-Train.csv')

In [35]:
#Dropping any null values in text
train.dropna(subset=['text'],inplace=True)
train = train[train.text != '']


In [36]:
#Remove punctuation
train.text = train.text.apply(lambda x: re.sub(r'[^\w\s]', '', x) )

In [37]:
#Remove stopword
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
train.text = train.text.apply(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords]))


In [38]:
#Remove link
train.text = train.text.apply(lambda x: re.sub(r'\(?http\S+', '', x))

In [39]:
#Normalizing words
#Stemming is normalizing the word by chopping the end of the word which is not always useful, hence we use lemmatization instead

lemmatizer = nltk.WordNetLemmatizer()
train.text = train.text.apply(lambda x: ' '.join( [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]) )

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

In [41]:
#Countvectorise() transform the text to training matrix which we use for machine learning
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train.text) #we use fit_transform to transfrom text to document-term matrix(DTM)

In [42]:
train.to_csv('cleanedTrain.csv',index=False) #export trainning dataset to a csv file

In [44]:
train.head(10)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,Id responded going,neutral
1,549e992a42,Sooo SAD miss San Diego,negative
2,088c60f138,bos bullying,negative
3,9642c003ef,interview leave alone,negative
4,358bd9e861,Sons couldnt put release already bought,negative
5,28b57f3990,shameless plugging best Rangers forum earth,neutral
6,6e0c6d75b1,2am feeding baby fun smile coo,positive
7,50e14c0bb8,Soooo high,neutral
8,e050245fbd,,neutral
9,fc2cbefa9d,Journey Wow u became cooler hehe possible,positive


Model Building and Evaluation

In [45]:
nb=MultinomialNB()
nb.fit(X_train,train.sentiment) #training the model with the training dataset

MultinomialNB()

In [46]:
#Reading in and cleaning test dataset
test = pd.read_csv('test.csv')
test.head(10)

Unnamed: 0,textID,text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!
5,726e501993,that`s great!! weee!! visitors!
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol
7,afa11da83f,"soooooo wish i could, but im in school and my..."
8,e64208b4ef,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...


In [47]:
test.dropna(subset=['text'],inplace=True)
test = test[test.text != '']

In [48]:
test.text = test.text.apply(lambda x: re.sub(r'[^\w\s]', '', x) ) #get rid of symbol 

In [49]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
test.text = test.text.apply(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords]))

In [50]:
test.text = test.text.apply(lambda x: re.sub(r'\(?http\S+', '', x)) #get rid of http/website link

In [51]:
#Lemmatization(Normalisation)is betterthan Stemming  
lemmatizer = nltk.WordNetLemmatizer()
test.text = test.text.apply(lambda x: ' '.join( [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]) )


In [52]:
X_test = vectorizer.transform(test.text) 
#we use 'transform' to transform text to DTM, we don't use fit_transform as it will lead to overfitting in the model 

In [53]:
#Perform prediction using the trained model
predicted = list(nb.predict(X_test)) 

In [54]:
test['Sentiment'] = predicted

In [55]:
test.head(10)

Unnamed: 0,textID,text,Sentiment
0,f87dea47db,Last session day,positive
1,96d74cb729,Shanghai also really exciting precisely skyscr...,positive
2,eee518ae67,Recession hit Veronique Branquinho quit compan...,negative
3,01082688c6,happy bday,positive
4,33987a8ee5,like,neutral
5,726e501993,thats great weee visitor,positive
6,261932614e,THINK EVERYONE HATES lol,negative
7,afa11da83f,soooooo wish could im school myspace completel...,negative
8,e64208b4ef,within short time last clue,neutral
9,37bcad24ca,get day alright havent done anything yet leavi...,neutral


In [56]:
test=test.drop('text',axis=1) #drop column text

In [57]:
test.to_csv('test-Kaggle.csv',index=False)

Apply the trained model on the business dataset and perform sentiment predictions

In [82]:
#Reading the business dataset and called it Uber
Uber = pd.read_csv('comments1.csv') 

In [83]:
Uber.Reply = Uber.Reply.apply(lambda x: re.sub(r'[^\w\s]', '', x)) #getting rid of symbol 

In [84]:
#One of the major forms of pre-processing is to filter out useless data
#getting rid of English stopwords
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
Uber.Reply = Uber.Reply.apply(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords]))

In [85]:
Reply = vectorizer.transform(Uber.Reply) #transforming reply to document-term matrix
prediction = nb.predict(Reply) #predict sentiment using the trained model
Uber['Sentiment']=prediction #create a new column called Sentiment and store prediction in the new column
Uber.to_csv('Uber.csv',index=False)

In [86]:
Uber.head(100)

Unnamed: 0,Reply,Upvote,Time,Key,neg,neu,pos,compound,Sentiment
0,Write letter Uber 1455 Market St 400 San Franc...,4.0,2020-09-17 03:23:49,0.0,0.000,1.000,0.000,0.0000,neutral
1,Forgot password Fix,3.0,2020-09-17 04:26:58,0.0,0.000,1.000,0.000,0.0000,negative
2,contact support actually dealing Uber employee...,2.0,2020-09-17 02:11:08,0.0,0.057,0.910,0.033,-0.4092,negative
3,Fuck Uber,1.0,2020-09-17 11:35:02,0.0,0.778,0.222,0.000,-0.5423,positive
4,could take account delete,1.0,2020-09-17 12:04:59,0.0,0.000,1.000,0.000,0.0000,neutral
...,...,...,...,...,...,...,...,...,...
95,still lost lmao,1.0,2020-09-15 14:17:07,19.0,0.280,0.244,0.476,0.3818,negative
96,Everyhing either emits electromagnetic energy ...,2.0,2020-09-15 14:22:18,19.0,0.000,0.908,0.092,0.4854,positive
97,things detected far far sway magnetically,1.0,2020-09-15 14:54:41,19.0,0.000,1.000,0.000,0.0000,neutral
98,look remote sensing,2.0,2020-09-15 14:58:10,19.0,0.000,1.000,0.000,0.0000,positive


In [98]:
#The number of matching values and mismatch values between Naive Bayes and Textblob
match = 0
mismatch = 0
#neutral = 0

for i in range(len(Uber)):
    if (Uber.compound.iloc[i] < 0  and  Uber.Sentiment.iloc[i] == "negative"):
        match += 1
    elif (Uber.compound.iloc[i] > 0 and Uber.Sentiment.iloc[i] == "positive"):
        match += 1
    elif (Uber.compound.iloc[i] == 0 and Uber.Sentiment.iloc =="neutral"):
        match +=1
    else:
        mismatch += 1

print(match, mismatch)


492 1485


Why is there a difference between Naive Bayes model and Vader?

It shows that here are 492 pairs of matching predictions and 1485 pairs of mismatch prediction between Naive Bayes model and Vader.
The difference exists because Vader has been trained on numerous amount of data, whereas the NB model we built has only been trained on 20000 plus rows of data. Additionally, the Kaggle data we used to train the model has a different context to the Reddit data we collected which explains why there is a difference in the sentiment analysis between Naive Bayes and Vader. 