In [476]:
import pandas as pd
import csv
import numpy as ny
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [477]:
df= pd.read_csv("UMICH_SI650_Sentiment_Classification.txt", sep='\t', names=['liked', 'txt'])

In [478]:
df.head()

Unnamed: 0,liked,txt
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [479]:
#TFIDF Vectorizer, just like before
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [480]:
#in this case our dependent variable will be liked as 0 (didn't like the movie) or 1 (liked the movie)
y = df.liked

In [481]:
#convert df.txt from text to features
X= vectorizer.fit_transform(df.txt)

In [482]:
#6918 observations x 2022 unique words.
print (y.shape)
print (X.shape)

(6918,)
(6918, 2011)


In [483]:
#Test Train Split as usual
X_train, X_test,y_train, y_test = train_test_split(X, y, random_state=42)

In [484]:
#we will train a naive_bayes classifier
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [485]:
#We can test our model's accuracy like this:

roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.9979292333245913

That's a pretty good model.   Lets try it...

In [486]:
#csv = ny.genfromtxt("cleaned_posts_new.csv", delimiter=",")
df = pd.read_excel('cleaned_posts_new.xlsx', sheet_name='Sheet1')

print(df)
movie_reviews_array=ny.array(["you are amazing!", "bad nice to meet you!"])
movie_review_vector = vectorizer.transform(movie_reviews_array)

df_vector = vectorizer.transform(df)

#print (clf.predict(movie_review_vector))

print (clf.predict(df_vector))

                                                   text
0     Fuck this. My house got broken for the third a...
1     The fundamentals department just sent out info...
2     Crazy how Bill Watterson nailed American lefti...
3     Hi. Today’s my 21st birthday and I’m feeling r...
4     SinemaSins: Everything wrong with Moderate Dem...
...                                                 ...
1950                                       Norton sucks
1951  Still haven’t secured an internship for next s...
1952  I have a huge crush on my art prof and it’s be...
1953     Anyone else get an email about phi beta kappa?
1954  Type: Robbery\nDate: 11/15/2021 10:45 AM\nDesc...

[1955 rows x 1 columns]
[1]
