In [1]:
# Copy the dataset from blob storage
# !curl https://jeffpro.blob.core.windows.net/public/reviews.csv -o reviews.txt

In [2]:
# Load the data
import pandas as pd

df = pd.read_csv('Data/reviews.csv', encoding="ISO-8859-1")
df.head(10)

Unnamed: 0,Text,Sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0
5,"A funny thing happened to me while watching ""M...",0
6,This German horror film has to be one of the w...,0
7,"Being a long-time fan of Japanese film, I expe...",0
8,"""Tokyo Eyes"" tells of a 17 year old Japanese g...",0
9,Wealthy horse ranchers in Buenos Aires have a ...,0


In [3]:
# Clean the data
df['Text'] = df['Text'].str.lower()
df['Text'] = df['Text'].str.replace("[.;:!\'?,\"()\[\]]", '')
df['Text'] = df['Text'].str.replace("(<br\s*/><br\s*/>)|(\-)|(\/)", ' ')
df.head(10)

Unnamed: 0,Text,Sentiment
0,once again mr costner has dragged out a movie ...,0
1,this is an example of why the majority of acti...,0
2,first of all i hate those moronic rappers who ...,0
3,not even the beatles could write songs everyon...,0
4,brass pictures movies is not a fitting word fo...,0
5,a funny thing happened to me while watching mo...,0
6,this german horror film has to be one of the w...,0
7,being a long time fan of japanese film i expec...,0
8,tokyo eyes tells of a 17 year old japanese gir...,0
9,wealthy horse ranchers in buenos aires have a ...,0


In [4]:
# Vectorize the text
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=['the', 'and', 'am', 'are'], min_df=10)
vectors = vectorizer.fit_transform(df['Text'])

In [5]:
# Split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(vectors, df['Sentiment'], test_size=0.2, random_state=0)

In [6]:
# Train a classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
# Score the model
model.score(x_test, y_test)

0.9093

In [8]:
# Assess accuracy with a confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, model.predict(x_test))

array([[4534,  468],
       [ 439, 4559]])

In [9]:
# Score a review
review = ['The long lines and poor customer service really turned me off']
model.predict_proba(vectorizer.transform(review))[0][1]

0.15456181390302676

In [10]:
# Score another review
review = ['The best hike in the United States. Fun!']
model.predict_proba(vectorizer.transform(review))[0][1]

0.813449616387299

In [11]:
# Save the model and the vocabulary
import pickle

pickle.dump(model, open('sentiment_analysis.pkl', 'wb'))
pickle.dump(vectorizer.vocabulary_, open('vocabulary.pkl', 'wb'))