In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import pickle
import pandas as pd
import numpy as np

def save_data(filename, data):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, 'wb') as out_file:
        pickle.dump(data, out_file)
        
def load_data(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(filename, 'rb') as in_file:
        loaded = pickle.load(in_file)
    return loaded

In [2]:
# Read the reviews CSV into a DataFrame and preview its first rows.
imdb_csv = '../Data/IMDB dataset.csv'
data = pd.read_csv(imdb_csv)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def clean_tags(string, tags=('<br />',), replacement=''):
    """Remove literal markup tags from a piece of text.

    Parameters
    ----------
    string : str
        Text to clean.
    tags : iterable of str, optional
        Exact substrings to strip. Defaults to the single tag ``'<br />'``.
    replacement : str, optional
        Text substituted for each tag occurrence. Defaults to ``''``
        (plain removal). Pass ``' '`` to keep the words on either side of a
        tag from being joined together.

    Returns
    -------
    str
        ``string`` with every occurrence of every tag replaced.
    """
    for tag in tags:
        # str.replace is a no-op when the tag is absent, so no membership
        # test is needed beforehand.
        string = string.replace(tag, replacement)
    return string

# Strip the '<br />' markers from every review. clean_tags is passed to
# apply directly; wrapping it in a lambda adds nothing.
data['review'] = data['review'].apply(clean_tags)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the test reviews.
vectorizer = TfidfVectorizer()
vec_train_x = vectorizer.fit_transform(train_x)
vec_test_x = vectorizer.transform(test_x)

In [6]:
# Train a logistic-regression classifier with default settings on the
# TF-IDF training matrix (fit returns the estimator itself).
logreg = LogisticRegression().fit(vec_train_x, train_y)

# Persist the trained model and the raw training reviews.
# NOTE(review): 'fit_vector.pickle' holds the unvectorized training text
# (train_x), not the fitted vectorizer; the cell that loads it refits a new
# TfidfVectorizer from that text.
save_data(filename='logreg_model.pickle', data=logreg)
save_data(filename='fit_vector.pickle', data=train_x)

In [7]:
# Sanity check: predicted label for the first test review, then the
# classifier's score on the full test split.
first_prediction = logreg.predict(vec_test_x[0])
print(first_prediction)

test_accuracy = logreg.score(vec_test_x, test_y)
print(test_accuracy)

['negative']
0.9008


In [8]:
def load_data(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    This redefinition shadows the identical ``load_data`` from the first
    cell. Unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(filename, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored

# Reload the persisted classifier and the raw training reviews.
model = load_data('logreg_model.pickle')
fit_vec = load_data('fit_vector.pickle')

# Rebuild the TF-IDF vectorizer from the training reviews. Only the fitted
# state is needed here, so call fit() rather than fit_transform(), whose
# transformed training matrix was being discarded.
vectorizer = TfidfVectorizer()
vectorizer.fit(fit_vec)

# Classify a couple of hand-written example reviews.
text = [
    "Somehow, I think the continuation of the previous story could have been a better idea.",
    "This movie doesnt look that interesting. I'll pass",
]
vec_test = vectorizer.transform(text)
print(model.predict(vec_test))

['negative' 'negative']
