# Movie rating prediction

dataset link: https://ai.stanford.edu/~amaas/data/sentiment/

Our goal is to train a model that predicts whether a movie review is positive or negative.
The dataset is already split 50/50: 12.5k positive reviews and 12.5k negative reviews.

In [1]:
import os
from pathlib import Path

# Show where relative paths used later in the notebook will be resolved from.
print('Current working directory:', os.getcwd())


Current working directory: /home/alex/bin/Notebooks


In [12]:
import re
from random import shuffle

import pandas as pd

# Dataset folders, resolved relative to the current working directory.
currdir = Path.cwd()
train_folder, test_folder = (
    currdir / 'aclImdb' / split for split in ('train', 'test')
)

def get_data_frame(folder):
    '''
    Get text files from folder,
    clean, shuffle and combine it
    and return a pd.DataFrame object.

    folder - pathlib.Path() object that contains a 'pos' and a 'neg'
    subfolder; every .txt file in 'pos' is labelled 1 and every
    .txt file in 'neg' is labelled 0.

    Returns a pd.DataFrame with two columns: 'text' (the cleaned
    review) and 'label' (1 or 0). The rows are shuffled with
    random.shuffle. If no .txt files are found the frame is empty
    but still has both columns.
    '''

    def import_files(data_folder):
        '''
        Return all .txt files in 'data_folder'.
        Data_folder - pathlib.Path() object.
        '''
        # Filter on the suffix so sub-directories and stray files are not
        # read as reviews. Sorting removes the filesystem-dependent order of
        # iterdir(), so seeding `random` before the call gives a
        # reproducible shuffle.
        return sorted(
            e for e in data_folder.iterdir()
            if e.is_file() and e.suffix == '.txt'
        )

    def get_text_from_file(_file):
        '''
        Return text from file
        '''
        # Read as UTF-8 explicitly: the platform default encoding differs
        # between machines.
        return _file.read_text(encoding='utf-8')

    ### cleaning helpers

    # Whole <br>, <br/> or <br /> tags. They are replaced by a space so the
    # words on either side of a line break are not glued together.
    br_tag = re.compile(r'<br\s*/?>', re.IGNORECASE)

    # Everything that is neither an ASCII letter nor whitespace.
    # The range must be A-Z: 'A-z' would also keep [ \ ] ^ _ and `.
    non_alpha = re.compile(r'[^a-zA-Z\s]')

    def clean(text):
        '''
        Return text without <br> tags and non-alphabetic characters
        '''
        return non_alpha.sub('', br_tag.sub(' ', text))

    def load_labelled(data_folder, label):
        '''
        Return a [('<some text>', label), ...] list for 'data_folder'.
        '''
        return [
            (clean(get_text_from_file(f)), label)
            for f in import_files(data_folder)
        ]

    ### getting and cleaning data: positives are labelled 1, negatives 0

    examples = load_labelled(folder / 'pos', 1) + load_labelled(folder / 'neg', 0)

    # shuffling data so the two classes are interleaved

    shuffle(examples)

    ### making a pd.DataFrame object

    # Building the frame from (text, label) records also works when
    # `examples` is empty, where unpacking zip(*examples) would raise.
    data_frame = pd.DataFrame(examples, columns=['text', 'label'])

    return data_frame

In [13]:
# Build the cleaned, shuffled frames for both dataset splits (train first).
train_set, test_set = map(get_data_frame, (train_folder, test_folder))

In [16]:
# saving sets in csv files

# Create the output folder from Python instead of shelling out to `mkdir`:
# this does not depend on the shell, and exist_ok=True keeps the cell
# re-runnable when the folder is already there.
os.makedirs('csv', exist_ok=True)
train_set.to_csv('csv/train_set.csv', index=False)
test_set.to_csv('csv/test_set.csv', index=False)

In [None]:
#train_set = pd.read_csv('csv/train_set.csv')
#test_set = pd.read_csv('csv/test_set.csv')

In [51]:
# vectorizing the sets

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vectorizer.
# fit_transform() replaces fit() followed by transform() on the same data,
# so the training corpus is processed only once.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_set['text'].values)
X_test = vectorizer.transform(test_set['text'].values)

# making np.arrays from labels

y_train = np.array(train_set['label'].values)
y_test = np.array(test_set['label'].values)

In [41]:
# training model, getting scores

from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import metrics

def train_and_score(
    classifier,
    create_instance=True,
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test):
    '''
    Train classifier and print out its train/test score

    classifier - a classifier class (create_instance=True) or an
    already constructed classifier object (create_instance=False).
    create_instance - if True, 'classifier' is called with no
    arguments to build the object that gets trained.
    X_train, y_train - data the classifier is fitted on.
    X_test, y_test - data the metrics are computed on.

    Prints the train/test score() values and the F1, accuracy,
    precision and recall on the test data. Returns None.

    NOTE: the data defaults are evaluated once, when this 'def' is
    executed. Re-run this cell after re-creating X_train, y_train,
    X_test or y_test, or pass the new objects explicitly.
    '''
    if create_instance:
        clf = classifier()
    else:
        clf = classifier
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    
    y_pred = clf.predict(X_test)
    
    accur = round(metrics.accuracy_score(y_test, y_pred), 3)
    prec = round(metrics.precision_score(y_test, y_pred), 3)
    recall = round(metrics.recall_score(y_test, y_pred), 3)
    f1 = round(metrics.f1_score(y_test, y_pred), 3)

    # Take the name from the trained object's type: 'classifier' itself is
    # an instance when create_instance=False, and instances have no __name__.
    clf_name = type(clf).__name__

    print(' ***')
    print(f'{clf_name}:\nTrain score: {round(train_score, 3)} ; Test score: {round(test_score, 3)}\n')
    print(f'F1_score: {f1}\nAccuracy: {accur}\nPrecision: {prec}\nRecall: {recall}\n')
    
# Each entry holds the keyword arguments for one train_and_score() call.
# The trailing comma lets further entries be added (or the commented one
# re-enabled) without a syntax error.
clf_list = [
    dict(classifier=SGDClassifier),
# too slow   dict(classifier=svm.SVC(kernel='linear'), create_instance=False)
    ]

# The loop variable is deliberately not called `clf`: it is a dict of
# keyword arguments, not a classifier.
for clf_kwargs in clf_list:
    train_and_score(**clf_kwargs)

 ***
SGDClassifier:
Train score: 0.94 ; Test score: 0.884

F1_score: 0.883
Accuracy: 0.884
Precision: 0.885
Recall: 0.882



In [64]:
# Three hand-written sample reviews to try the model on.
text1 = '''
That was a gorgeous experience and the best way to spend that Friday evening of mine.
Actors played brilliant, screenplay seemed a nut in the start but then goes smoothly
'''
text2 = '''
A couldn't say it is a boring staff. In fact, it is just silly and obcure, nothing personal.
This and that was rather good, but who can state that all that staff worth a damn?
'''

text3 = '''
I love all his films, but that stuff was a kind of puff. You could expect a sophisticated
story, nevertheless your emotions would tell you to disdain complex expectations and to drain
your dreams in mustard and mayonaisse. All is good, but the bad stuff can't get itself out.
Dear fans, you would be in a weird mess of profanity.
'''

# Turn each review into a one-row feature matrix with the vectorizer
# that was fitted on the training set.
good, bad, complex_one = (
    vectorizer.transform([review]) for review in (text1, text2, text3)
)

# Train a fresh classifier on the full training set.
clf = SGDClassifier()
clf.fit(X_train, y_train)

# One prediction per line, in the order: good, bad, complex_one.
print(
    clf.predict(good),
    clf.predict(bad),
    clf.predict(complex_one),
    sep='\n',
)

[1]
[0]
[1]
