In [80]:
import os
import re
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [13]:
# Path components of the corpus: <DATA>/<ACL_IMDB>/<split>/<sentiment>/<file>.
DATA, ACL_IMDB = "data", "aclImdb"
# Names of the two dataset split sub-directories.
TRAIN, TEST = "train", "test"
# Names of the sentiment sub-directories; also stored as the class label.
POS, NEG = "pos", "neg"

In [42]:
from nltk.corpus import stopwords

In [63]:
dataset_parts = (TRAIN, TEST)
text_sentiments = (POS, NEG)

# Review files are named "<id>_<rating>.txt". The rating needs at least one
# digit (an empty group would make int() fail) and the dot is escaped so that
# only a literal ".txt" extension is accepted.
filename_parser = re.compile(r"(?P<id>\d+)_(?P<rating>\d{1,2})\.txt")

# One record per review: {"text": ..., "sentiment": ..., "rating": ...}.
train_data_list = []
test_data_list = []

for part in dataset_parts:
    # The destination depends only on the split, so pick it once per split.
    needed_list = train_data_list if part == TRAIN else test_data_list
    for sentiment in text_sentiments:
        sentiment_dir = os.path.join(DATA, ACL_IMDB, part, sentiment)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order (and anything derived from it) stable across machines.
        for filename in sorted(os.listdir(sentiment_dir)):
            parsed_name = filename_parser.fullmatch(filename)
            if parsed_name is None:
                # Not a review file (e.g. a stray hidden file); skip it
                # instead of crashing on a failed match.
                continue
            with open(os.path.join(sentiment_dir, filename), "r", encoding="utf-8") as file:
                text = file.read().strip()
            needed_list.append({"text": text,
                                "sentiment": sentiment,
                                "rating": int(parsed_name.group("rating"))})

In [64]:
# Turn the collected record lists into one DataFrame per dataset split.
data_train, data_test = map(pd.DataFrame, (train_data_list, test_data_list))

In [55]:
# English stop-word list from the NLTK stopwords corpus; handed to the
# TF-IDF vectorizer as its `stop_words`.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally — confirm the environment setup.
stopwords_en = stopwords.words("english")

In [38]:
from sklearn.pipeline import Pipeline

In [77]:
# Unfitted TF-IDF template shared by every experiment. With integer thresholds,
# terms found in fewer than 10 or in more than 1000 documents are ignored.
vectorizer = TfidfVectorizer(use_idf=True, min_df=10, max_df=1000, stop_words=stopwords_en)


def evaluate_model(model_class, param_grid):
    """Grid-search a TF-IDF + classifier pipeline and print a test-set report.

    The vectorizer and the classifier are searched together as one pipeline,
    so during cross-validation the TF-IDF vocabulary and IDF weights are
    learned from the training folds only (no leakage from validation folds).
    GridSearchCV clones the pipeline for every fit, so the module-level
    ``vectorizer`` is never fitted or mutated by this function.

    Parameters
    ----------
    model_class : type
        Estimator class instantiated with no arguments, e.g.
        ``LogisticRegression``.
    param_grid : dict or list of dict
        Hyper-parameter grid expressed in the classifier's own parameter
        names; an empty dict evaluates the default configuration only.

    Returns
    -------
    None
        The classification report for ``data_test`` is printed.
    """
    def _for_model_step(grid):
        # Pipeline parameters are addressed as "<step name>__<parameter>".
        return {"model__" + name: values for name, values in grid.items()}

    pipeline = Pipeline([("vectorizer", vectorizer),
                         ("model", model_class())])
    if isinstance(param_grid, dict):
        search_grid = _for_model_step(param_grid)
    else:
        search_grid = [_for_model_step(grid) for grid in param_grid]

    search = GridSearchCV(pipeline, param_grid=search_grid)
    search.fit(data_train.text, data_train.sentiment)
    # predict() uses the best pipeline refitted on the full training set.
    predictions = search.predict(data_test.text)
    print(classification_report(data_test.sentiment, predictions))

# Logistic Regression

In [76]:
# Single-candidate grid: L-BFGS solver with a tight stopping tolerance.
evaluate_model(
    model_class=LogisticRegression,
    param_grid={"tol": [5e-6], "solver": ["lbfgs"]},
)



              precision    recall  f1-score   support

         neg       0.83      0.85      0.84     12500
         pos       0.84      0.82      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



# SGD Classifier

In [78]:
# Empty grid: only the classifier's default hyper-parameters are evaluated.
evaluate_model(model_class=SGDClassifier, param_grid={})



              precision    recall  f1-score   support

         neg       0.83      0.84      0.83     12500
         pos       0.84      0.83      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



# Naive Bayes

In [81]:
# Empty grid: only the classifier's default hyper-parameters are evaluated.
evaluate_model(model_class=MultinomialNB, param_grid={})



              precision    recall  f1-score   support

         neg       0.78      0.85      0.82     12500
         pos       0.84      0.76      0.80     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

