# 0. Imports

In [1]:
import nltk
import pandas as pd
from nltk import regexp_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# 1. Load the corpus

In [2]:
# Path of the corpus file that the next cell loads with pandas
# (a relative path, so it is resolved against the working directory).
filename = 'SAR14.txt'

In [3]:
# The file is parsed without a header row; its two columns are named here.
corpus = pd.read_csv(
    filename,
    header=None,
    names=['Review', 'Score'],
)

# 2. Preprocess the data

## 1. Tokenize

In [4]:
def tokenize(review: str) -> list:
    """Split a review into word tokens.

    Args:
        review: Raw review text.

    Returns:
        The substrings of ``review`` that match the word pattern (runs of
        word characters), in order of appearance.
    """
    # Raw string: in a plain literal '\w' is an invalid escape sequence,
    # which newer Python versions warn about.
    pattern = r'\w+'
    return regexp_tokenize(review, pattern)

In [5]:
# Tokenize every review; `tokens` holds one token list per row of `corpus`.
tokens = corpus['Review'].apply(tokenize)

## 2. Lemmatize

In [6]:
def lemmatize(tokens: list) -> list:
    """Lower-case each token and map it to its WordNet lemma.

    Args:
        tokens: Word tokens of a single review.

    Returns:
        A new list with one lemma per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    result = []
    for word in tokens:
        lowered = word.lower()
        result.append(wordnet.lemmatize(lowered))
    return result

In [7]:
# Lemmatize every review. The lemmatizer raises LookupError when its NLTK
# data is not installed; in that case fetch the data once and retry.
try:
    lemmas = tokens.apply(lemmatize)
except LookupError:
    # WordNetLemmatizer reads the 'wordnet' corpus, and some NLTK releases
    # additionally require 'omw-1.4'. Fetching both makes the retry succeed
    # on a fresh environment; already-installed packages are skipped.
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    lemmas = tokens.apply(lemmatize)

## 3. Remove stopwords

In [8]:
def remove_stopwords(lemmas: list) -> list:
    """Drop English stop words from a list of lemmas.

    Args:
        lemmas: Lemmas of a single review.

    Returns:
        A new list containing, in the original order, every lemma that is
        not in NLTK's English stop-word list.
    """
    # A set gives constant-time membership tests; the original list was
    # scanned linearly for every lemma.
    stop_words = set(stopwords.words('english'))
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [9]:
# Filter stop words out of every review's lemma list (rebinds `lemmas`).
# NOTE(review): filtering runs after lemmatization, so a stop word whose
# lemma differs from its surface form would no longer match the stop-word
# list — confirm this ordering is intended.
lemmas = lemmas.apply(remove_stopwords)

# 3. Machine learning preparations

## 1. Create labels

In [10]:
# Scores from 7 to 10 (both inclusive) are labelled 'Positive';
# every other score is labelled 'Negative'.
corpus['Sentiment'] = (
    corpus['Score']
    .between(7, 10)
    .map({True: 'Positive', False: 'Negative'})
)

## 2. Vectorize reviews

In [11]:
# Re-join each review's lemmas into one space-separated string, then turn
# the resulting documents into a TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on the whole corpus, before the
# train/test split further down, so its IDF weights also see the test
# reviews — consider fitting on the training portion only.
data = lemmas.map(" ".join)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data)

## 3. Split the data into train and test sets

In [12]:
# Features are the TF-IDF rows; targets are the sentiment labels.
x, y = tfidf_matrix, corpus['Sentiment']
# 75% of the rows go to training, the rest to testing. The fixed seed makes
# the shuffle, and therefore the reported metrics' test set, reproducible
# across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.75, random_state=42
)

# 4. Predict with SGDClassifier

In [13]:
def train_model(x, y, random_state=42) -> SGDClassifier:
    """Fit an SGDClassifier with default hyper-parameters.

    Args:
        x: Training feature matrix.
        y: Training labels, one per row of ``x``.
        random_state: Seed forwarded to ``SGDClassifier`` so that repeated
            runs train the same model. Pass ``None`` for the estimator's
            unseeded behaviour.

    Returns:
        The fitted classifier.
    """
    model = SGDClassifier(random_state=random_state)
    model.fit(x, y)
    return model

In [14]:
# Fit the classifier on the training split, then predict a label for every
# row of the held-out test split.
classifier = train_model(x_train, y_train)
y_predicted = classifier.predict(x_test)

In [16]:
# Compare the predictions with the true test labels and print the report
# (per-class precision, recall, F1 and support, as shown in the output below).
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

    Negative       0.92      0.72      0.81     16562
    Positive       0.90      0.98      0.94     41838

    accuracy                           0.90     58400
   macro avg       0.91      0.85      0.87     58400
weighted avg       0.91      0.90      0.90     58400

