In [None]:
import time
import random

# For Data science
import pandas as pd
import spacy
from spacy.util import minibatch
from spacy.training.example import Example

from sklearn import (
    metrics,
    linear_model,
)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# For graph
import matplotlib.pyplot as plt
import seaborn as sns

# For sources
from sources import three_sigma_cleared

# Getting data, observations
Get dataset

In [None]:
# Load the IMDB reviews dataset from the project's data directory.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
df = pd.read_csv("../data/IMDB_Dataset.csv")

# Show the first rows as a quick sanity check of the loaded frame
df.head()

Check whether the categories are balanced

In [None]:
# Column holding the class label. It is referenced by name rather than by
# position so this cell keeps working when re-run after later cells have
# appended columns (e.g. 'word_count') to `df`.
target_column = 'sentiment'

# Number of reviews per sentiment class
grouped = df.groupby(target_column).size()
grouped.plot(
    kind='bar',
    xlabel='Sentiment',
    ylabel='Count',
    title='Count of Reviews',
);

Dataframe is balanced.

Check the distribution of review lengths (in words)

In [None]:
# Add a column with the number of whitespace-separated words in each review
df['word_count'] = df['review'].apply(lambda review_text: len(review_text.split()))

# Histogram (with KDE overlay) of the per-review word counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="word_count", kde=True, ax=ax)
ax.set_title("Distribution of Word Counts")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
plt.show()

Review lengths span a wide range, but most reviews are shorter than 500 words.

Remove outliers.

In [None]:
# Pass the frame and the 'word_count' column to the project helper
# `three_sigma_cleared` (imported from `sources`).
# Presumably it drops rows whose word count lies outside three standard
# deviations of the mean — TODO confirm against the helper's implementation.
# NOTE(review): the result `df2` is not referenced by any later cell in this
# notebook; the following plot and preprocessing steps still use `df`.
df2 = three_sigma_cleared(
    dataset=df,
    feature_names=['word_count'],
)

Plot word count by sentiment.

In [None]:
# Compare the word-count distribution of each sentiment class side by side
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='sentiment', y='word_count', ax=ax)
ax.set_title('Box Plot of Sentiment vs Word Count')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count');

The average length of positive and negative reviews is about the same, so review length is not a useful classification feature.

# Lazy use of spaCy
Use only for preprocessing.

In [None]:
# Load the pre-trained English model.
# The preprocessing below only reads token lemmas and the lexical flags
# `is_stop` / `is_punct`, so the dependency parser and the named-entity
# recognizer are disabled to avoid running them on every review.
# The tagger, attribute_ruler and lemmatizer stay enabled because the
# lemmatizer relies on them to produce `token.lemma_`.
nlp = spacy.load(
    name="en_core_web_sm",
    disable=["parser", "ner"],
)

Set word vector length.

In [None]:
# Reset the vocabulary's word-vector table, requesting a width of 64.
# NOTE(review): the cells that follow only use lemmas and a CountVectorizer,
# so it is unclear whether word vectors are needed here — confirm.
nlp.vocab.reset_vectors(width=64)

Get lemmas from words in reviews

In [None]:
# Text normalisation helper built on the loaded spaCy pipeline
def spacy_preprocessed_text(text_to_process: str):
    """Return the lemmas of `text_to_process` joined by single spaces.

    The text is run through the module-level `nlp` pipeline; tokens flagged
    as stop words or punctuation are skipped, and the lemma of every
    remaining token is kept in its original order.
    """
    kept_lemmas = []
    for tok in nlp(text_to_process):
        # Skip tokens that carry no content for the bag-of-words model
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)

    return ' '.join(kept_lemmas)

start_time = time.time()

# :TODO: drop this line
# Work on the first 1000 reviews only to keep the run short. The explicit
# .copy() makes the subset an independent frame, so adding a column below
# does not trigger pandas' SettingWithCopyWarning.
df = df.iloc[:1000].copy()

# Apply the preprocess function to reviews
df['processed_review'] = df['review'].apply(spacy_preprocessed_text)

# Report the wall-clock time spent on preprocessing
print("--- %s seconds ---" % (time.time() - start_time))

Encode sentiments.

In [None]:
# Replace the text labels with integer codes (1 = positive, 0 = negative).
# Values that are not keys of the mapping become NaN, so this cell should be
# run only once per freshly loaded frame.
sentiment_codes = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(sentiment_codes)

Split dataframe to test and train data

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

Vectorize reviews

In [None]:
# Targets are taken straight from the split frames
y_train, y_test = train_df['sentiment'], test_df['sentiment']

# Bag-of-words features: the vocabulary is learned from the training reviews
# only and then reused, unchanged, to encode the test reviews
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['processed_review'])
X_test = vectorizer.transform(test_df['processed_review'])

Perform classification.

In [None]:
start_time = time.time()

# Build the logistic-regression classifier (at most 500 solver iterations)
# and fit it on the training features; `fit` returns the estimator itself
classifier = linear_model.LogisticRegression(max_iter=500).fit(X_train, y_train)

# Report the wall-clock time spent on fitting
print(f"--- {((time.time() - start_time)):.2f} seconds ---")

Make prediction.

In [None]:
# Predicted sentiment label for every review in the held-out test features
prediction = classifier.predict(X_test)

Get prediction score.

In [None]:
# Score the fitted classifier on the held-out test set
accuracy = classifier.score(X=X_test, y=y_test)

# Display the score as the cell output
accuracy

Get confusion matrix

In [None]:
# Count true vs. predicted labels, with rows/columns ordered as the
# classifier's own class list
confusion_matrix = metrics.confusion_matrix(y_test, prediction, labels=classifier.classes_)

Visualize confusion matrix.

In [None]:
# Wrap the matrix in a display object labelled with the classifier's classes
disp = metrics.ConfusionMatrixDisplay(confusion_matrix, display_labels=classifier.classes_)

# Draw it with a blue colour map (trailing ';' hides the returned object's repr)
disp.plot(cmap='Blues');

The number of misclassified reviews is low.

Visualize the distribution of labels in the test dataset and in the predictions.

In [None]:
# Density plot of the true test labels (red outline)
sns.kdeplot(
    y_test,
    fill=False,
    color='r',
    label='test subset',
)

# Density plot of the predicted labels (filled blue), drawn on the same axes
sns.kdeplot(
    prediction,
    fill=True,
    color='b',
    label='predicted',
)

# Title (duplicated "and" removed) and legend for the two curves
plt.title('Distribution of observations in test dataset and predicted dataset')
plt.legend();

Get report.

In [None]:
# Per-class precision / recall / F1 on the test set.
# The sentiment column was encoded as 0 = negative and 1 = positive, and the
# report lists labels in sorted order, so the display names follow that order
# instead of the uninformative 'class 1' / 'class 2'.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=prediction,
        target_names=['negative', 'positive'],
    )
)

## Use spaCy for classification
Get dataframe

In [None]:
# Reload the raw CSV: `df` was truncated and its labels re-encoded by the
# previous section, while this section starts again from the original data
df = pd.read_csv('../data/IMDB_Dataset.csv')

Create blank nlp object

In [None]:
# Start from an empty English pipeline (replaces the pre-trained `nlp` above)
nlp = spacy.blank("en")

# List the pipeline's component names
nlp.pipe_names

Add 'textcat' pipe

In [None]:
# Append a text-categorizer component unless the pipeline already has one,
# so re-running this cell does not try to add it twice
textcat_missing = 'textcat' not in nlp.pipe_names
if textcat_missing:
    nlp.add_pipe('textcat', last=True)

# List the pipeline's component names after the change
nlp.pipe_names

Add labels to pipe

In [None]:
# Fetch the text-categorizer component from the pipeline
textcat = nlp.get_pipe("textcat")

# Distinct values of the target column, in order of first appearance
categories = list(df['sentiment'].unique())

# Register every category as a label of the text categorizer.
# A plain loop is used because `add_label` is called for its side effect only.
for category in categories:
    textcat.add_label(category)

# Show the labels now known to the component
textcat.labels

## Train model
Split dataframe to train and test data

In [None]:
# Same 70/30 split and seed as in the previous section
train_data, test_data = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

Get texts and labels

In [None]:
# Review texts and their sentiment labels, kept as pandas Series
train_texts, train_labels = train_data['review'], train_data['sentiment']
test_texts, test_labels = test_data['review'], test_data['sentiment']

Set learning hyperparameters

In [None]:
# Training hyperparameters used by the cells below
n_epochs = 1  # number of passes over the training examples
batch_size = 8  # number of examples per minibatch
learn_rate = 0.001  # value assigned to the optimizer's learn_rate

Set optimizer 

In [None]:
# Initialize the pipeline; the returned optimizer is the object passed as
# `sgd` to nlp.update in the training loop
optimizer = nlp.initialize()

# Override the optimizer's learning rate with the configured hyperparameter
optimizer.learn_rate = learn_rate

Get information for each training instance for further use

In [None]:
# One spaCy Example per training review: the tokenized text paired with a
# 'cats' annotation that marks the review's own sentiment label with 1.0
train_examples = [
    Example.from_dict(
        predicted=nlp.make_doc(review_text),
        example_dict={'cats': {sentiment: 1.0}},
    )
    for review_text, sentiment in zip(train_texts, train_labels)
]

Train model using batches

In [None]:
print("Training the model...")

start_time = time.time()

# One pass over the shuffled training examples per epoch
for epoch in range(n_epochs):
    # Fresh loss accumulator for this epoch; it is handed to nlp.update
    # and read back after the last batch
    losses = {}
    random.shuffle(train_examples)

    for batch in minibatch(items=train_examples, size=batch_size):
        nlp.update(examples=batch, sgd=optimizer, losses=losses)

    print(f"Epoch {epoch+1} - Loss: {losses['textcat']:.2f}")

# Report the wall-clock time spent on training
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Display the loss dictionary left over from the last training epoch
losses

In [None]:
# Accuracy of the trained text categorizer on the held-out reviews
correct = 0
total = 0
for text, true_label in zip(test_texts, test_labels):
    scores = nlp(text).cats
    # The category with the highest score is taken as the prediction
    predicted_label = max(scores, key=scores.get)
    correct += int(predicted_label == true_label)
    total += 1

accuracy = correct / total
print(f"Accuracy: {accuracy:.2f}")