In [None]:
# import warnings
import time
import random

# For Data science
import spacy
from spacy.util import minibatch

import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

from sklearn import (
    metrics,
    linear_model,
)

from sources import three_sigma_cleared

# Getting data, observations
## Get dataset

In [None]:
# Read the IMDB reviews CSV into a DataFrame
df = pd.read_csv("../data/IMDB_Dataset.csv")

# Preview the first rows
df.head()

In [None]:
# Name of the frame's last column, used as the grouping key below
review = df.columns[-1]

# Row count for each distinct value found in that column
grouped = df.groupby(by=review).size()

In [None]:
# Bar chart of the per-group row counts
grouped.plot.bar(xlabel='Review', ylabel='Count', title='Count of Reviews');

The dataset is balanced across the classes plotted above.

# Preprocessing

In [None]:
# Add a per-review word count (number of whitespace-separated tokens)
df['word_count'] = df['review'].apply(lambda text: len(text.split()))

# Histogram (with a KDE overlay) of the word counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="word_count", kde=True, ax=ax)
ax.set_title("Distribution of Word Counts")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Pass the frame and the 'word_count' column to the project helper
# `three_sigma_cleared` and keep its result as `df2`.
# NOTE(review): presumably this removes rows whose word count lies outside
# three standard deviations — confirm against `sources.three_sigma_cleared`.
# NOTE(review): the later cells visible in this notebook keep using `df`,
# not `df2` — confirm whether the cleaned frame was meant to be used instead.
df2 = three_sigma_cleared(
    dataset=df,
    feature_names=['word_count'],
)

In [None]:
# Spread of the word counts within each sentiment class
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='sentiment', y='word_count', data=df, ax=ax)
ax.set_title('Box Plot of Sentiment vs Word Count')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count');

# Lazy use

In [None]:
# Load spaCy's small English pipeline.
# The optional `disable=[...]` argument (e.g. "tagger", "attribute_ruler",
# "lemmatizer") is deliberately not passed, so no component is switched off.
nlp = spacy.load("en_core_web_sm")

# Reset the vocab's vector table, using a vector width of 64
nlp.vocab.reset_vectors(width=64)

# Preprocess text function
def spacy_preprocessed_text(text):
    """Lemmatize `text` with the global `nlp` pipeline.

    Stop words and punctuation tokens are skipped; the lemmas of the
    remaining tokens are joined with single spaces and returned as one string.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)

    return ' '.join(kept_lemmas)

# Time the preprocessing step with a monotonic, high-resolution clock
start_time = time.perf_counter()

# Work on the first 1000 reviews only to keep the run time short.
# `.copy()` makes the subset an independent frame, so adding a column below
# does not write into a slice of the original frame (SettingWithCopyWarning).
df = df.iloc[:1000].copy()

# Lemmatized, stop-word-free version of every review
df['processed_review'] = df['review'].apply(spacy_preprocessed_text)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Names of the pipeline components listed in `nlp.pipe_labels`
list(nlp.pipe_labels)

In [None]:
# Display the names of the components in the loaded pipeline
nlp.pipe_names

In [None]:
# Encode the sentiment strings as integers: positive -> 1, negative -> 0.
# The dtype guard keeps this cell idempotent: once the column is numeric,
# mapping it again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Display the training subset produced by the split
train_df

In [None]:
# Targets of the two subsets
y_train, y_test = train_df['sentiment'], test_df['sentiment']

# Bag-of-words features: the vocabulary is fitted on the training texts only
# and then reused unchanged to transform the test texts
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['processed_review'])
X_test = vectorizer.transform(test_df['processed_review'])

In [None]:
# Logistic regression (at most 500 solver iterations) fitted on the training features
classifier = linear_model.LogisticRegression(max_iter=500)
classifier.fit(X_train, y_train)

# Predicted labels for the held-out reviews
prediction = classifier.predict(X_test)

In [None]:
# Confusion matrix with rows/columns ordered as in `classifier.classes_`
confusion_matrix = metrics.confusion_matrix(y_test, prediction, labels=classifier.classes_)

confusion_matrix

In [None]:
# Draw the confusion matrix as a heat map labelled with the class values
disp = metrics.ConfusionMatrixDisplay(confusion_matrix, display_labels=classifier.classes_)
disp.plot(cmap='Blues');

In [None]:
# Density plots of the true and the predicted labels of the test subset

# True labels: red outline
sns.kdeplot(y_test, fill=False, color='r', label='test subset')

# Predicted labels: filled blue area
sns.kdeplot(prediction, fill=True, color='b', label='predicted')

# Title (the duplicated "and" was removed) and legend
plt.title('Distribution of observations in test dataset and predicted dataset')
plt.legend();

In [None]:
# Per-class precision / recall / F1 on the test subset.
# The names follow the integer encoding of the `sentiment` column
# (0 = negative, 1 = positive); `labels` pins that order explicitly so the
# names cannot be attached to the wrong class.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=prediction,
        labels=[0, 1],
        target_names=['negative', 'positive'],
    )
)

Use spaCy for classification

In [None]:
# Start again from a freshly loaded small English pipeline
nlp = spacy.load('en_core_web_sm')

# Make sure the pipeline contains a text categorizer and keep a handle on it.
if 'textcat' not in nlp.pipe_names:
    # Append the built-in textcat component as the last pipeline step
    # (no custom component config is passed).
    textcat = nlp.add_pipe('textcat', last=True)
else:
    # Reuse the existing component; the result used to be discarded here,
    # which left `textcat` undefined on this path.
    textcat = nlp.get_pipe('textcat')

# Show the name of the last pipeline component
nlp.pipe_names[-1]

In [None]:
# Raw review texts used as training input
train_texts = df['review'].values

# One {'cats': {...}} annotation per review.
# The `sentiment` column has already been encoded as 1/0 earlier in the
# notebook, so comparing against the strings alone would mark every example
# as neither positive nor negative. Both encodings are accepted here.
train_labels = [
    {
        'cats': {
            'positive': label in (1, 'positive'),
            'negative': label in (0, 'negative'),
        }
    } for label in df['sentiment']
]

# (text, annotation) pairs
train_data = list(zip(train_texts, train_labels))

In [None]:
# Every pipeline component except 'textcat' (to be disabled during training)
other_pipes = [name for name in nlp.pipe_names if name != 'textcat']

In [None]:
# Create an optimizer for the current pipeline via `nlp.create_optimizer()`
optimizer = nlp.create_optimizer()

In [None]:
# nlp= spacy.load(r'en_core_web_sm')
# 
# # Adding the built-in textcat component to the pipeline.
# textcat = nlp.add_pipe("textcat")
# 
# # Adding the labels to textcat
# textcat.add_label("positive")
# textcat.add_label("negative")
# 
# print(nlp.pipe_names)

In [None]:
# train_data = list(zip(train_texts, train_labels))

In [None]:
# train the model
# optimizer = nlp.begin_training()

# losses = {}
# 
# for epoch in range(2):
#     random.shuffle(train_data)
    # batches = minibatch(train_data,size=10)
    
    # for batch in batches:
    #     texts,labels = zip(*batch)
    #     nlp.update(texts, labels, sgd=optimizer, losses=losses)
        
    # print(losses)

In [None]:

# Construction via add_pipe with custom model
# config = {"model": {"@architectures": "my_textcat"}}
# parser = nlp.add_pipe("textcat", config=config)

# Construction from class
# Use 'MultiLabel_TextCategorizer' for multi-label classification

# textcat = TextCategorizer(nlp.vocab, model, threshold=0.5)

In [None]:
# # Create a blank spacy model
# nlp = spacy.blank("en")
# 
# textcat = nlp.create_pipe("textcat", config={
#                                         "exclusive_classes": True,
#                                         "architecture": "simple_cnn"})
# textcat.add_label("positive")
# textcat.add_label("negative")
# # nlp.add_pipe(textcat, last=True)