# References

- https://realpython.com/sentiment-analysis-python/
- https://github.com/realpython/materials/blob/master/nlp-sentiment-analysis/sentiment_analyzer.py

# Let's Get Started

## Tokenizing

In [2]:
import spacy
# Sample passage used by the tokenizing examples in this notebook.
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""
# Run the small English pipeline over the passage.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
# Materialize the Doc's tokens as a plain list so the cell can display them.
token_list = list(doc)
token_list

[,
 Dave,
 watched,
 as,
 the,
 forest,
 burned,
 up,
 on,
 the,
 hill,
 ,,
 ,
 only,
 a,
 few,
 miles,
 from,
 his,
 house,
 .,
 The,
 car,
 had,
 ,
 been,
 hastily,
 packed,
 and,
 Marta,
 was,
 inside,
 trying,
 to,
 round,
 ,
 up,
 the,
 last,
 of,
 the,
 pets,
 .,
 ",
 Where,
 could,
 she,
 be,
 ?,
 ",
 he,
 wondered,
 ,
 as,
 he,
 continued,
 to,
 wait,
 for,
 Marta,
 to,
 appear,
 with,
 the,
 pets,
 .,
 ]

## Removing Stop Words

In [3]:
# Drop every token that spaCy flags as a stop word.
filtered_tokens = list(filter(lambda tok: not tok.is_stop, doc))
filtered_tokens

[,
 Dave,
 watched,
 forest,
 burned,
 hill,
 ,,
 ,
 miles,
 house,
 .,
 car,
 ,
 hastily,
 packed,
 Marta,
 inside,
 trying,
 round,
 ,
 pets,
 .,
 ",
 ?,
 ",
 wondered,
 ,
 continued,
 wait,
 Marta,
 appear,
 pets,
 .,
 ]

## Normalizing Words


### Lemmatization

- Lemmatization is generally more powerful than stemming, and it’s the only normalization strategy offered by spaCy.

In [5]:
# Pair each remaining token with its lemma for side-by-side display.
lemmas = list(
    map(lambda tok: f"Token: {tok}, lemma: {tok.lemma_}", filtered_tokens)
)
lemmas

['Token: \n, lemma: \n',
 'Token: Dave, lemma: Dave',
 'Token: watched, lemma: watch',
 'Token: forest, lemma: forest',
 'Token: burned, lemma: burn',
 'Token: hill, lemma: hill',
 'Token: ,, lemma: ,',
 'Token: \n, lemma: \n',
 'Token: miles, lemma: mile',
 'Token: house, lemma: house',
 'Token: ., lemma: .',
 'Token: car, lemma: car',
 'Token: \n, lemma: \n',
 'Token: hastily, lemma: hastily',
 'Token: packed, lemma: pack',
 'Token: Marta, lemma: Marta',
 'Token: inside, lemma: inside',
 'Token: trying, lemma: try',
 'Token: round, lemma: round',
 'Token: \n, lemma: \n',
 'Token: pets, lemma: pet',
 'Token: ., lemma: .',
 'Token: ", lemma: "',
 'Token: ?, lemma: ?',
 'Token: ", lemma: "',
 'Token: wondered, lemma: wonder',
 'Token: \n, lemma: \n',
 'Token: continued, lemma: continue',
 'Token: wait, lemma: wait',
 'Token: Marta, lemma: Marta',
 'Token: appear, lemma: appear',
 'Token: pets, lemma: pet',
 'Token: ., lemma: .',
 'Token: \n, lemma: \n']

## Vectorizing Text


In [6]:
# Display the vector of the second filtered token ("Dave" in the output of the
# stop-word cell; index 0 is the leading newline token).
filtered_tokens[1].vector

array([ 1.8371646 ,  1.4529226 , -1.6147211 ,  0.678362  , -0.6594443 ,
        1.6417935 ,  0.5796405 ,  2.3021278 , -0.13260496,  0.5750932 ,
        1.5654886 , -0.6938864 , -0.59607106, -1.5377437 ,  1.9425622 ,
       -2.4552505 ,  1.2321601 ,  1.0434952 , -1.5102385 , -0.5787632 ,
        0.12055647,  3.6501784 ,  2.6160972 , -0.5710199 , -1.5221789 ,
        0.00629176,  0.22760668, -1.922073  , -1.6252862 , -4.226225  ,
       -3.495663  , -3.312053  ,  0.81387717, -0.00677544, -0.11603224,
        1.4620426 ,  3.0751472 ,  0.35958546, -0.22527039, -2.743926  ,
        1.269633  ,  4.606786  ,  0.34034157, -2.1272311 ,  1.2619178 ,
       -4.209798  ,  5.452852  ,  1.6940253 , -2.5972986 ,  0.95049495,
       -1.910578  , -2.374927  , -1.4227567 , -2.2528825 , -1.799806  ,
        1.607501  ,  2.9914255 ,  2.8065152 , -1.2510269 , -0.54964066,
       -0.49980402, -1.3882618 , -0.470479  , -2.9670253 ,  1.7884955 ,
        4.5282774 , -1.2602427 , -0.14885521,  1.0419178 , -0.08

## Using Machine Learning Classifiers to Predict Sentiment

tools:
- TensorFlow
- PyTorch
- scikit-learn

Luckily, spaCy provides a fairly straightforward built-in text classifier that you’ll learn about a little later. First, however, it’s important to understand the general workflow for any sort of classification problem.



### How Classification Works

- Split your data into training and evaluation sets.
- Select a model architecture.
- Use training data to train your model.
- Use test data to evaluate the performance of your model.
- Use your trained model on new data to generate predictions, which in this case will be a number between -1.0 and 1.0.

Machine learning practitioners often split their datasets into three sets:

- Training
- Validation
- Test

## Building Your Own NLP Sentiment Analyzer


### Functions

In [25]:
import os
import random
import spacy
from spacy.util import minibatch, compounding
import pandas as pd

# Default review text that test_model() scores when no input is supplied.
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less
humor.)
"""

# Module-level list that the driver cell turns into a DataFrame after training.
eval_list = []

def train_model(
    training_data: list, test_data: list, iterations: int = 20
) -> None:
    """Train a spaCy text categorizer and save it to ``model_artifacts``.

    Loads ``en_core_web_sm``, adds (or reuses) a ``textcat`` pipe with the
    ``pos`` and ``neg`` labels, and trains only that pipe. After every
    iteration the model is scored on ``test_data`` using the optimizer's
    averaged parameters; the loss and metrics are printed, and the metrics
    dict is appended to the module-level ``eval_list`` so the caller can
    plot the training history.

    NOTE(review): ``create_pipe`` / ``begin_training`` / ``nlp.update(texts,
    labels, ...)`` look like the spaCy v2 training API -- confirm the
    installed spaCy version before upgrading.

    Args:
        training_data: ``(text, {"cats": {...}})`` pairs, as returned by
            ``load_training_data``. Shuffled in place on every iteration.
        test_data: Pairs in the same format, used only for evaluation.
        iterations: Number of passes over ``training_data``.
    """
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            # Evaluate with the averaged parameters rather than the raw ones.
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data,
                )
                # Record this iteration's metrics; without this the history
                # list stays empty and there is nothing to plot afterwards.
                eval_list.append(evaluation_results)
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    """Score ``textcat`` on ``test_data`` for the ``pos`` label.

    Args:
        tokenizer: Callable that turns a review text into the object
            ``textcat.pipe`` consumes.
        textcat: Object whose ``pipe(docs)`` yields results exposing a
            ``cats`` mapping that contains a ``"pos"`` score.
        test_data: ``(text, {"cats": {"pos": bool, "neg": bool}})`` pairs.

    Returns:
        A dict with ``"precision"``, ``"recall"`` and ``"f-score"``. All
        three are zero when ``test_data`` is empty.
    """
    if not test_data:
        # zip(*[]) yields nothing to unpack; report zero scores instead of
        # raising ValueError on an empty evaluation set.
        return {"precision": 0.0, "recall": 0.0, "f-score": 0}

    reviews, labels = zip(*test_data)
    docs = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    false_negatives = 1e-8
    for doc, label in zip(textcat.pipe(docs), labels):
        true_label = label["cats"]
        # The pos score alone decides the prediction; a score of 0.5 or more
        # counts as a positive prediction. True negatives are not needed for
        # precision or recall, so they are not tallied.
        score = doc.cats["pos"]
        if score >= 0.5 and true_label["pos"]:
            true_positives += 1
        elif score >= 0.5 and true_label["neg"]:
            false_positives += 1
        elif score < 0.5 and true_label["pos"]:
            false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}


def test_model(input_data: str = TEST_REVIEW):
    """Print the saved model's sentiment prediction for ``input_data``.

    Loads the pipeline stored under ``model_artifacts``, runs it on the text
    and prints the text, the winning label and that label's score.
    """
    # Load the trained pipeline from disk and score the text in one pass.
    categories = spacy.load("model_artifacts")(input_data).cats
    positive_score = categories["pos"]
    negative_score = categories["neg"]
    # Only a strictly larger pos score yields "Positive"; ties are "Negative".
    is_positive = positive_score > negative_score
    prediction = "Positive" if is_positive else "Negative"
    score = positive_score if is_positive else negative_score
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )


def load_training_data(
    data_directory: str = "dataset/aclImdb/train", split: float = 0.8, limit: int = 0
) -> tuple:
    """Read labeled reviews from disk and split them into two lists.

    Every ``.txt`` file under ``<data_directory>/pos`` and
    ``<data_directory>/neg`` is read, ``<br />`` markers are replaced with
    blank lines, and whitespace-only files are skipped. The reviews are
    shuffled, optionally truncated to ``limit`` items, and then cut at the
    ``split`` fraction.

    Args:
        data_directory: Directory holding the ``pos`` and ``neg`` folders.
        split: Fraction of the (possibly truncated) reviews placed in the
            first returned list.
        limit: Maximum number of reviews to keep; ``0`` keeps all of them.

    Returns:
        Two lists of ``(text, {"cats": {"pos": bool, "neg": bool}})`` pairs:
        the first ``split`` fraction and the remainder.
    """
    reviews = []
    for label in ("pos", "neg"):
        labeled_directory = f"{data_directory}/{label}"
        # Both flags follow from the folder currently being read.
        cats = {"pos": label == "pos", "neg": label == "neg"}
        for file_name in os.listdir(labeled_directory):
            if not file_name.endswith(".txt"):
                continue
            with open(f"{labeled_directory}/{file_name}", encoding="utf8") as handle:
                text = handle.read().replace("<br />", "\n\n")
            if not text.strip():
                continue
            # Each review gets its own copy of the label mapping.
            reviews.append((text, {"cats": dict(cats)}))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    cutoff = int(len(reviews) * split)
    return reviews[:cutoff], reviews[cutoff:]

In [26]:
# Start from an empty history so re-running this cell never mixes in rows
# left over from an earlier run.
eval_list.clear()
train, test = load_training_data(limit=25)
print("Training model")
train_model(train, test)
# Plot whatever metrics eval_list holds. The plot must be requested on the
# instance: pd.DataFrame.plot(df) only constructs a plot accessor and never
# draws anything. An empty frame has no numeric data to plot, so skip it.
df = pd.DataFrame(eval_list)
if not df.empty:
    df.plot()
print("Testing model")
test_model()

Training model
Beginning training
Loss	Precision	Recall	F-score
Training iteration 0
0.1594244632869959	0.5999999988	0.9999999966666667	0.749999998125
Training iteration 1
0.14076493866741657	0.5999999988	0.9999999966666667	0.749999998125
Training iteration 2
0.10239794291555882	0.0	0.0	0
Training iteration 3
0.06422404432669282	0.6666666644444444	0.6666666644444444	0.6666666644444444
Training iteration 4
0.031850377563387156	0.6666666644444444	0.6666666644444444	0.6666666644444444
Training iteration 5
0.016199110366869718	0.6666666644444444	0.6666666644444444	0.6666666644444444
Training iteration 6
0.024295946509937494	0.7499999981250001	0.9999999966666667	0.8571428546938776
Training iteration 7
0.006184204679811955	0.0	0.0	0
Training iteration 8
0.012749479505146155	0.0	0.0	0
Training iteration 9
0.00023829929494922908	0.4999999975	0.3333333322222222	0.3999999984
Training iteration 10
0.0051634207387536435	0.6666666644444444	0.6666666644444444	0.6666666644444444
Training iteration 11