In [1]:
## Plays a sound when training is done
## Import the sound-alert dependencies
from IPython.display import Audio, display

def allDone(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'):
  """Play an audio clip in the notebook output to signal that a long task finished.

  Args:
    url: Address of the audio file to play. Defaults to the original alert clip,
      so existing ``allDone()`` calls behave exactly as before.
  """
  # autoplay=True asks the audio widget to start playing as soon as it is displayed
  display(Audio(url=url, autoplay=True))
## Pass a different `url` to allDone() to play whatever audio file you want

Setup: this notebook runs in the 'sentiment' virtual environment, with that environment's kernel active. Install spaCy and its small English model with:

    conda install -n sent.analyzer spacy
    python -m spacy download en_core_web_sm

# Quick Tutorial

### Tokenizing
Tokenization is the process of breaking down chunks of text into smaller pieces. spaCy comes with a default processing pipeline that begins with tokenization, making this process a snap. In spaCy, you can do either sentence tokenization or word tokenization:

    Word tokenization breaks text down into individual words.
    Sentence tokenization breaks text down into individual sentences.

In [3]:
import spacy

In [4]:
# Sample passage used by the tokenization, stop-word and lemma demos in the following cells
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""

In [5]:
# Load the small English pipeline (the model installed with `python -m spacy download en_core_web_sm`)
nlp = spacy.load('en_core_web_sm')


In [6]:
# Run the loaded pipeline over the sample text; the cells below iterate the result token by token
doc = nlp(text)

In [7]:
# Materialize every token of the processed document into a plain list
token_list = list(doc)

In [18]:
# Trailing semicolon suppresses the cell output; remove it to inspect the tokens
token_list;

In [19]:
# Keep only the tokens that spaCy does not flag as stop words
filtered_tokens = [
    word
    for word in doc
    if not word.is_stop
]
filtered_tokens;

In [20]:
# Pair each remaining token with its lemma (base form) for display
lemmas = [
    f'Token: {token}, lemma: {token.lemma_}'
    for token in filtered_tokens
]
lemmas;

# Building Your Own NLP Sentiment Analyzer
From the previous sections, you’ve probably noticed four major stages of building a sentiment analysis pipeline:

    Loading data
    Preprocessing
    Training the classifier
    Classifying data
    
For building a real-life sentiment analyzer, you’ll work through each of the steps that compose these stages. You’ll use the Large Movie Review Dataset compiled by Andrew Maas to train and test your sentiment analyzer. Once you’re ready, proceed to the next section to load your data.

## Loading and Preprocessing Data

We want to iterate through all the files in the dataset and load them into a list

<b>The loader below shuffles the reviews randomly to reduce any bias introduced by the order in which the files are loaded.</b>

In [11]:
import os
import random


def load_training_data(data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0, seed: "int | None" = None) -> tuple:
    """
    Load labelled reviews from disk, shuffle them, and split them into train/test sets.

    ``data_directory`` must contain ``pos`` and ``neg`` sub-directories holding
    one review per ``.txt`` file. Files with other extensions and reviews that
    are empty after stripping whitespace are skipped.

    Args:
        data_directory: Folder containing the ``pos`` and ``neg`` folders.
        split: Proportion (float) of the data used to train; the remainder tests.
        limit: If non-zero, keep only the first ``limit`` reviews after shuffling.
        seed: Optional shuffle seed. When given, a dedicated ``random.Random(seed)``
            is used so the same files always produce the same split. When ``None``
            the global ``random`` state is used, exactly as before.

    Returns:
        A ``(train, test)`` tuple of lists whose items are
        ``(text, {"cats": {"pos": bool, "neg": bool}})``.
    """
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = os.path.join(data_directory, label)
        # os.listdir order is arbitrary; sorting makes a seeded shuffle reproducible
        for filename in sorted(os.listdir(labeled_directory)):
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(labeled_directory, filename), encoding="utf8") as f:
                text = f.read()
            # The raw reviews contain HTML line breaks; turn them into blank lines
            text = text.replace("<br />", "\n\n")
            if not text.strip():
                continue
            spacy_label = {
                "cats": {
                    "pos": "pos" == label,
                    "neg": "neg" == label
                }}
            reviews.append((text, spacy_label))

    if seed is None:
        random.shuffle(reviews)
    else:
        random.Random(seed).shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    # Keep the `split` proportion intact instead of rebinding it to an index
    split_index = int(len(reviews) * split)
    return reviews[:split_index], reviews[split_index:]

# Training Your Classifier

Putting the spaCy pipeline together allows you to rapidly build and train a convolutional neural network (CNN) for classifying text data. This part of the project has three steps:
    
    1. Modifying the base spaCy pipeline to include the textcat component
    2. Building a training loop to train the textcat component
    3. Evaluating the progress of your model training after a given number of training loops

## Build Pipeline

## Build Your Training Loop to Train textcat

# Evaluating the Progress of Model Training

In [12]:
def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    """
    Score the text categorizer on held-out reviews using the "pos" category.

    Args:
        tokenizer: Callable that turns a raw review string into a document
            accepted by ``textcat.pipe``.
        textcat: Component whose ``pipe`` method yields documents carrying a
            ``cats`` mapping of label -> score.
        test_data: List of ``(text, {"cats": {"pos": bool, "neg": bool}})`` pairs.

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f-score"``. A metric whose
        denominator would be zero (including empty ``test_data``) is reported as 0.
    """
    if not test_data:
        # zip(*[]) cannot be unpacked into two names; nothing to score anyway
        return {"precision": 0.0, "recall": 0.0, "f-score": 0.0}

    reviews, labels = zip(*test_data)
    docs = (tokenizer(review) for review in reviews)

    # Plain integer counts: zero denominators are guarded explicitly below rather
    # than by seeding the counters with a small epsilon, which skews the metrics.
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for doc, gold in zip(textcat.pipe(docs), labels):
        true_label = gold["cats"]
        # Every document's cats hold both labels; the positive score alone is enough
        score = doc.cats.get("pos")
        if score is None:
            continue

        if score >= 0.5:
            if true_label["pos"]:
                true_positives += 1
            elif true_label["neg"]:
                false_positives += 1
        elif not true_label["neg"] and true_label["pos"]:
            # Predicted negative but labelled positive (true negatives are not
            # needed for precision/recall, so they are not counted)
            false_negatives += 1

    predicted_positive = true_positives + false_positives
    actual_positive = true_positives + false_negatives
    precision = true_positives / predicted_positive if predicted_positive else 0.0
    recall = true_positives / actual_positive if actual_positive else 0.0

    if precision + recall == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [13]:
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(training_data: list, test_data: list, iterations: int = 20) -> None:
    """
    Train a spaCy text categorizer on labelled reviews and save it to disk.

    NOTE(review): ``create_pipe``, ``begin_training`` and ``nlp.update(texts, labels)``
    look like the spaCy v2 training API -- confirm the installed spaCy version.

    Args:
        training_data: List of ``(text, label_dict)`` pairs used for the updates.
            The caller's list is not modified; shuffling is done on a copy.
        test_data: List of pairs in the same format, handed to ``evaluate_model``
            after every iteration.
        iterations: Number of passes over the training data.

    Side effects:
        Prints loss, precision, recall and F-score once per iteration, writes the
        pipeline to the ``model_artifacts`` directory, then calls ``allDone()``.
    """
    #Build Pipeline
    nlp = spacy.load('en_core_web_sm') # load the english model
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat", config = {"architecture": "simple_cnn"})
        nlp.add_pipe(textcat,last=True)
    else:
        textcat = nlp.get_pipe('textcat')

    textcat.add_label("pos")
    textcat.add_label('neg')

    # Shuffle a private copy each iteration so the caller's list keeps its order
    shuffled_data = list(training_data)

    #Train only textcat
    training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning Training")
        print("Loss\tPrecision\tRecall\tF-score")
        # Generator of batch sizes shared across iterations
        # (per spaCy's docs: starts at 4, compounds by 1.001, capped at 32)
        batch_sizes = compounding(4.0,32.0,1.001)

        for i in range(iterations):
            loss = {}  # filled in by nlp.update for this iteration
            random.shuffle(shuffled_data)
            batches = minibatch(shuffled_data,size = batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(
                    text,
                    labels,
                    drop = 0.2,
                    sgd = optimizer,
                    losses = loss
                )
            # Evaluate with the optimizer's averaged parameters swapped in
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(tokenizer = nlp.tokenizer,textcat=textcat,test_data = test_data) #evaluate model function

                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )
    #Save model (with the averaged parameters applied)
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")

    # Audible notification that training has finished
    allDone()

In [22]:
TEST_REVIEW = ''

def test_model(input_data: str = TEST_REVIEW):
    """
    Classify one review with the saved model and print the predicted sentiment.

    Args:
        input_data: Review text to classify. Defaults to the value ``TEST_REVIEW``
            had when this function was defined.

    Prints the review text, the predicted label ("Positive" or "Negative") and
    the score of that label. Returns nothing.
    """
    # Reload the pipeline stored in the "model_artifacts" directory
    classifier = spacy.load("model_artifacts")
    categories = classifier(input_data).cats
    pos_score = categories["pos"]
    neg_score = categories["neg"]
    # "Positive" only wins when strictly greater; a tie is reported as "Negative"
    if pos_score > neg_score:
        prediction, score = "Positive", pos_score
    else:
        prediction, score = "Negative", neg_score
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )

# Classifying Reviews

The first step with this new function will be to load the previously saved model. While you could use the model in memory, loading the saved model artifact allows you to optionally skip training altogether, which you’ll see later. Here’s the test_model() signature along with the code to load your saved model:

In [15]:
# Load a 2500-review sample (limit=2500) using the default 0.8 train/test split
train, test = load_training_data(limit = 2500)

In [16]:
# Train with the default 20 iterations; the model is saved to "model_artifacts" when done
train_model(train,test)

Beginning Training
Loss	Precision	Recall	F-score
11.092800595564768	0.7715517241046744	0.7458333333022569	0.7584745762390477
1.9931817497708835	0.8251121075863178	0.7666666666347222	0.7948164146524916
0.5325804846115716	0.8035714285355549	0.74999999996875	0.775862068932075
0.202051523645423	0.8156682027273885	0.7374999999692708	0.7746170677997979
0.07674300500184472	0.8165137614304351	0.7416666666357639	0.7772925763852709
0.03717643609240895	0.8224299065036247	0.7333333333027777	0.7753303964416154
0.017342102308703033	0.827102803699668	0.7374999999692708	0.7797356827850337
0.017334968310365184	0.8279069767056787	0.7416666666357639	0.7824175823831905
0.006170690562953496	0.8287037036653377	0.7458333333022569	0.7850877192638118
0.003991205511340468	0.8254716980742701	0.7291666666362847	0.774336283151578
0.002378680516658793	0.8301886792061232	0.7333333333027777	0.7787610619124441
0.0017550996148685272	0.8262910797734135	0.7333333333027777	0.7770419425705499
0.0012567566650076856	0.826291

In [23]:
import os
import random
import spacy
from spacy.util import minibatch, compounding



# Classify the default TEST_REVIEW text with the model saved in "model_artifacts"
test_model()

Review text: An odd thought occurred to me a few hours after I saw writer/director Wes Anderson's "The Grand Budapest Hotel" for the first time. It was that Anderson would be the ideal director for a film of "Lolita," or a mini-series of "Ada." Now I know that "Lolita" has been filmed, twice, but the fundamental problem with each version has nothing to do with ability to depict or handle risky content but with a fundamental misapprehension that Nabokov's famous novel took place in the "real world." For all the authentic horror and tragedy of its story, it does not. "I am thinking of aurochs and angels, the secret of durable pigments, prophetic sonnets, the refuge of art," Humbert Humbert, the book's monstrous protagonist/narrator, writes at the end of "Lolita." Nabokov created Humbert so Humbert might create his own world (with a combination of detail both geographically verifiable and stealthily fanciful), a refuge from his own wrongdoing.

"The Grand Budapest Hotel" uses a not dissim