In [32]:
import pandas as pd
import random
import spacy
from spacy.util import minibatch

nlp = spacy.blank('en')

In [6]:
spam = pd.read_csv('spam.csv')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [16]:
textcat = nlp.create_pipe (
    "textcat",
    config={
        "exclusive_classes": True,
        "architecture": "bow"
    })
nlp.add_pipe(textcat)

ValueError: [E007] 'textcat' already exists in pipeline. Existing names: ['textcat']

In [18]:
textcat.add_label("ham")
textcat.add_label("spam")

1

In [19]:
train_text = spam['text'].values

In [22]:
train_labels = [{'cats': {'ham': label == 'ham',
                         'spam': label == 'spam'}}
               for label in spam['label']]

In [23]:
train_data = list(zip(train_text, train_labels))

In [24]:
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [30]:
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()
batches = minibatch(train_data, size=8)

In [31]:
for batch in batches:
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [40]:
nlp.add_pipe(textcat)
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}

for epoch in range(10):
    random.shuffle(train_data)
    batches = minibatch(train_data, size=8)
    for batch in batches:
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)
        

{'textcat': 1.3284014923210634}
{'textcat': 1.6731042324448708}
{'textcat': 1.8601833738159996}
{'textcat': 1.981754278128804}
{'textcat': 2.06292024399758}
{'textcat': 2.1138158423080626}
{'textcat': 2.1489295823825976}
{'textcat': 2.1737224776938806}
{'textcat': 2.1928323683634687}
{'textcat': 2.207256399714952}


In [41]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[9.9995220e-01 4.7758371e-05]
 [1.5278918e-02 9.8472100e-01]]


In [45]:
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']


In [46]:
def load_data(csv_file, split=0.9):
    data = pd.read_csv(csv_file)
    
    # Shuffle data
    train_data = data.sample(frac=1, random_state=7)
    
    texts = train_data.text.values
    labels = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)}
              for y in train_data.sentiment.values]
    split = int(len(train_data) * split)
    
    train_labels = [{"cats": labels} for labels in labels[:split]]
    val_labels = [{"cats": labels} for labels in labels[split:]]
    
    return texts[:split], train_labels, texts[split:], val_labels

train_texts, train_labels, val_texts, val_labels = load_data('yelp_ratings.csv')

In [47]:
print('Texts from training data\n------')
print(train_texts[:2])
print('\nLabels from training data\n------')
train_labels[:2]

Texts from training data
------
["Some of the best sushi I've ever had....and I come from the East Coast.  Unreal toro, have some of it's available."
 "One of the best burgers I've ever had and very well priced. I got the tortilla burger and is was delicious especially with there tortilla soup!"]

Labels from training data
------


[{'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}}]

In [48]:
import spacy

# Create an empty model
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

# Add labels to text classifier
textcat.add_label("NEGATIVE")
textcat.add_label("POSITIVE")

1

In [49]:
from spacy.util import minibatch
import random

def train(model, train_data, optimizer):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    
    batches = minibatch(train_data, size=8)
    for batch in batches:
        # train_data is a list of tuples [(text0, label0), (text1, label1), ...]
        # Split batch into texts and labels
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
        
    return losses

In [50]:
spacy.util.fix_random_seed(1)
random.seed(1)

optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

8.702600155742998


In [51]:
text = "This tea cup was full of holes. Do not recommend."
doc = nlp(text)
print(doc.cats)

{'NEGATIVE': 0.773860514163971, 'POSITIVE': 0.22613944113254547}


In [52]:
def predict(model, texts): 
    # Use the model's tokenizer to tokenize each input text
    docs = [nlp.tokenizer(text) for text in texts]
    
    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores, _ = textcat.predict(docs)

    print(scores)
    
    # From the scores, find the class with the highest score/probability
    predicted_class = scores.argmax(axis=1)
    
    return predicted_class

In [53]:
texts = val_texts[34:38]
predictions = predict(nlp, texts)

for p, t in zip(predictions, texts):
    print(f"{textcat.labels[p]}: {t} \n")

[[0.00112233 0.9988777 ]
 [0.00190598 0.998094  ]
 [0.10516807 0.8948319 ]
 [0.9828241  0.01717597]]
POSITIVE: Came over and had their "Pick 2" lunch combo and chose their best selling 1/2 chicken sandwich with quinoa.  Both were tasty, the chicken salad is a bit creamy but was perfect with quinoa on the side.  This is a good lunch joint, casual and clean! 

POSITIVE: Went here last night and got oysters, fried okra, fries, and onion rings. I cannot complain. The portions were great and tasty!!! I will definitely be back for more. I cannot wait to try the crawfish boudin and soft shell crab. 

POSITIVE: This restaurant was fantastic! 
The concept of eating without vision was intriguing. The dinner was filled with laughs and good conversation. 

We were lead in a line to our table and each person to their seat. This was not just dark but you could not see something right in front of your face. 

The waiters/waitresses were all blind and allowed us to see how aware you need to be without

In [None]:
def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model. 
    
        Arguments
        ---------
        model: ScaPy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function
    
    """
    # Get predictions from textcat model (using your predict method)
    predicted_class = predict(model, texts)
    
    # From labels, get the true class as a list of integers (POSITIVE -> 1, NEGATIVE -> 0)
    true_class = [1 if label['cats']['POSITIVE'] is True else 0 for label in labels]
    
    # A boolean or int array indicating correct predictions
    correct_predictions = [predicted_class[i] == true_class[i] for i in range(0, len(predicted_class))]
    
    # The accuracy, number of correct predictions divided by all predictions
    accuracy = sum(correct_predictions)/len(correct_predictions)
    
    return accuracy

In [80]:
a = [1, 0, 1, 1, 0]
b = [1, 1, 1, 0, 0]
c = a == b
# c = [a[i] == b[i] for i in range(0, len(a))]
print(c)
# print(sum(c))
# print(len(c))
# sum(c)/len(c)

False


In [73]:
accuracy = evaluate(nlp, val_texts, val_labels)
print(f"Accuracy: {accuracy:.4f}")

[[3.7061116e-06 9.9999630e-01]
 [1.5990483e-05 9.9998403e-01]
 [2.3178293e-01 7.6821709e-01]
 ...
 [1.0764683e-04 9.9989235e-01]
 [9.9742532e-01 2.5747693e-03]
 [3.7466020e-06 9.9999630e-01]]
Accuracy: 0.9488


In [75]:
n_iters = 5
for i in range(n_iters):
    losses = train(nlp, train_data, optimizer)
    accuracy = evaluate(nlp, val_texts, val_labels)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f}")

[[1.8111309e-09 1.0000000e+00]
 [3.6852990e-07 9.9999964e-01]
 [9.2874631e-02 9.0712535e-01]
 ...
 [3.4402149e-06 9.9999654e-01]
 [9.9972624e-01 2.7379423e-04]
 [3.6849697e-11 1.0000000e+00]]
Loss: 2.632 	 Accuracy: 0.944
[[4.9286886e-09 1.0000000e+00]
 [2.7708534e-06 9.9999726e-01]
 [6.9757633e-02 9.3024242e-01]
 ...
 [9.8649059e-07 9.9999905e-01]
 [9.9964166e-01 3.5838332e-04]
 [1.1472999e-11 1.0000000e+00]]
Loss: 2.094 	 Accuracy: 0.945
[[5.8475030e-10 1.0000000e+00]
 [2.9355556e-07 9.9999976e-01]
 [2.0976745e-01 7.9023260e-01]
 ...
 [3.0348113e-07 9.9999964e-01]
 [9.9982399e-01 1.7603805e-04]
 [2.1351785e-12 1.0000000e+00]]
Loss: 1.654 	 Accuracy: 0.943
[[7.14472023e-11 1.00000000e+00]
 [1.10835266e-07 9.99999881e-01]
 [9.30078924e-02 9.06992137e-01]
 ...
 [8.82729001e-09 1.00000000e+00]
 [9.99905467e-01 9.45439024e-05]
 [1.24056351e-13 1.00000000e+00]]
Loss: 1.421 	 Accuracy: 0.943
[[3.5418141e-11 1.0000000e+00]
 [1.0822401e-07 9.9999988e-01]
 [1.2319910e-01 8.7680095e-01]
 ...
 [