# Text preprocessing with Spacy 

(Alternatives can be torchtext, keras text-preprocessing module, gensim, etc.)

## Installing required libraries and files

In [0]:
!python -m spacy download en_core_web_md

Collecting en_core_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz#egg=en_core_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
[K     |████████████████████████████████| 826.9MB 1.2MB/s 
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hcanceled
Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/spacy/__main__.py", line 35, in <module>
    plac.call(commands[command], sys.argv[1:])
  File "/usr/local/lib/python3.6/dist-packages/plac_core.py", line 328, in call


In [0]:
import numpy as np
import pandas as pd
import spacy

nlp=spacy.load("en_core_web_md")

## Preprocessing

In [0]:
# open file and read the data

with open("sample-train","r") as file:
  string = file.read()
  sentences = string.lower().strip().split("\n")                     # this converts the data into a list of sentences

In [0]:
# total no. of sentences in the provided sample dataset

len(sentences)

200

In [0]:
# example sentences

sentences[0]

'love finding out important things about my friends over twitter and snapchat... 😒'

## Tokenization

In [0]:
######### test ###########

tokens = sentences[0].split()                           #see how the split function breaks the sentence into tokens
print(tokens)

['love', 'finding', 'out', 'important', 'things', 'about', 'my', 'friends', 'over', 'twitter', 'and', 'snapchat...', '😒']


In [0]:
######### test ###########

tokens = nlp(sentences[0])                               # see how spacy breaks the sentence and analyse the difference
print([tok.text for tok in tokens])

['love', 'finding', 'out', 'important', 'things', 'about', 'my', 'friends', 'over', 'twitter', 'and', 'snapchat', '...', '😒']


In [0]:
######### test ###########

tokens = "what is wrong.Don't disturb me".split()
print(tokens)

['what', 'is', "wrong.Don't", 'disturb', 'me']


In [0]:
######### test ###########

tokens = nlp("what is wrong.Don't disturb me ncjfkdls.")
print([tok.text for tok in tokens])

['what', 'is', 'wrong', '.', "Don't", 'disturb', 'me', 'ncjfkdls', '.']


In [0]:
# other attributes provided by spacy

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

what True 5.135811 False
is True 4.890306 False
wrong True 5.3789964 False
. True 4.9316354 False
Don't True 7.6098676 False
disturb True 6.343346 False
me True 5.75488 False
ncjfkdls False 0.0 True
. True 4.9316354 False


## Reading data labels

In [0]:
# reading labels

labels = pd.read_csv("sample-train-label", header=None, names = ["labels"])

In [0]:
labels.head()

Unnamed: 0,labels
0,1
1,1
2,1
3,1
4,0


## Creating a vocabulary

In [0]:
# this code block basically accounts for each word in the text

words = []
sen_list=[]

for sen in sentences:
  tokens = nlp(sen)
  temp = [tok.text for tok in tokens]
  sen_list.append(temp)
  words+=temp
  

print(len(sen_list))
print(sen_list[0])
print("Total no. of words in the text: ",len(words))
words = list(set(words))
print("No. of unique words: ",len(words))
print(words[:30])

200
['love', 'finding', 'out', 'important', 'things', 'about', 'my', 'friends', 'over', 'twitter', 'and', 'snapchat', '...', '😒']
Total no. of words in the text:  3854
No. of unique words:  1491
['@arvindkejriwal', 'http://t.co/fvmol1kltx', 'paid', '@', 'attractive', '@billbarnwell', 'soexcited', 'p', 'sound', 'twitter', 'deserve', 'but', 'tshirt', 'thinking', 'seems', '*', 'highway', 'too', 'hospital', 'writing', 'their', 'thrash', 'i', 'dramatictvactress', 'o_o', 'service', 'watch', '5x11', 'row', 'ariana']


In [0]:
vocab = []

print(len(nlp.vocab.strings))                          # shows all the words already present in the spacy vocabulary
print("@billbarnwell" in nlp.vocab.strings)
print("what" in nlp.vocab.strings)
print(len(nlp.vocab.strings))

1476551
True
True
1476551


### Note:

If you pass any sentence with new words into the spacy nlp module, it adds these new words into its default vocabulary. Thus these words will no longer be new for the vocabulary the next time they are encountered.
Thus, it is advisable to reload the module in order to reset the vocabulary.

In [0]:
nlp = spacy.load("en_core_web_md")

In [0]:
# Creating vocabulary out of the given words

vocab = ["<pad>","<unk>","<start>","<end>"]

for word in words:
  if word in nlp.vocab.strings:                         # only take common words, which are already present in the nlp vocabulary
    vocab.append(word)

In [0]:
# checking the custom vocabulary attributes

print(len(vocab))
print(vocab[:20])

1284
['<pad>', '<unk>', '<start>', '<end>', 'paid', '@', 'attractive', 'p', 'sound', 'twitter', 'deserve', 'but', 'tshirt', 'thinking', 'seems', '*', 'highway', 'too', 'hospital', 'writing']


## Creting Dictionaries

In [0]:
idx2word = {i:vocab[i] for i in range(len(vocab))}                   # mapping form index to word
word2idx = {word:idx for idx,word in idx2word.items()}               # mapping from words to indices

In [0]:
print(word2idx["what"])
print(word2idx["<pad>"])

96
0


In [0]:
print(idx2word[1])
print(idx2word[22])

<unk>
i


## Converting sentences to sequence of indices

In [0]:
def to_indices(sentences):
  indices=[]
  count=0
  
  for sentence in sentences:
    
    index=[word2idx["<start>"]]
    
    for i in list(nlp(sentence.lower())):
      if i.text in vocab:
        #print(i.text,type(i.text))
        index.append(word2idx[i.text])
        
      else:
        #print("unknown encountered")
        index.append(word2idx["<unk>"])
        
    index.append(word2idx["<end>"])
    indices.append(index)
    #count=count+1
    #print(count)
  return indices

In [0]:
print(to_indices(["This is a sample sentence. What's up bro.","This is a second sentence that will blow your mind."]))

[[2, 1193, 534, 650, 1, 1, 1260, 96, 860, 302, 1, 1260, 3], [2, 1193, 534, 650, 697, 1, 1060, 120, 1, 1046, 1, 1260, 3]]


In [0]:
indices = to_indices(sentences)
print(indices[:3])

[[2, 707, 731, 1179, 789, 837, 665, 825, 204, 744, 9, 582, 417, 793, 1054, 3], [2, 825, 149, 607, 483, 521, 851, 235, 844, 22, 373, 650, 773, 1261, 797, 1, 400, 1, 3], [2, 1, 283, 20, 363, 937, 1193, 788, 1013, 1013, 1013, 1013, 1013, 96, 665, 438, 913, 619, 1132, 1089, 3]]


## Padding

In [0]:
MAX_LEN = max(list(map(len,indices)))
print(MAX_LEN)

37


In [0]:
# padding function


def pad_sentences(sentences, max_len):
  
  pad_index = word2idx["<pad>"]
  
  for sen in sentences:
    sen += (max_len-len(sen))*[pad_index]
  
    

In [0]:
temp_indices = indices

pad_sentences(temp_indices,MAX_LEN)

print(temp_indices[:4])

[[2, 707, 731, 1179, 789, 837, 665, 825, 204, 744, 9, 582, 417, 793, 1054, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 825, 149, 607, 483, 521, 851, 235, 844, 22, 373, 650, 773, 1261, 797, 1, 400, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 1, 283, 20, 363, 937, 1193, 788, 1013, 1013, 1013, 1013, 1013, 96, 665, 438, 913, 619, 1132, 1089, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 188, 616, 1046, 119, 454, 761, 758, 942, 650, 61, 190, 1260, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [0]:
model_input = np.array(temp_indices)

print(model_input.shape)

(200, 37)


## Generating Word Vectors

In [0]:
tokens = nlp("Go away. cdbcjdksnk")
for tok in tokens:
  print(tok.text)
  print(tok.is_oov)
  print(tok.vector.shape)
  print(tok.vector)

Go
False
(300,)
[ 1.3893e-01 -1.9056e-02 -3.3891e-01  1.2151e-01  3.6523e-01 -1.7391e-01
 -2.6735e-02 -5.0335e-02  2.4743e-01  2.4531e+00 -4.2113e-01  2.3632e-01
  2.0513e-01 -1.0937e-02 -1.1480e-01 -3.7648e-02 -1.3440e-01  8.6124e-01
 -3.5803e-01  9.2525e-02  2.8075e-01  1.3649e-01  2.0819e-01  6.0206e-02
 -1.8229e-01  1.0172e-01 -1.3200e-01 -3.1598e-01  2.2241e-01 -1.9076e-01
 -1.0884e-02  1.6988e-01  8.0345e-03  1.3337e-01  1.7724e-01 -1.9162e-01
  3.3681e-01  3.0186e-01  6.1654e-02  7.6906e-03 -5.4406e-01  5.0142e-02
 -4.3115e-02 -2.6241e-01  4.7462e-02  3.3670e-01 -2.8649e-01 -2.7414e-01
  2.6776e-02 -6.5939e-02  1.1021e-01  2.8869e-01  4.6712e-01  1.2063e-01
  3.3831e-01 -3.0427e-04 -1.2116e-01 -1.5900e-01 -1.0514e-01 -3.8560e-02
 -6.2205e-02  3.5631e-02 -1.7852e-01 -1.3308e-01  2.6103e-01 -1.1082e-01
 -2.7463e-01  1.8556e-01  4.5257e-01  3.0336e-01  6.1801e-02  7.7310e-02
  3.4645e-01  3.6526e-03  4.6815e-01  2.0228e-02 -2.5509e-02 -1.9465e-02
 -5.3998e-03  8.6497e-02 -5.3099e-0