# Vectorization

Vectorizing is the process of encoding text as integers to create feature vectors

A feature vector is an n-dimensional vector of numerical features that represent some object

This is important because it formats the data in a way that facilitates the building of ML models

The model will associate correlations between the values in the document term matrix and the labels

There are different methods of vectorization:

-- Count

-- N-grams

-- Term Frequency - Inverse Document Frequency (TFIDF)

We will start with the Count Vectorizer

In [25]:
import nltk
import re
import pandas as pd
import string

In [59]:
pd.set_option('display.max_colwidth', 100)

stops = nltk.corpus.stopwords.words('english')

data = pd.read_csv('SMSSpamCollection.tsv', sep = '\t', header = None)

data.columns = ['label', 'text']

In [27]:
data.head(10)

Unnamed: 0,label,text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ..."
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ..."


In [22]:
# create lemmatizer

wn = nltk.WordNetLemmatizer()

In [34]:
# make function with all text cleaning steps including lemmatization

# we will not directly create a new column using this function - it will go into the Count Vectorizer as the analyzer

def clean_text(t):
    
    text = ''.join(x for x in t if x not in string.punctuation)
    
    tokens = re.split('\W+', text)
    
    text = [wn.lemmatize(word.lower()) for word in tokens if word not in stops]
    
    return text

In [35]:
# import count vectorizer from scikit learn

from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# create instance of CountVectorizer()

count_vec = CountVectorizer(analyzer=clean_text)

In [37]:
# fit and transform the data 

X_counts = count_vec.fit_transform(data['text'])

In [38]:
# shape of document term matrix - 11039 unique words counted

X_counts.shape

(5568, 9013)

In [39]:
# lets see the words

count_vec.get_feature_names()

['',
 '0',
 '008704050406',
 '0089my',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '020603',
 '0207',
 '02070836089',
 '02072069400',
 '02073162414',
 '02085076972',
 '020903',
 '021',
 '050703',
 '0578',
 '06',
 '060505',
 '061104',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '071104',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '0784987',
 '0789xxxxxxx',
 '0794674629107880867867',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '0

In [47]:
# it will be difficult to visualize a matrix with this many words, so lets use a smaller sample

data_sample = data[0:20]

In [48]:
# create new count vectorizer object

count_vec_sample = CountVectorizer(analyzer=clean_text)

In [49]:
# fit and transform data for model

xcounts_sample = count_vec_sample.fit_transform(data_sample['text'])

In [50]:
# check out shape

xcounts_sample.shape

(20, 219)

In [51]:
# check out feature names

count_vec_sample.get_feature_names()

['08002986030',
 '08452810075over18s',
 '09061701461',
 '1',
 '100',
 '100000',
 '11',
 '12',
 '150pday',
 '16',
 '2',
 '20000',
 '2005',
 '21st',
 '3',
 '4',
 '4403ldnw1a7rw18',
 '4txtú120',
 '6days',
 '81010',
 '87077',
 '87121',
 '87575',
 '9',
 '900',
 'a',
 'aft',
 'aid',
 'already',
 'anymore',
 'apply',
 'ard',
 'around',
 'b',
 'blessing',
 'breather',
 'brother',
 'call',
 'caller',
 'callertune',
 'camera',
 'cash',
 'chance',
 'claim',
 'click',
 'co',
 'code',
 'colour',
 'comin',
 'comp',
 'copy',
 'cost',
 'credit',
 'cried',
 'csh11',
 'cup',
 'customer',
 'da',
 'date',
 'dont',
 'eg',
 'eh',
 'england',
 'enough',
 'entitled',
 'entry',
 'even',
 'fa',
 'feel',
 'final',
 'fine',
 'finish',
 'first',
 'free',
 'friend',
 'from',
 'fulfil',
 'go',
 'goalsteam',
 'going',
 'gonna',
 'gota',
 'granted',
 'ha',
 'had',
 'have',
 'he',
 'help',
 'hl',
 'home',
 'hour',
 'httpwap',
 'i',
 'im',
 'info',
 'is',
 'ive',
 'jackpot',
 'joking',
 'k',
 'kim',
 'kl341',
 'lar',
 '

In [52]:
# count vectorizers are sparse matrices - most of the values in the matrix are 0

# to print the actual matrix, we need to make it a dataframe first (need to convert to arrays first to do this)

xcounts_df = pd.DataFrame(xcounts_sample.toarray())

In [54]:
xcounts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,209,210,211,212,213,214,215,216,217,218
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Python interprets each number at the top as a specific word 

# we can pull in the feature names to understand more clearly what is going on

xcounts_df.columns = count_vec_sample.get_feature_names()

xcounts_df

Unnamed: 0,08002986030,08452810075over18s,09061701461,1,100,100000,11,12,150pday,16,...,wkly,wonderful,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,yes,you,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


# N-gram Vectorizer

N-grams creates a document term matrix where counts still occupy the cell but instead of the columns representing single terms, they represent all combinations of adjacent words of length n in your text

Example: "NLP is an interesting topic"

-- Bigrams would return a list with: "NLP is", "is an", "an interesting", "interesting topic"

-- Trigrams would return a list with: "NLP is an", "is an interesting", "an interesting topic"

-- Four-grams would return a list with: "NLP is an interesting", "is an interesting topic"

Note that n can be any size (doesn't stop at 4)

-- However, a certain n value usually yields optimal performance

-- We can tune the value to see which works best

This approach is how Google tries to finish the sentence you are typing into the search bar

Note that the n-gram tokenizer takes in a string, so we have to start by adjusting the cleaning function to return a string instead of a list

In [84]:
def clean_text(t):
    
    text = ''.join(x for x in t if x not in string.punctuation)
    
    tokens = re.split('\W+', text)
    
    text = " ".join([wn.lemmatize(word.lower()) for word in tokens if word not in stops])
    
    return text

In [85]:
data['clean_text'] = data['text'].apply(lambda x: clean_text(x))

In [86]:
data.head(10)

Unnamed: 0,label,text,clean_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,ive searching right word thank breather i promise wont take help granted fulfil promise you wond...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
2,ham,"Nah I don't think he goes to usf, he lives around here though",nah i dont think go usf life around though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak they treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,a per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy ...
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,winner a valued network customer selected receivea 900 prize reward to claim call 09061701461 cl...
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,had mobile 11 month u r entitled update latest colour mobile camera free call the mobile update ...
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",im gonna home soon dont want talk stuff anymore tonight k ive cried enough today
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",six chance win cash from 100 20000 pound txt csh11 send 87575 cost 150pday 6days 16 tsandcs appl...


In [87]:
# lets set up the count vectorizer again - we don't need to put in an analyzer because text is already cleaned

# instead we can make the ngram_range argument = (1,2) to search for unigrams and bigrams

# if we make it (2,2) it will search for only bigrams

# if we make it (1,3) it searches for unigrams, bigrams, and trigrams, while (3,3) only searches for trigrams

from sklearn.feature_extraction.text import CountVectorizer

ngram_vec = CountVectorizer(ngram_range=(2,2))

In [88]:
X_counts = ngram_vec.fit_transform(data['clean_text'])

In [89]:
# look at vector shape - there are 34530 features

X_counts.shape

(5568, 34530)

In [90]:
# what are the features?

ngram_vec.get_feature_names()

['008704050406 sp',
 '0089my last',
 '0121 2025050',
 '01223585236 xx',
 '01223585334 cum',
 '0125698789 ring',
 '02 user',
 '020603 2nd',
 '020603 this',
 '0207 153',
 '02072069400 bx',
 '02073162414 cost',
 '02085076972 reply',
 '020903 this',
 '021 3680',
 '021 3680offer',
 '050703 tcsbcm4235wc1n3xx',
 '06 good',
 '07046744435 arrange',
 '07090298926 reschedule',
 '07099833605 reschedule',
 '07123456789 87077',
 '0721072 find',
 '07732584351 rodger',
 '07734396839 ibh',
 '07742676969 show',
 '07753741225 show',
 '0776xxxxxxx uve',
 '077xxx won',
 '07801543489 guaranteed',
 '07808 xxxxxx',
 '07808247860 show',
 '07808726822 awarded',
 '07815296484 show',
 '0784987 show',
 '0789xxxxxxx today',
 '0796xxxxxx today',
 '07973788240 show',
 '07xxxxxxxxx 2000',
 '07xxxxxxxxx show',
 '0800 0721072',
 '0800 169',
 '0800 18',
 '0800 195',
 '0800 1956669',
 '0800 505060',
 '0800 542',
 '08000407165 18',
 '08000776320 reply',
 '08000839402 2stoptx',
 '08000839402 2stoptxt',
 '08000839402 call',


In [91]:
# once again we will use a smaller sample to illustrate what is going on

data_sample = data[0:20]

In [92]:
# new vectorizer

sample_ngram_vec = CountVectorizer(ngram_range=(2,2))

In [93]:
# fit and transform

X_counts_sample = sample_ngram_vec.fit_transform(data_sample['clean_text'])

In [94]:
# check shape

X_counts_sample.shape

(20, 227)

In [95]:
# get feature names

sample_ngram_vec.get_feature_names()

['09061701461 claim',
 '100 20000',
 '100000 prize',
 '11 month',
 '12 hour',
 '150pday 6days',
 '16 tsandcs',
 '20000 pound',
 '2005 text',
 '21st may',
 '4txtú120 poboxox36504w45wq',
 '6days 16',
 '81010 tc',
 '87077 eg',
 '87077 trywales',
 '87121 receive',
 '87575 cost',
 '900 prize',
 'aft finish',
 'aid patent',
 'anymore tonight',
 'apply 08452810075over18s',
 'apply reply',
 'ard smth',
 'around though',
 'blessing time',
 'breather promise',
 'brother like',
 'call 09061701461',
 'call the',
 'caller press',
 'callertune caller',
 'camera free',
 'cash from',
 'chance win',
 'claim call',
 'claim code',
 'claim no',
 'click httpwap',
 'click wap',
 'co free',
 'code kl341',
 'colour mobile',
 'comp win',
 'copy friend',
 'cost 150pday',
 'credit click',
 'cried enough',
 'csh11 send',
 'cup final',
 'customer selected',
 'da stock',
 'date on',
 'dont miss',
 'dont think',
 'dont want',
 'eg england',
 'eh remember',
 'england 87077',
 'england macedonia',
 'enough today',
 'e

In [96]:
# need to make it into a dataframe to print out

sampledf = pd.DataFrame(X_counts_sample.toarray())

In [97]:
# change column names to feature names

sampledf.columns = sample_ngram_vec.get_feature_names()

In [98]:
# print dataframe

sampledf

Unnamed: 0,09061701461 claim,100 20000,100000 prize,11 month,12 hour,150pday 6days,16 tsandcs,20000 pound,2005 text,21st may,...,wkly comp,wonderful blessing,wont take,word claim,word thank,wwwdbuknet lccltd,xxxmobilemovieclub to,yes he,you week,you wonderful
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,1,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# TFIDF vectorization

TFIDF = term frequency times log(Number of documents / document frequency)

The rarer a word is, the higher its TFIDF score will be

In other words, it pulls out important but seldom used words

In [106]:
# create the clean_text function

def clean_text(t):
    
    text = "".join([word for word in t if word not in string.punctuation])
    
    tokens = re.split('\W+', text)
    
    text = [wn.lemmatize(word) for word in tokens if word not in stops]
    
    return text

In [107]:
# import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
# instantiate vectorizer with clean text as the analyzer

tfidf_vec = TfidfVectorizer(analyzer=clean_text)

In [109]:
# fit and transform on the uncleaned text column and assign to a variable

X_tfidf = tfidf_vec.fit_transform(data['text'])

In [110]:
# lets see the shape - note that this is the same shape as the count vectorizer

X_tfidf.shape

(5568, 11039)

In [111]:
# once again we will make a sample version

data_sample = data[0:20]

In [119]:
# complete necessary steps to rerun the vectorizer (create instance then fit and transform)

tfidf_sample = TfidfVectorizer(analyzer=clean_text)

X_tfidf_sample = tfidf_sample.fit_transform(data_sample['text'])

In [120]:
# see shape

X_tfidf_sample.shape

(20, 230)

In [121]:
# get feature names

tfidf_sample.get_feature_names()

['08002986030',
 '08452810075over18s',
 '09061701461',
 '1',
 '100',
 '100000',
 '11',
 '12',
 '150pday',
 '16',
 '2',
 '20000',
 '2005',
 '21st',
 '3',
 '4',
 '4403LDNW1A7RW18',
 '4txtú120',
 '6days',
 '81010',
 '87077',
 '87121',
 '87575',
 '9',
 '900',
 'A',
 'Aft',
 'Ard',
 'As',
 'CASH',
 'CLAIM',
 'CSH11',
 'Call',
 'Callers',
 'Callertune',
 'Claim',
 'Co',
 'Cost',
 'Cup',
 'DATE',
 'ENGLAND',
 'Eh',
 'England',
 'Even',
 'FA',
 'FREE',
 'Fine',
 'Free',
 'From',
 'HAVE',
 'HL',
 'Had',
 'He',
 'I',
 'Im',
 'Is',
 'Ive',
 'Jackpot',
 'KL341',
 'LCCLTD',
 'Macedonia',
 'May',
 'Melle',
 'Minnaminunginte',
 'Mobile',
 'Nah',
 'No',
 'Nurungu',
 'ON',
 'Oh',
 'Oru',
 'POBOX',
 'POBOXox36504W45WQ',
 'Press',
 'Prize',
 'R',
 'Reply',
 'SCOTLAND',
 'SIX',
 'SUNDAY',
 'So',
 'TC',
 'Text',
 'That',
 'The',
 'Then',
 'They',
 'To',
 'TryWALES',
 'TsandCs',
 'Txt',
 'U',
 'URGENT',
 'Update',
 'Valid',
 'Vettam',
 'WAP',
 'WILL',
 'WINNER',
 'WITH',
 'XXXMobileMovieClub',
 'Yes',
 'You

In [122]:
# make it a DataFrame so we can print it out

tfidf_df = pd.DataFrame(X_tfidf_sample.toarray(), columns = tfidf_sample.get_feature_names())

In [124]:
tfidf_df

Unnamed: 0,08002986030,08452810075over18s,09061701461,1,100,100000,11,12,150pday,16,...,week,wet,win,wkly,wonderful,wont,word,wwwdbuknet,xxxmobilemovieclubcomnQJKGIGHJJGCBL,ü
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.230352,0.230352,0.202483,0.0,0.0,0.0
1,0.0,0.197734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.173811,0.197734,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.226193,0.0,0.0,0.0,0.0,0.226193,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.208105,0.0,0.0,0.0,0.0,0.0,0.208105,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.218493,0.0,0.0,0.0,0.218493,0.192059,...,0.0,0.0,0.192059,0.0,0.0,0.0,0.0,0.0,0.0,0.0
