Natural Language Processing (NLP) is a sub-field of artificial intelligence that deals with understanding and processing human language. In light of new advancements in machine learning, many organizations have begun applying natural language processing for translation, chatbots and candidate filtering.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: three short sentences that serve as the documents in the
# cells below.
documentA='the man went out for a walk'
documentB='the man is watching a movie'
documentC='the children sat around the fire'

Machine learning algorithms cannot work with raw text directly. Rather, the text must be converted into vectors of numbers. In natural language processing, a common technique for extracting features from text is to place all of the words that occur in the text in a bucket. This approach is called a bag of words model or BoW for short. It’s referred to as a “bag” of words because any information about the structure of the sentence is lost.

In [3]:
# Tokenize each document by splitting on single spaces; no lowercasing or
# punctuation handling is applied here.
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
bagOfWordsC = documentC.split(' ')

In [4]:
print(bagOfWordsA)

['the', 'man', 'went', 'out', 'for', 'a', 'walk']


In [5]:
# Vocabulary: every distinct word that occurs in any of the three documents.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB) | set(bagOfWordsC)

In [7]:
print(uniqueWords)

{'watching', 'the', 'children', 'around', 'went', 'movie', 'is', 'a', 'walk', 'out', 'man', 'fire', 'sat', 'for'}


In [9]:
# One count table per document, each starting at zero for every word in the
# shared vocabulary.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)
numOfWordsC = dict.fromkeys(uniqueWords, 0)

# Tally each document's words into its own table.
for counts, bag in (
    (numOfWordsA, bagOfWordsA),
    (numOfWordsB, bagOfWordsB),
    (numOfWordsC, bagOfWordsC),
):
    for word in bag:
        counts[word] += 1

Another problem with the bag of words approach is that it doesn’t account for noise. In other words, certain words are used to formulate sentences but do not add any semantic meaning to the text. For example, the most commonly used word in the English language is “the”, which represents 7% of all words written or spoken. You couldn’t deduce anything about a text given the fact that it contains the word “the”. On the other hand, words like “good” and “awesome” could be used to determine whether a rating was positive or not.

In natural language processing, useless words are referred to as stop words. The Python Natural Language Toolkit (NLTK) library provides a list of English stop words.

In [10]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Term Frequency (TF)
The number of times a word appears in a document divided by the total number of words in the document. Every document has its own term frequency.


In [12]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            denominator of each frequency.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty document every frequency is ``0.0``.
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return {word: 0.0 for word in wordDict}
    return {
        word: count / float(totalWords)
        for word, count in wordDict.items()
    }

In [13]:
# Term frequencies for each document, computed from its count table and
# its token list.
tfA = computeTF(numOfWordsA,bagOfWordsA)
tfB = computeTF(numOfWordsB,bagOfWordsB)
tfC = computeTF(numOfWordsC,bagOfWordsC)

In [14]:
tfA

{'watching': 0.0,
 'the': 0.14285714285714285,
 'children': 0.0,
 'around': 0.0,
 'went': 0.14285714285714285,
 'movie': 0.0,
 'is': 0.0,
 'a': 0.14285714285714285,
 'walk': 0.14285714285714285,
 'out': 0.14285714285714285,
 'man': 0.14285714285714285,
 'fire': 0.0,
 'sat': 0.0,
 'for': 0.14285714285714285}

# Inverse Document Frequency (IDF)
The log of the number of documents divided by the number of documents that contain the word w, i.e. $\mathrm{idf}(w) = \log\left(N / \mathrm{df}(w)\right)$. Inverse document frequency determines the weight of rare words across all documents in the corpus.

In [19]:
def computeIdf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: Sequence of dicts, one per document, mapping
            word -> occurrence count. A word counts as present in a
            document when its count is greater than zero.

    Returns:
        dict mapping each word to ``log(N / docCount)`` (natural log), where
        ``N`` is the number of documents and ``docCount`` the number of
        documents containing the word. Words that occur in no document get
        ``0.0``. An empty ``documents`` yields an empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are gathered from every document
    # (not only the first) so differing key sets cannot raise KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, docCount in docFreq.items():
        # docCount == 0 means the word never occurs; log(N / 0) is
        # undefined, so give it zero weight instead of raising.
        idfDict[word] = math.log(N / float(docCount)) if docCount else 0.0
    return idfDict

In [20]:
# IDF weights computed over the three per-document count tables.
idf = computeIdf([numOfWordsA,numOfWordsB,numOfWordsC])

In [21]:
print(idf)

{'watching': 1.0986122886681098, 'the': 0.0, 'children': 1.0986122886681098, 'around': 1.0986122886681098, 'went': 1.0986122886681098, 'movie': 1.0986122886681098, 'is': 1.0986122886681098, 'a': 0.4054651081081644, 'walk': 1.0986122886681098, 'out': 1.0986122886681098, 'man': 0.4054651081081644, 'fire': 1.0986122886681098, 'sat': 1.0986122886681098, 'for': 1.0986122886681098}


In [24]:
def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency. Must contain
            every word present in ``tfBagOfWords``.

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idfs[word]``.

    Raises:
        KeyError: If a word in ``tfBagOfWords`` is missing from ``idfs``.
    """
    # Look weights up in the ``idfs`` argument so the result depends only on
    # what the caller passed in, not on any module-level variable.
    return {word: tf * idfs[word] for word, tf in tfBagOfWords.items()}

In [27]:
# Score each document, then stack the three score dicts into a DataFrame
# (one row per document, one column per word).
tfidfA = computeTFIDF(tfA,idf)
tfidfB = computeTFIDF(tfB,idf)
tfidfC = computeTFIDF(tfC,idf)
df = pd.DataFrame([tfidfA,tfidfB,tfidfC])

In [28]:
print(df)

   watching  the  children    around      went     movie        is         a  \
0  0.000000  0.0  0.000000  0.000000  0.156945  0.000000  0.000000  0.057924   
1  0.183102  0.0  0.000000  0.000000  0.000000  0.183102  0.183102  0.067578   
2  0.000000  0.0  0.183102  0.183102  0.000000  0.000000  0.000000  0.000000   

       walk       out       man      fire       sat       for  
0  0.156945  0.156945  0.057924  0.000000  0.000000  0.156945  
1  0.000000  0.000000  0.067578  0.000000  0.000000  0.000000  
2  0.000000  0.000000  0.000000  0.183102  0.183102  0.000000  


In [29]:
# Same idea using scikit-learn: fit on the raw document strings and obtain
# one TF-IDF row per document.
# NOTE(review): the values printed below differ from the hand-rolled table
# above (e.g. 'a' is absent from the vocabulary and 'the' has a non-zero
# weight) — presumably due to TfidfVectorizer's default tokenization, IDF
# smoothing and normalization; confirm against the scikit-learn docs.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA,documentB,documentC])

In [30]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the returned NumPy array into a plain list of str, so
    # the printed form matches the old API.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['around', 'children', 'fire', 'for', 'is', 'man', 'movie', 'out', 'sat', 'the', 'walk', 'watching', 'went']


In [31]:
# Densify the sparse result for display; the second print shows the sparse
# form as (row, column) -> value entries.
# NOTE(review): todense() returns a numpy.matrix; toarray() would give a
# regular ndarray — consider switching.
dense = vectors.todense()
print(dense)
print(vectors)

[[0.         0.         0.         0.45050407 0.         0.34261996
  0.         0.45050407 0.         0.26607496 0.45050407 0.
  0.45050407]
 [0.         0.         0.         0.         0.50461134 0.38376993
  0.50461134 0.         0.         0.29803159 0.         0.50461134
  0.        ]
 [0.4305185  0.4305185  0.4305185  0.         0.         0.
  0.         0.         0.4305185  0.50854232 0.         0.
  0.        ]]
  (0, 10)	0.450504072643198
  (0, 3)	0.450504072643198
  (0, 7)	0.450504072643198
  (0, 12)	0.450504072643198
  (0, 5)	0.3426199591918006
  (0, 9)	0.2660749625405929
  (1, 6)	0.5046113401371842
  (1, 11)	0.5046113401371842
  (1, 4)	0.5046113401371842
  (1, 5)	0.3837699307603192
  (1, 9)	0.2980315863446099
  (2, 2)	0.4305184979719882
  (2, 0)	0.4305184979719882
  (2, 8)	0.4305184979719882
  (2, 1)	0.4305184979719882
  (2, 9)	0.5085423203783267


In [32]:
# Convert to nested Python lists: one inner list of scores per document.
denselist = dense.tolist()

In [33]:
print(denselist)

[[0.0, 0.0, 0.0, 0.450504072643198, 0.0, 0.3426199591918006, 0.0, 0.450504072643198, 0.0, 0.2660749625405929, 0.450504072643198, 0.0, 0.450504072643198], [0.0, 0.0, 0.0, 0.0, 0.5046113401371842, 0.3837699307603192, 0.5046113401371842, 0.0, 0.0, 0.2980315863446099, 0.0, 0.5046113401371842, 0.0], [0.4305184979719882, 0.4305184979719882, 0.4305184979719882, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4305184979719882, 0.5085423203783267, 0.0, 0.0, 0.0]]


In [34]:
# Tabulate the scikit-learn scores, labelling each column with its feature
# (word) name.
df_tfidf = pd.DataFrame(denselist,columns=feature_names)
print(df_tfidf)

     around  children      fire       for        is      man     movie  \
0  0.000000  0.000000  0.000000  0.450504  0.000000  0.34262  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.504611  0.38377  0.504611   
2  0.430518  0.430518  0.430518  0.000000  0.000000  0.00000  0.000000   

        out       sat       the      walk  watching      went  
0  0.450504  0.000000  0.266075  0.450504  0.000000  0.450504  
1  0.000000  0.000000  0.298032  0.000000  0.504611  0.000000  
2  0.000000  0.430518  0.508542  0.000000  0.000000  0.000000  
