In [65]:
import pandas as pd
import sklearn
import math

In [68]:
first_sentence = "Artificial Intelligence Engineer is the sexiest job in the 21st century"
second_sentence = "Machine Learning is the key for artificial intelligence"

In [69]:
#splitting so that each word have their own string
first_sentence = first_sentence.split(" ")
second_sentence = second_sentence.split(" ")
total = set(first_sentence).union(set(second_sentence))
print(total)

{'Intelligence', 'sexiest', 'Artificial', 'job', 'artificial', 'for', 'Engineer', 'Machine', 'intelligence', '21st', 'Learning', 'the', 'century', 'in', 'key', 'is'}


In [71]:
#counting the words using a dictionary key-value pair for both the sentences
dictA = dict.fromkeys(total, 0)
dictB = dict.fromkeys(total, 0)
for word in first_sentence:
    dictA[word]+=1
for word in second_sentence:
    dictB[word]+=1

In [72]:
pd.DataFrame([dictA, dictB])

Unnamed: 0,21st,Artificial,Engineer,Intelligence,Learning,Machine,artificial,century,for,in,intelligence,is,job,key,sexiest,the
0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,1,2
1,0,0,0,0,1,1,1,0,1,0,1,1,0,1,0,1


### Coding the TFIDF function from Scratch:

In [82]:
"""Writing the TF function"""
def computeTF(dict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in dict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)

tfFirst = computeTF(dictA, first_sentence)
tfSecond = computeTF(dictB, second_sentence)
tf = pd.DataFrame([tfFirst, tfSecond])

In [83]:
pd.DataFrame([tfFirst, tfSecond])

Unnamed: 0,21st,Artificial,Engineer,Intelligence,Learning,Machine,artificial,century,for,in,intelligence,is,job,key,sexiest,the
0,0.090909,0.090909,0.090909,0.090909,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0,0.090909,0.090909,0.0,0.090909,0.181818
1,0.0,0.0,0.0,0.0,0.125,0.125,0.125,0.0,0.125,0.0,0.125,0.125,0.0,0.125,0.0,0.125


In [84]:
#using nltk to download the stopwords
import nltk
from nltk.corpus import stopwords
sw = stopwords.words("english")

In [86]:
len(sw)

179

In [89]:
set(sw)

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [91]:
filtered_sentence = []
for word in dictA:
    if str(word) not in set(sw):
        filtered_sentence.append(word)

In [92]:
filtered_sentence

['Intelligence',
 'sexiest',
 'Artificial',
 'job',
 'artificial',
 'Engineer',
 'Machine',
 'intelligence',
 '21st',
 'Learning',
 'century',
 'key']

In [95]:
"""Implementing the IDF formula"""
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val)+1))
        
    return(idfDict)

idfs = computeIDF([dictA, dictB])

In [97]:
"""Calculating the TfIdf"""
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)

idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)
idf = pd.DataFrame([idfFirst, idfSecond])

In [99]:
idf

Unnamed: 0,21st,Artificial,Engineer,Intelligence,Learning,Machine,artificial,century,for,in,intelligence,is,job,key,sexiest,the
0,0.027366,0.027366,0.027366,0.027366,0.0,0.0,0.0,0.027366,0.0,0.027366,0.0,0.027366,0.027366,0.0,0.027366,0.054733
1,0.0,0.0,0.0,0.0,0.037629,0.037629,0.037629,0.0,0.037629,0.0,0.037629,0.037629,0.0,0.037629,0.0,0.037629


### Using the sklearn library for the calculation:

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer
firstV = "Artificial Intelligence Engineer is the sexiest job in the 21st century"
secondV = "Machine Learning is the key for artificial intelligence"
vectorize = TfidfVectorizer()
response = vectorize.fit_transform([firstV, secondV])

In [105]:
print(response)

  (0, 1)	0.23031454453380953
  (0, 6)	0.23031454453380953
  (0, 3)	0.3236990562320933
  (0, 7)	0.23031454453380953
  (0, 13)	0.46062908906761907
  (0, 12)	0.3236990562320933
  (0, 8)	0.3236990562320933
  (0, 5)	0.3236990562320933
  (0, 0)	0.3236990562320933
  (0, 2)	0.3236990562320933
  (1, 1)	0.28986933576883284
  (1, 6)	0.28986933576883284
  (1, 7)	0.28986933576883284
  (1, 13)	0.28986933576883284
  (1, 11)	0.40740123733358447
  (1, 10)	0.40740123733358447
  (1, 9)	0.40740123733358447
  (1, 4)	0.40740123733358447


In [109]:
from sklearn.feature_extraction.text import CountVectorizer
firstV1 = "Artificial Intelligence Engineer is the sexiest job in the 21st century"
secondV2 = "Machine Learning is the key for artificial intelligence"
vectorize1 = TfidfVectorizer()
response1 = vectorize1.fit_transform([firstV1, secondV2])

In [110]:
print(response1)

  (0, 1)	0.23031454453380953
  (0, 6)	0.23031454453380953
  (0, 3)	0.3236990562320933
  (0, 7)	0.23031454453380953
  (0, 13)	0.46062908906761907
  (0, 12)	0.3236990562320933
  (0, 8)	0.3236990562320933
  (0, 5)	0.3236990562320933
  (0, 0)	0.3236990562320933
  (0, 2)	0.3236990562320933
  (1, 1)	0.28986933576883284
  (1, 6)	0.28986933576883284
  (1, 7)	0.28986933576883284
  (1, 13)	0.28986933576883284
  (1, 11)	0.40740123733358447
  (1, 10)	0.40740123733358447
  (1, 9)	0.40740123733358447
  (1, 4)	0.40740123733358447
