In [None]:
# # TF_IDF
# # TF - Term Frequency 
# # IDF - Inverse Document Frequency

# #TF - No.of repeative words in sentence / No.of.words in sentence

# #IDF - loge(No.of.sentences/No.of.sentences containing the word)

# For eg : 
#  after cleaning the text.
#  s1 - good boy
#  s2 - good girl 
#  s3 - boy girl good

#  vocubulary - good boy girl 

# Term_frequency: (For each sentence)

#  words      s1     s2      s3   
#  1. good    1/2    1/2     1/3
#  2. boy     1/2     0      1/3
#  3.girl      0     1/2     1/3

#  IDF : (Depends of whole sentences)
# words      IDF     
#  1. good   loge(3/3)=0
#  2. boy    loge(3/2)
#  3.girl    loge(3/2) 


#  Final Tf_IDF ==> TF*IDF 
#        good      boy          girl 
#  s1    0     1/2*loge(3/2)     0

#  s2    0     0               1/2*loge(3/2)

#  s3    0     1/3*loge(3/2)    1/3*loge(3/2)



# Advantages : 
# * intuitive 
# * Fixed size -> vocabulary
# * Word importance is getting captured. 
# why ? 
#    If the word present in all the sentences is giving less important, for eg in table : good is having TF-IDF is zero 

# Disadvantages : 
# * Sparsity still exits 
# * OOV - out of vocubulary 

In [1]:
import pandas as pd
import numpy as np

In [15]:
ds = pd.read_csv('spam.csv',encoding ='latin1')
ds

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [16]:
ds.drop(columns=["Unnamed: 2","Unnamed: 3","Unnamed: 4" ],inplace=True)


In [17]:
ds 

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [18]:
import re
import nltk

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

lemmatiser = WordNetLemmatizer()

In [19]:
stop_words = stopwords.words('english')
stopwords.words()

['tyre',
 'rreth',
 'le',
 'atyre',
 'këta',
 'megjithëse',
 'kemi',
 'per',
 'ndonëse',
 'dytë',
 'pse',
 'tha',
 'aty',
 'ndaj',
 'ke',
 'këtë',
 'duhet',
 'pa',
 'perket',
 'veç',
 'ndonje',
 'një',
 'keshtu',
 's',
 'janë',
 'jane',
 'ti',
 'ia',
 'megjithese',
 'prej',
 'ishte',
 'tjerë',
 'ai',
 'se',
 'tillë',
 'do',
 'si',
 'ja',
 'tonë',
 'keta',
 'pastaj',
 'ndersa',
 'siç',
 'unë',
 'gjate',
 'di',
 'kësaj',
 'cilin',
 'kjo',
 'dhënë',
 'da',
 'teper',
 'ketij',
 'ama',
 'pasi',
 'fjalë',
 'kanë',
 'vetem',
 'za',
 'd.m.th.',
 'ose',
 'pas',
 'ndonjë',
 'cila',
 'ndodhur',
 'dyte',
 'ardhur',
 'kësi',
 'nga',
 'vete',
 'atij',
 'ta',
 'jenë',
 'rendit',
 'tane',
 'keso',
 'deri',
 'tone',
 'të',
 'prandaj',
 'bëjë',
 'domethënë',
 'dhe',
 'qi',
 'mirepo',
 'tona',
 'që',
 'u',
 'këtu',
 'cilet',
 'jene',
 'tjere',
 'gjë',
 'së',
 'gjatë',
 'duhej',
 't',
 'dhene',
 'thuhet',
 'po',
 'une',
 'dy',
 'cfare',
 'ndërsa',
 'sepse',
 'edhe',
 'cilen',
 'to',
 'meqenese',
 'meje',


In [20]:
ds_message = ds["v2"]
ds_label = ds["v1"]
print (ds_message)
print (ds_label)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object
0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object


In [21]:
for i in range(len(ds_message)):
    
    sentence = ds_message[i]
    s = re.sub("[^A-Za-z0-9]"," ",sentence)
    ss = s.lower().split()
    word = [lemmatiser.lemmatize(word,pos='v') for word in ss if word not in stopwords.words() ]
    w1 = " ".join(word)
    ds_message[i] = w1


ds_message
   

0       jurong point crazy available bugis great world...
1                                            lar joke wif
2       free entry 2 wkly comp win cup final tkts 21st...
3                                               dun early
4                                                usf live
                              ...                        
5567    2nd time 2 contact 750 pound prize 2 claim eas...
5568                                  b esplanade fr home
5569                                pity mood suggestions
5570            guy bitch act interest buy week give free
5571                                                 rofl
Name: v2, Length: 5572, dtype: object

In [22]:
New_dataframe = pd.concat([ds_message,ds_label],axis=True)
New_dataframe.rename(columns={'v1': 'labels', 'v2': 'messages'},inplace=True)
New_dataframe

Unnamed: 0,messages,labels
0,jurong point crazy available bugis great world...,ham
1,lar joke wif,ham
2,free entry 2 wkly comp win cup final tkts 21st...,spam
3,dun early,ham
4,usf live,ham
...,...,...
5567,2nd time 2 contact 750 pound prize 2 claim eas...,spam
5568,b esplanade fr home,ham
5569,pity mood suggestions,ham
5570,guy bitch act interest buy week give free,ham


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

Tf_idf = TfidfVectorizer(max_features=100)
cv = Tf_idf.fit_transform(New_dataframe["messages"]).toarray()
# cv[30]


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.75931748,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [33]:
# for better visualisatation

import numpy as np 
np.set_printoptions(edgeitems=30, linewidth=100000,formatter=dict(float=lambda x: "%.3g" %x))

In [34]:
cv

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.453, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.483, 0, 0, 0, 0, 0, 0.489, 0, 0, 0, 0, 0, 0, 0, 0.568, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 

In [None]:
Tf_idf = TfidfVectorizer(max_features=100,ngram_range=(2,2))
cv = Tf_idf.fit_transform(New_dataframe["messages"]).toarray()


In [37]:
Tf_idf.vocabulary_

{'free entry': np.int64(37),
 'claim call': np.int64(26),
 'claim code': np.int64(27),
 'free call': np.int64(36),
 'call mobile': np.int64(19),
 'chance win': np.int64(24),
 'txt word': np.int64(87),
 'call 08000930705': np.int64(16),
 'lt gt': np.int64(57),
 'miss call': np.int64(59),
 'sms ac': np.int64(81),
 'sorry call': np.int64(82),
 'ur award': np.int64(88),
 'call free': np.int64(18),
 'call customer': np.int64(17),
 'customer service': np.int64(30),
 'guarantee 1000': np.int64(43),
 '1000 cash': np.int64(0),
 'draw show': np.int64(34),
 'prize guarantee': np.int64(70),
 'guarantee call': np.int64(44),
 'valid 12hrs': np.int64(97),
 'select receive': np.int64(77),
 'account statement': np.int64(10),
 'identifier code': np.int64(52),
 'urgent mobile': np.int64(96),
 'bonus caller': np.int64(14),
 'caller prize': np.int64(21),
 'receive 350': np.int64(72),
 '350 award': np.int64(7),
 'match call': np.int64(58),
 'give call': np.int64(40),
 'ur mob': np.int64(92),
 'gud ni8': np.