In [2]:
import pandas as pd
import nltk
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords

\begin{equation*}
\begin{aligned}
P(垃圾短信 \mid 具体关键词) &= \frac{P(具体关键词 \mid 垃圾短信)\times P(垃圾短信)}{P(具体关键词)}\\
&=\frac{P(具体关键词 \mid 垃圾短信)\times P(垃圾短信)}{P(具体关键词 \mid 垃圾短信)\times P(垃圾短信)+P(具体关键词 \mid 正常短信)\times P(正常短信)}
\end{aligned}
\end{equation*}

根据关键词在垃圾短信和正常短信中的相对词频高低，来判断是否为垃圾短信

1. 读取垃圾短信信息与正常短信信息
2. 建立垃圾短信与正常短信语料库
    * 去掉标点符号
    * 分词(tokenize)
    * 去掉stopword
    * 计算词频
3. 根据贝叶斯公式计算
    * $P(垃圾短信 \mid 具体关键词)$

In [6]:
# Read the labelled SMS dataset; the encoding is passed explicitly as latin-1.
text = pd.read_csv('./spam.csv', encoding='latin-1')

In [7]:
text.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
# Split the message column (v2) by the label column (v1) using boolean masks.
is_spam = text['v1'] == 'spam'
is_ham = text['v1'] == 'ham'
spam_text = text.loc[is_spam, 'v2']  # messages labelled spam
ham_text = text.loc[is_ham, 'v2']    # messages labelled ham

In [17]:
spam_text.head()

2     Free entry in 2 a wkly comp to win FA Cup fina...
5     FreeMsg Hey there darling it's been 3 week's n...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
11    SIX chances to win CASH! From 100 to 20,000 po...
Name: v2, dtype: object

## 2. 建立垃圾短信与正常短信语料库
* 去掉标点符号
* 分词(tokenize)
* 去掉stopword
* 计算词频

In [20]:
import string

In [21]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
# Translation table mapping every ASCII punctuation character to None,
# so that str.translate deletes it.
trantable = str.maketrans(dict.fromkeys(string.punctuation))


def data_clean(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation; other characters are unchanged.
    """
    return text.translate(trantable)

In [23]:
def remove_stopword(text, stop_words=None):
    """Lower-case the tokens in ``text`` and drop those that are stopwords.

    The stopword collection is resolved once per call and turned into a set,
    instead of re-reading the NLTK list and scanning it linearly for every
    token.

    Args:
        text: Iterable of string tokens.
        stop_words: Optional iterable of lower-case stopwords. When omitted,
            ``stopwords.words('english')`` from NLTK is used.

    Returns:
        A list of the lower-cased tokens that are not stopwords, in their
        original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = frozenset(stop_words)

    lowered = (word.lower() for word in text)
    return [word for word in lowered if word not in stop_set]

In [24]:
# Build the spam corpus: one flat list holding the lower-cased,
# punctuation-free, stopword-free tokens of every spam message.
spam_corp = []

for message in spam_text:
    tokens = nltk.word_tokenize(data_clean(str(message)))
    spam_corp.extend(remove_stopword(tokens))

In [27]:
spam_corp[:10]

['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts']

In [26]:
# Wrap the flat spam token list in an nltk.Text object.
spam_corp_tk = nltk.Text(spam_corp)

# Frequency distribution over the spam tokens (token -> occurrence count).
spam_dic = FreqDist(spam_corp_tk)

In [29]:
spam_dic.most_common(10)

[('call', 347),
 ('free', 216),
 ('2', 173),
 ('txt', 150),
 ('u', 147),
 ('ur', 144),
 ('mobile', 123),
 ('text', 120),
 ('4', 119),
 ('claim', 113)]

In [42]:
# Tabulate the spam word counts, most frequent first.
# The chain assigns the re-indexed frame back to spam_pd; previously the
# result of reset_index() was only displayed, so spam_pd itself kept the
# pre-sort index.
spam_pd = (
    pd.DataFrame(list(spam_dic.items()), columns=['word', 'freq'])
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)  # discard the pre-sort index and renumber from 0
)
spam_pd

Unnamed: 0,word,freq
0,call,347
1,free,216
2,2,173
3,txt,150
4,u,147
5,ur,144
6,mobile,123
7,text,120
8,4,119
9,claim,113


In [44]:
# Build the ham corpus the same way as the spam corpus: clean, tokenize and
# strip stopwords from each ham message, collecting all tokens in one list.
ham_corp = []
for message in ham_text:
    tokens = nltk.word_tokenize(data_clean(str(message)))
    ham_corp.extend(remove_stopword(tokens))

# Frequency distribution over the ham tokens (token -> occurrence count).
ham_corp_tk = nltk.Text(ham_corp)
ham_dic = FreqDist(ham_corp_tk)

In [45]:
# Tabulate the ham word counts, most frequent first.
# The chain assigns the re-indexed frame back to ham_pd; previously the
# result of reset_index() was only displayed, so ham_pd itself kept the
# pre-sort index.
ham_pd = (
    pd.DataFrame(list(ham_dic.items()), columns=['word', 'freq'])
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)  # discard the pre-sort index and renumber from 0
)
ham_pd

Unnamed: 0,word,freq
0,u,972
1,im,449
2,2,305
3,get,303
4,ltgt,276
5,ok,272
6,dont,257
7,go,247
8,got,243
9,ur,240


\begin{equation*}
\begin{aligned}
P(垃圾短信 \mid 具体关键词) &= \frac{P(具体关键词 \mid 垃圾短信)\times P(垃圾短信)}{P(具体关键词)}\\
&=\frac{P(具体关键词 \mid 垃圾短信)\times P(垃圾短信)}{P(具体关键词 \mid 垃圾短信)\times P(垃圾短信)+P(具体关键词 \mid 正常短信)\times P(正常短信)}
\end{aligned}
\end{equation*}

* $P(具体关键词 \mid 垃圾短信)$ = $\frac{垃圾短信中出现具体关键词的词频}{垃圾短信中所有词的词频}$ = $\frac{spam\_dic['具体关键词']}{spam\_tk\_freq\_sum}$
* $P(具体关键词 \mid 正常短信)$ = $\frac{正常短信中出现具体关键词的词频}{正常短信中所有词的词频}$ = $\frac{ham\_dic['具体关键词']}{ham\_tk\_freq\_sum}$


In [46]:
# Total number of token occurrences in each corpus; these are the
# denominators of the per-class word likelihoods.
spam_tk_freq_sum = spam_pd.loc[:, 'freq'].sum()
ham_tk_freq_sum = ham_pd.loc[:, 'freq'].sum()

* $P(垃圾短信)$ = $\frac{垃圾短信的数量}{垃圾短信的数量+正常短信的数量}$
* $P(正常短信)$ = $\frac{正常短信的数量}{垃圾短信的数量+正常短信的数量}$

In [48]:
# Class priors: the share of messages carrying each label.
n_spam, n_ham = len(spam_text), len(ham_text)
n_messages = n_spam + n_ham
P_Spam = n_spam / n_messages
P_Ham = n_ham / n_messages

In [49]:
P_Spam

0.13406317300789664

In [50]:
def bayesian_spam_keyword(keyword):
    """Return P(spam | keyword) computed with Bayes' rule.

    The per-class likelihoods are the keyword's relative frequency in the
    spam and ham corpora (``spam_dic`` / ``ham_dic`` divided by the corpus
    token totals), weighted by the class priors ``P_Spam`` and ``P_Ham``.

    Args:
        keyword: The word to score. It is lower-cased before lookup because
            the corpora only contain lower-cased tokens.

    Returns:
        The posterior probability that a message is spam given the keyword.
        If the keyword occurs in neither corpus there is no evidence, so the
        prior ``P_Spam`` is returned instead of evaluating 0/0.
    """
    keyword = keyword.lower()

    # Joint terms P(keyword | class) * P(class) for each class.
    spam_evidence = spam_dic[keyword] / spam_tk_freq_sum * P_Spam
    ham_evidence = ham_dic[keyword] / ham_tk_freq_sum * P_Ham
    total_evidence = spam_evidence + ham_evidence

    # Unseen keyword: both counts are zero, so fall back to the prior.
    if total_evidence == 0:
        return P_Spam

    P_spam_keyword = spam_evidence / total_evidence
    return P_spam_keyword

In [59]:
bayesian_spam_keyword('free')

0.6438375875113181