In [1]:
import numpy as np
import pandas as pd


df=pd.read_table('SMSSpamCollection',header=None,encoding='UTF-8')

In [2]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder=LabelEncoder()
output=encoder.fit_transform(classes)

print(output[:10])


[0 0 1 0 0 1 0 0 1 1]


In [9]:
text_messages=df[1]
print(text_messages[0:10])


0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [11]:
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'email')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymbol' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymbol')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumber')
    
# Replace numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')


In [12]:
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [13]:
from nltk.corpus import stopwords

stopwords=set(stopwords.words('english'))
processed=processed.apply(lambda x:" ".join(term for term in x.split() if term not in stopwords))

In [15]:
import nltk
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

In [16]:
from nltk.tokenize import word_tokenize
all_words=[]


for text in processed:
    words=word_tokenize(text)
    for w in words:
        all_words.append(w)
all_words=nltk.FreqDist(all_words)

In [18]:
print(len(all_words))

6573


In [20]:
print('Most Common:{}'.format(all_words.most_common(15)))

Most Common:[('number', 2759), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbolnumb', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [21]:
word_features=list(all_words.keys())

In [23]:
def find_feature(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features


features=find_feature(processed[0])
for key,value in features.items():
    if(value==True):
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [29]:
messages=zip(processed,output)
seed=1

np.random.seed=seed
# np.random.shuffle(messages)

featuresets=[(find_feature(text),label) for (text,label) in messages]

In [32]:
from sklearn import model_selection
training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [33]:
print(len(training))
print(len(testing))

4179
1393


In [34]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model=SklearnClassifier(SVC(kernel='linear'))
model.train(training)
accuracy=nltk.classify.accuracy(model,testing)*100
print('Accuracy of SVC:{}'.format(accuracy))

Accuracy of SVC:98.85139985642498


In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [37]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(),
    MultinomialNB(),
    SVC()
]


models=zip(names,classifiers)


for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print("{} Accuracy: {}".format(name, accuracy))    

K Nearest Neighbors Accuracy: 92.82124910265614
Decision Tree Accuracy: 97.48743718592965




Random Forest Accuracy: 97.5592246949031




Logistic Regression Accuracy: 98.85139985642498
SGD Classifier Accuracy: 98.85139985642498
Naive Bayes Accuracy: 98.63603732950466




SVM Linear Accuracy: 86.71931083991386


In [42]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 86.71931083991386


In [43]:
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)


In [44]:
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.99      0.92      0.96       185

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,1
actual,spam,15,170
