In [3]:
import numpy as np 
import pandas as pd

In [4]:
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
import sqlite3
import pickle
import nltk
%matplotlib inline

In [5]:
dataset = pd.read_csv("SMSSpamCollection.txt",delimiter = '\t', header = None)
dataset.columns = ['spam/ham', 'text']

In [6]:
stemmer = PorterStemmer()
words = stopwords.words("english")
dataset['cleaned'] = dataset['text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [7]:
dataset.head()

Unnamed: 0,spam/ham,text,cleaned
0,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
1,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i think goe usf live around though


In [8]:
vectorizer = TfidfVectorizer(stop_words="english")
final_features = vectorizer.fit_transform(dataset['cleaned']).toarray()
final_features.shape

(5572, 6186)

In [9]:
from sklearn.linear_model import LogisticRegression
X = dataset['cleaned']
Y = dataset['spam/ham']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

pipeline = Pipeline([('vect', vectorizer),
                     ('clf', LogisticRegression(random_state=0))])

model = pipeline.fit(X_train, y_train)

In [10]:
ytest = np.array(y_test)

# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, model.predict(X_test)))
print(confusion_matrix(ytest, model.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       970
        spam       0.99      0.72      0.83       145

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

[[969   1]
 [ 41 104]]


In [11]:
dataset[dataset['spam/ham'] == "spam"]

Unnamed: 0,spam/ham,text,cleaned
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darl week word back i like fun sti...
8,spam,WINNER!! As a valued network customer you have...,winner as valu network custom select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,had mobil month u r entitl updat latest colour...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",six chanc win cash from pound txt csh send cos...
...,...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...,want explicit sex sec ring cost p min gsex pob...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,ask mobil if chatlin inclu in free min india c...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...,had contract mobil mnth latest motorola nokia ...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...,remind from o to get pound free call credit de...


In [12]:
example_tweet = ["Free entry in 2 a wkly comp to win FA Cup final"]
predictions1 = pipeline.predict(example_tweet)
print(predictions1)

['ham']
