In [26]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# reading the file
data =  pd.read_csv("spam classifier.csv", encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data = data[['v1','v2']]
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.iloc[1,0]

'ham'

In [6]:
# data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [7]:
# describing the data 
data.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
data.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [9]:
stopwords_set = set(stopwords.words('english'))

In [17]:
spam =[]

for index, row in data.iterrows():
    words = [s.lower() for s in row.v2.split() 
             if len(s)>3 and s.lower() not in stopwords_set]
#     words_cleaned = [word for word in words]
#     words_wo_sw = [word for word in words 
#                   if word not in stopwords_set]
    words_joined = ' '.join(words)
    spam.append((words_joined, row.v1))

In [18]:
spam

[('jurong point, crazy.. available bugis great world buffet... cine amore wat...',
  'ham'),
 ('lar... joking oni...', 'ham'),
 ("free entry wkly comp final tkts 21st 2005. text 87121 receive entry question(std rate)t&c's apply 08452810075over18's",
  'spam'),
 ('early hor... already say...', 'ham'),
 ('think goes usf, lives around though', 'ham'),
 ("freemsg darling week's word back! like still? chgs send, å£1.50", 'spam'),
 ('even brother like speak treat like aids patent.', 'ham'),
 ("request 'melle melle (oru minnaminunginte nurungu vettam)' callertune callers. press copy friends callertune",
  'ham'),
 ('winner!! valued network customer selected receivea å£900 prize reward! claim call 09061701461. claim code kl341. valid hours only.',
  'spam'),
 ('mobile months more? entitled update latest colour mobiles camera free! call mobile update free 08002986030',
  'spam'),
 ("gonna home soon want talk stuff anymore tonight, i've cried enough today.",
  'ham'),
 ('chances cash! 20,000 pou

In [20]:
spam_dataframe = pd.DataFrame(spam, columns=['text','target'])
spam_dataframe.head()

Unnamed: 0,text,target
0,"jurong point, crazy.. available bugis great wo...",ham
1,lar... joking oni...,ham
2,free entry wkly comp final tkts 21st 2005. tex...,spam
3,early hor... already say...,ham
4,"think goes usf, lives around though",ham


In [21]:
cv = CountVectorizer()
X = cv.fit_transform(spam_dataframe.text)
X = X.toarray()
X.shape

(5572, 8105)

In [22]:
spamle = LabelEncoder()
Y = spamle.fit_transform(spam_dataframe.target)
Y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [23]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(3900, 8105)
(3900,)
(1672, 8105)
(1672,)


In [27]:
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
y_train_pred = clf.predict(x_train)
print("Train accuracy = ", accuracy_score(y_train, y_train_pred))

Train accuracy =  0.9958974358974358


In [29]:
y_test_pred = clf.predict(x_test)
print("Test accuracy = ", accuracy_score(y_test, y_test_pred))

Test accuracy =  0.9784688995215312


In [49]:
cv = TfidfVectorizer()
X = cv.fit_transform(spam_dataframe.text)
X = X.toarray()

In [50]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [51]:
spamle = LabelEncoder()
Y = spamle.fit_transform(spam_dataframe.target)
Y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [52]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(4457, 8105)
(4457,)
(1115, 8105)
(1115,)


In [53]:
y_train_pred = clf.predict(x_train)
print("Train accuracy = ", accuracy_score(y_train, y_train_pred))

y_test_pred = clf.predict(x_test)
print("Test accuracy = ", accuracy_score(y_test, y_test_pred))

Train accuracy =  0.8669508638097375
Test accuracy =  0.8618834080717489
