In [100]:
import pandas as pd
import numpy as np

# Loading the Dataset

In [101]:
dt = pd.read_csv('spam.csv')
dt.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [102]:
dt['spam'] = dt['type'].map({'spam': 1, 'ham': 0})
dt.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Tokenization

In [103]:
dt['text'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [104]:
def tokenizer(text):
    return text.split()

dt['text'] = dt['text'].apply(tokenizer)
dt['text'][0]

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet...',
 'Cine',
 'there',
 'got',
 'amore',
 'wat...']

# Stemming

In [105]:
from nltk.stem.snowball import SnowballStemmer
abc = SnowballStemmer("english", ignore_stopwords = False)

In [106]:
def stem_it(text):
    return [abc.stem(word) for word in text]

dt['text'] = dt['text'].apply(stem_it)

In [107]:
dt['text'][2]

['free',
 'entri',
 'in',
 '2',
 'a',
 'wkli',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

# Lemmitizer

In [108]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [109]:
def lemmat(text):
    return [lemmatizer.lemmatize(word, pos = 'a') for word in text]

In [110]:
dt['text'] = dt['text'].apply(lemmat)
dt['text'][2]

['free',
 'entri',
 'in',
 '2',
 'a',
 'wkli',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

# Stopwords Removal

In [111]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [112]:
def stop_wordss(text):
    stop = [word for word in text if not word in stop_words]
    return stop

In [113]:
dt['text'] = dt['text'].apply(stop_wordss)
dt['text'][2]

['free',
 'entri',
 '2',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 '87121',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

In [114]:
dt.head(5)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0


In [115]:
dt['text'] = dt['text'].apply(' '.join)
dt['text'][2]


'free entri 2 wkli comp win fa cup final tkts 21st may 2005. text fa 87121 receiv entri question(std txt rate)t&c appli 08452810075over18'

In [116]:
dt.head(5)

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


# TfidfFVectorizer

In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

y = dt.spam.values
x = tfidf.fit_transform(dt['text'])

In [184]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state = 1, test_size = 0.2, shuffle = True)

# Classification using Logistic Regression

In [182]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred, y_test)*100
print("accuracy", acc_log)

accuracy 91.66666666666666


# Classification using LinearSVC

In [185]:
from sklearn.svm import LinearSVC
linear_svc = LinearSVC(random_state = 0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_test)

acc_linear_svc= accuracy_score(y_pred, y_test)*100
print("accuracy", acc_linear_svc)

accuracy 83.33333333333334


# ---------xxxxx---------