In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled message dataset (columns: type, text) from a CSV in the working directory.
df = pd.read_csv("SPAM-210331-134237.csv")

In [3]:
# Preview the first three rows of the raw data.
df.head(n=3)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
# Encode the label as an integer target column: spam -> 1, ham -> 0.
df['spam'] = df['type'].map(dict(ham=0, spam=1)).astype(int)
df.head(n=4)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0


In [5]:
# Print a header line, then each column label on its own line.
print('Colums in the given data: ')
for column_name in list(df.columns):
    print(column_name)

Colums in the given data: 
type
text
spam


In [6]:
# (number of rows, number of columns) of the frame.
df.shape

(116, 3)

In [7]:
# Raw text of the row labelled 1, before any preprocessing.
df.loc[1, 'text']

'Ok lar... Joking wif u oni...'

In [8]:
def tokenizer(text):
    """Split a message into tokens on runs of whitespace.

    Punctuation stays attached to the neighbouring word because only
    whitespace is used as the delimiter.

    Args:
        text: the message string to split.

    Returns:
        list[str]: the whitespace-delimited tokens, in order.
    """
    tokens = text.split()
    return tokens

In [9]:
# Replace each message string with its list of whitespace tokens.
df['text'] = df['text'].map(tokenizer)

In [10]:
# Same row as before, now stored as a token list.
df.at[1, 'text']

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [11]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer; note that, despite the variable name, this is not
# NLTK's PorterStemmer class.
porter = SnowballStemmer(language='english', ignore_stopwords=False)

In [12]:
def stem_it(text):
    """Stem every token in a token list with the shared ``porter`` stemmer.

    Args:
        text: iterable of token strings.

    Returns:
        list: the stemmed tokens, in the same order as the input.
    """
    return list(map(porter.stem, text))

In [13]:
# Stem the tokens of every message.
df['text'] = df['text'].map(stem_it)

In [14]:
# Row 1 after stemming.
df.loc[1, 'text']

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [15]:
# A longer example (row 60) after stemming.
df.at[60, 'text']

['your',
 'gonna',
 'have',
 'to',
 'pick',
 'up',
 'a',
 '$1',
 'burger',
 'for',
 'yourself',
 'on',
 'your',
 'way',
 'home.',
 'i',
 "can't",
 'even',
 'move.',
 'pain',
 'is',
 'kill',
 'me.']

In [16]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by lemmit_it.
lemmatizer = WordNetLemmatizer()

In [17]:
def lemmit_it(text):
    """Lemmatize every token in a token list with the shared ``lemmatizer``.

    Each token is passed to the lemmatizer with ``pos='a'``.
    NOTE(review): ``pos='a'`` treats every token as an adjective - confirm
    that this is the intended part of speech.

    Args:
        text: iterable of token strings.

    Returns:
        list: the lemmatized tokens, in the same order as the input.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos='a'))
    return lemmas

In [18]:
import nltk
# Make sure the WordNet corpus is available locally before lemmatizing.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\amit kumar
[nltk_data]     choudhary\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
# Lemmatize the (already stemmed) tokens of every message.
df['text'] = df['text'].map(lemmit_it)

In [20]:
# Row 60 after lemmatization, for comparison with the stemmed version.
df.loc[60, 'text']

['your',
 'gonna',
 'have',
 'to',
 'pick',
 'up',
 'a',
 '$1',
 'burger',
 'for',
 'yourself',
 'on',
 'your',
 'way',
 'home.',
 'i',
 "can't",
 'even',
 'move.',
 'pain',
 'is',
 'kill',
 'me.']

In [21]:
import nltk
# Make sure the NLTK stop-word lists are available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\amit kumar
[nltk_data]     choudhary\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords
# English stop-word list consulted by stop_it.
stop_words = stopwords.words('english')

In [23]:
def stop_it(text):
    """Drop stop words from a list of tokens.

    Tokens are compared exactly as given, so a token with attached
    punctuation is kept even if the bare word is a stop word.

    Args:
        text: iterable of token strings.

    Returns:
        list: the tokens that are not in the module-level ``stop_words``,
        in their original order.
    """
    # Build a set once per call so each membership test is O(1) instead of
    # a linear scan over the stop-word list for every token.
    stop_set = set(stop_words)
    return [word for word in text if word not in stop_set]

In [24]:
# Remove stop words from every message's token list.
df['text'] = df['text'].map(stop_it)

In [25]:
# Row 60 after stop-word removal.
df.at[60, 'text']

['gonna',
 'pick',
 '$1',
 'burger',
 'way',
 'home.',
 "can't",
 'even',
 'move.',
 'pain',
 'kill',
 'me.']

In [26]:
# Turn each token list back into a single space-separated string.
df['text'] = df['text'].map(' '.join)

In [27]:
# Preview the first three rows after preprocessing.
df.head(n=3)

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Target vector: the 0/1 spam flag as a NumPy array.
y = df['spam'].to_numpy()
# Fit the vectorizer on every row of df['text'] and build the TF-IDF matrix.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df['text'])

In [29]:
from sklearn.model_selection import train_test_split
# Split the cleaned text *before* vectorizing so the held-out messages cannot
# influence the vocabulary or IDF weights (avoids train/test leakage).
# shuffle=False keeps the original row order: the first 80% of rows train,
# the last 20% test. random_state is omitted because it has no effect
# when shuffling is disabled.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, shuffle=False
)
# Learn vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer to transform the test messages.
x_train = tfidf.fit_transform(text_train)
x_test = tfidf.transform(text_test)

In [30]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with the library's default settings.
clf = LogisticRegression()

In [31]:
# Train on the training split, then show the predicted labels for the test split.
clf.fit(x_train, y_train).predict(x_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [32]:
# Mean accuracy of the logistic-regression model on the test split.
clf.score(x_test, y_test)

0.875

In [33]:
from sklearn.svm import LinearSVC
# Train a linear SVM with a fixed seed and keep its test-split predictions.
linear_svc = LinearSVC(random_state=0).fit(x_train, y_train)
y_pred = linear_svc.predict(x_test)


In [34]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred): pass the ground-truth labels first.
# The value is the same either way for plain accuracy, but the documented
# order keeps this call consistent with other metrics where order matters.
accuracy_score(y_test, y_pred)

0.875