In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load only the label ("v1") and message ("v2") columns, then discard exact
# duplicate rows in the same expression instead of mutating the frame in place.
df = (
    pd.read_csv("../data/spam.csv", encoding="utf-8")[["v1", "v2"]]
    .drop_duplicates()
)

In [6]:
# Renumber the rows 0..n-1 after de-duplication so that the index-aligned
# assignment of the encoder output further down lines up row for row.
# The previous labels are preserved in a new "index" column (dropped later).
df = df.reset_index(drop=False)

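### Label encoding: 0 -> spam & 1 -> ham

Note: the encoded column is named `is_spam`, but with this mapping `is_spam == 0` marks spam and `is_spam == 1` marks ham.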

In [7]:
from sklearn.preprocessing import OrdinalEncoder
# Listing the categories explicitly pins the codes: "spam" -> 0.0, "ham" -> 1.0.
# NOTE(review): the target column is called "is_spam", yet spam rows receive 0;
# keep this inversion in mind when reading model outputs.
oe = OrdinalEncoder(categories=[["spam", "ham"]])
encoded = oe.fit_transform(df[["v1"]])
feature_names = oe.get_feature_names_out()
d2 = pd.DataFrame(encoded, columns=feature_names)
# d2 carries a fresh 0..n-1 index, so this assignment aligns with df by row label.
df["is_spam"] = d2[feature_names]

In [8]:
# The encoder produced floats (0.0 / 1.0); store the label as integers instead.
label_as_int = df["is_spam"].astype(int)
df["is_spam"] = label_as_int

In [9]:
# Remove the helper "index" column left by reset_index and the raw text label,
# which is now represented by the numeric "is_spam" column.
df = df.drop(columns=["index", "v1"])

In [10]:
# Character count of each message, measured on the raw (not yet cleaned) text.
df = df.assign(msg_len=df["v2"].str.len())

In [11]:
# Average message length per class (0 = spam, 1 = ham).
df.groupby("is_spam")["msg_len"].agg("mean")

is_spam
0    137.455657
1     70.457263
Name: msg_len, dtype: float64

In [17]:
import re,string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Invariants hoisted out of `processed` so they are built once, not per message.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def processed(text):
    """Normalize one message for vectorization.

    Steps, in order:
      1. lower-case the text;
      2. delete URLs (``http://``, ``https://`` or ``www.`` prefixes);
      3. delete every character in ``string.punctuation``;
      4. split on whitespace and drop English stop words.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (``""`` if none remain).

    Notes
    -----
    Punctuation is stripped *before* the stop-word check, so a contraction
    such as "don't" becomes "dont", which is not in the stop-word list and is
    therefore kept.

    The previous implementation called ``stopwords.words("english")`` once per
    token and searched the resulting list linearly; the cached frozenset gives
    the same result with a constant-time membership test.
    """
    lowered = text.lower()
    without_urls = _URL_PATTERN.sub(r"", lowered)
    without_punct = without_urls.translate(_PUNCT_TABLE)
    stop_set = _english_stopwords()
    return " ".join(tok for tok in without_punct.split() if tok not in stop_set)
# Display the English stop-word list that `processed` filters against.
stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [18]:
# Try the cleaner on one raw message and show the result.
sample_message = 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
txtpr = processed(sample_message)
txtpr

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [19]:
# Replace every raw message with its cleaned form (element-wise).
# Note: this overwrites "v2", so re-running the cell cleans already-cleaned text.
df["v2"] = df["v2"].map(processed)

In [20]:
# Inspect the frame after cleaning: "v2" now holds the processed text, while
# "msg_len" was computed earlier from the raw text.
df

Unnamed: 0,v2,is_spam,msg_len
0,go jurong point crazy available bugis n great ...,1,111
1,ok lar joking wif u oni,1,29
2,free entry 2 wkly comp win fa cup final tkts 2...,0,155
3,u dun say early hor u c already say,1,49
4,nah dont think goes usf lives around though,1,61
...,...,...,...
5165,2nd time tried 2 contact u u 750 pound prize 2...,0,160
5166,b going esplanade fr home,1,37
5167,pity mood soany suggestions,1,57
5168,guy bitching acted like id interested buying s...,1,125


In [21]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated token of `text` with the module-level
    stemmer `ps` and return the stems joined by single spaces."""
    tokens = text.split()
    stems = map(ps.stem, tokens)
    return " ".join(stems)

In [22]:
# Stem every cleaned message (element-wise), overwriting "v2" again.
df["v2"] = df["v2"].map(stem_words)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the cleaned, stemmed messages, keeping at most 3000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on the full data set before the
# train/test split below, so vocabulary and IDF weights also see the test rows.
tfidf = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf.fit_transform(df["v2"])
# Convert the sparse result to a dense NumPy array for the models below.
X = tfidf_matrix.toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df["is_spam"],
    test_size=0.2,
    random_state=42,
)

In [25]:
from sklearn.linear_model import LogisticRegression
# First model: logistic regression with scikit-learn's default settings,
# fitted on the training split.
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [26]:
# Predicted labels (0 = spam, 1 = ham) for the held-out rows.
y_pred = lr.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Share of held-out messages whose predicted label equals the true label.
accuracy_score(y_test,y_pred)

0.9622823984526112

In [28]:
from sklearn.svm import SVC
# Second model: support vector classifier with scikit-learn's default settings,
# fitted on the same training split as the logistic regression.
sv = SVC()
sv.fit(X_train,y_train)

In [29]:
# Evaluate the SVC on the held-out rows with the same accuracy metric
# used for the logistic regression.
y_sv_pred = sv.predict(X_test)
accuracy_score(y_test,y_sv_pred)

0.9777562862669246

In [30]:
def predict(text):
    """Classify a single message with the fitted SVC.

    The message is first run through `processed` and `stem_words`, the same
    cleaning applied to the training column before the TF-IDF vectorizer was
    fitted. Previously the raw text was vectorized directly, so stop words and
    unstemmed word forms at inference time did not match the training
    vocabulary.

    Parameters
    ----------
    text : str
        The message to classify (raw or already cleaned).

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted label: 0 = spam, 1 = ham.
    """
    cleaned = stem_words(processed(text))
    # The SVC was fitted on a dense array, so densify the single-row matrix too.
    transformed = tfidf.transform([cleaned]).toarray()
    return sv.predict(transformed)

In [31]:
import pickle
# Persist the fitted vectorizer and the SVC side by side.
# NOTE: anything that loads these must clean and stem its input text the same
# way the training column was prepared before calling the vectorizer.
for filename, artifact in (("vectorizer.pkl", tfidf), ("model.pkl", sv)):
    with open(filename, "wb") as fh:
        pickle.dump(artifact, fh)

In [32]:
# Look up the cleaned, stemmed text stored under row label 100.
df.loc[100, "v2"]

'okay name ur price long legal wen pick u ave x am xx'

In [33]:
# Spot-check the classifier on a near-copy of row 100's cleaned text
# (this input has "ams" where the stored text has the stem "am").
predict("okay name ur price long legal wen pick u ave x ams xx")

array([1])