In [None]:
#import libraries
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Load the training CSV into a DataFrame.
# Data set reference: https://www.kaggle.com/uciml/sms-spam-collection-dataset
# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8.
df_train = pd.read_csv(filepath_or_buffer='spam_train.csv', encoding='ISO-8859-1')
# Preview the first five rows (head() defaults to n=5).
df_train.head()

In [None]:
# Load the test CSV into a DataFrame, decoded as ISO-8859-1.
df_test = pd.read_csv(filepath_or_buffer='spam_test.csv', encoding='ISO-8859-1')
# Preview the first five rows (head() defaults to n=5).
df_test.head()

In [None]:
# Word tokenizer: every maximal run of word characters (letters, digits,
# underscore) becomes one token. The pattern has to be the raw string r'\w+':
# with the prefix written inside the quotes ('r\w+') the regex means "a literal
# r followed by word characters", which keeps only fragments starting with "r".
tokenizer = RegexpTokenizer(r'\w+')
# English stop words, held in a set so membership checks are constant time.
stopwords_english = set(stopwords.words('english'))

# One stemmer shared by all calls; building a new PorterStemmer for every
# single token is wasted work.
_stemmer = PorterStemmer()


#Tokenizing & stemming & removing stop words
def cleanSms(sms):
    """Normalise one SMS text.

    Steps, in order:
      1. replace every "<br /><br />" pair with a single space,
      2. lower-case the text,
      3. split it with the module-level ``tokenizer``,
      4. drop tokens found in ``stopwords_english``,
      5. reduce the remaining tokens to their Porter stems.

    Args:
        sms: The message text (str).

    Returns:
        str: the stems joined by single spaces, or "" if no token survives.
    """
    text = sms.replace("<br /><br />", " ").lower()
    stems = [
        _stemmer.stem(token)
        for token in tokenizer.tokenize(text)
        if token not in stopwords_english
    ]
    return ' '.join(stems)

In [None]:
#Build the X (message text) and Y (label) arrays
# The messages are used exactly as loaded; cleanSms is not applied here.
# Series.apply returns a new Series and leaves the DataFrame untouched, so
# calling it without keeping the result only costs time. The prediction cells
# also pass raw strings straight to vectorizer.transform, so the model has to
# be trained on raw text for train-time and predict-time features to agree.
# To train on cleaned text instead, build x_train / x_test from
# df['sms'].apply(cleanSms).values AND run every string through cleanSms
# before each vectorizer.transform call (including wherever the saved
# vectorizer is loaded).
x_train = df_train['sms'].values
y_train = df_train['category'].values

x_test = df_test['sms'].values
y_test = df_test['category'].values

In [None]:
#Vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer
# sublinear_tf=True replaces each raw term frequency tf with 1 + log(tf).
# `encoding` is only consulted when the vectorizer is given bytes or files;
# presumably it has no effect on values that come from an already decoded
# DataFrame -- confirm.
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='ISO-8859-1')
# Learn the vocabulary and IDF weights from the training messages and
# vectorize them in the same pass (equivalent to fit() followed by
# transform(), without analysing the corpus twice).
# NOTE: x_train / x_test are rebound from text arrays to TF-IDF matrices, so
# the cell that builds them must be re-run before this cell is run again.
x_train = vectorizer.fit_transform(x_train)
# The test messages are only transformed, never fitted on.
x_test = vectorizer.transform(x_test)

In [None]:
#Create model
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the L-BFGS solver, trained on the
# vectorized training messages and their labels.
model = LogisticRegression(solver='lbfgs')
model.fit(X=x_train, y=y_train)

In [None]:
#Predict Spam
# A lottery-style message intended as a spam example: vectorize it with the
# fitted vectorizer, then ask the model for its label.
spam_sample = ["you won $900 in the new lottery draw. Call +123456789."]
spam_features = vectorizer.transform(spam_sample)
model.predict(spam_features)

In [None]:
#Predict Ham
# An everyday greeting intended as a ham (non-spam) example: vectorize it
# with the fitted vectorizer, then ask the model for its label.
ham_sample = ["Hello there. How are you doing?"]
ham_features = vectorizer.transform(ham_sample)
model.predict(ham_features)

# Pickle operation

In [None]:
import joblib
# Persist the trained classifier and the fitted vectorizer to the working
# directory. Both files are needed to score new text: the vectorizer turns a
# message into features, the model classifies those features.
# joblib files are pickle-based, so only load them from a trusted source.
joblib.dump(value=model, filename='spam_ham_model.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.pkl')