### Spam Classifier Use Case

Data obtained from: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

##### Importing Libraries

In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

##### Reading Data

In [8]:
# The file is tab-separated with no header row, so the column names are supplied here.
# NOTE(review): with pandas' default quote handling, messages containing unbalanced
# double quotes may be merged into a neighbouring row — confirm the row count against
# the dataset's documented size.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [10]:
# Display the loaded frame to sanity-check the label and message columns.
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


##### Text Preprocessing

In [26]:
# Fetch the NLTK resources the preprocessing step relies on:
# the stop-word lists and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felipe_q/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/felipe_q/nltk_data...


True

In [27]:
# Shared lemmatizer instance, created once and reused for every message.
lemmatizer = WordNetLemmatizer()

# English stop words as a frozenset, built once: each membership test is a hash
# lookup instead of calling stopwords.words() and scanning the resulting list
# for every single token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_noise(str_noise):
    """Normalize a raw SMS message for bag-of-words vectorization.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, lemmatize each
    remaining token, and join the tokens back with single spaces.

    Parameters
    ----------
    str_noise : str
        Raw message text.

    Returns
    -------
    str
        Cleaned, space-separated tokens (an empty string if no token survives).
    """
    # Digits, punctuation and non-ASCII letters all become token separators.
    letters_only = re.sub('[^a-zA-Z]', ' ', str_noise)
    tokens = letters_only.lower().split()
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in _STOP_WORDS]
    return ' '.join(kept)


# Bracket assignment creates the column directly, so no placeholder column is needed.
data['message_clean'] = data['message'].apply(remove_noise)

In [28]:
# Display the frame again to compare the raw and cleaned messages side by side.
data

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,ham,Will ü b going to esplanade fr home?,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


#### Creating Bag of Words Model

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

# Baseline bag-of-words: one column per distinct token in the cleaned messages.
cv = CountVectorizer()
corpus = data['message_clean'].tolist()
# fit_transform returns a sparse count matrix; convert it to a dense array.
X = cv.fit_transform(corpus).toarray()

In [56]:
# Dimensions of the bag-of-words matrix: (messages, vocabulary terms).
X.shape

(5572, 7098)

The second dimension, 7098, is the vocabulary size: the number of unique words found across all cleaned messages.

##### To reduce the vocabulary to the most frequent words (the ones that actually carry information about the text), we set the `max_features` parameter

In [60]:
# Cap the vocabulary size; CountVectorizer keeps the top terms by corpus frequency.
MAX_FEATURES = 5000
cv = CountVectorizer(max_features=MAX_FEATURES)
corpus = [message for message in data.message_clean]
# Dense count matrix with at most MAX_FEATURES columns.
X = cv.fit_transform(corpus).toarray()

In [61]:
# Confirm the cap took effect: the column count should now equal max_features.
X.shape

(5572, 5000)

In [64]:
# Binary target: 1 for spam, 0 for ham.
# Selecting the positive class by name avoids depending on the alphabetical order of
# the dummy columns, and the explicit int cast keeps the dtype stable across pandas
# versions (get_dummies returns uint8 in older releases and bool in newer ones).
y = (data['label'] == 'spam').astype(int).values

In [67]:
# Sanity check: y is one-dimensional, with one label per message.
y.shape

(5572,)

### Model Training

In [68]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [69]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the word-count features.
spam_detect_model = MultinomialNB()
# fit() returns the estimator itself; re-binding keeps the cell free of output.
spam_detect_model = spam_detect_model.fit(X_train, y_train)

In [70]:
# Predict labels for the held-out test split.
y_pred = spam_detect_model.predict(X_test)

In [72]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [73]:
# Print the raw counts of the confusion matrix.
print(confusion_m)

[[951  15]
 [  7 142]]
