In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('spam.csv', sep='\t')

In [3]:
df


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
761,spam,"Romantic Paris. 2 nights, 2 flights from £79 B..."
762,ham,"We are at grandmas. Oh dear, u still ill? I fe..."
763,spam,Urgent Ur £500 guaranteed award is still uncla...
764,ham,Nothing but we jus tot u would ask cos u ba gu...


In [4]:
df.groupby('Category').describe()


# to check how many of each we have

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,654,643,"Sorry, I'll call later",5
spam,112,108,#ERROR!,2


In [5]:
# we now have to convert all the text data to numbers

In [6]:
df['spam'] = df['Category'].apply(lambda x : 1 if x=='spam' else 0)

In [7]:
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
761,spam,"Romantic Paris. 2 nights, 2 flights from £79 B...",1
762,ham,"We are at grandmas. Oh dear, u still ill? I fe...",0
763,spam,Urgent Ur £500 guaranteed award is still uncla...,1
764,ham,Nothing but we jus tot u would ask cos u ba gu...,0


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

### We will now use Count Vectoriser Technique to convert the messsage columns text data into numbers

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Here X_train_count is the text converted into a number matrix using CountVectorizer

In [15]:
# lets now test our model

In [25]:
emails = [
    'Hey mohan, lets go for a football play tomorrow?',
    'Upto 20% off on parking, exclusive offer just for you. Dont miss the reward! Free Free'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 0])

In [27]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9935064935064936

In [28]:
# Above procedure can be shortened by using pipelines:


In [29]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [30]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [31]:
clf.score(X_train, y_train)

0.9934640522875817

# Exercise

## To classify wine into one of the three categories

In [35]:
from sklearn.datasets import load_wine
wine = load_wine()

In [36]:
dir(wine)

['DESCR', 'data', 'feature_names', 'target', 'target_names']

In [44]:
len(wine.data)


178

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2)

In [54]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

model_m = MultinomialNB()
model_g = GaussianNB()

In [55]:
model_m.fit(X_train, y_train)
model_m.score(X_test, y_test)

0.8611111111111112

In [56]:
model_g.fit(X_train, y_train)
model_g.score(X_test, y_test)

0.9444444444444444

In [57]:
# Lets find average using K Fold cross validation

In [59]:
from sklearn.model_selection import cross_val_score

score1 = cross_val_score(MultinomialNB(), wine.data, wine.target, cv=40)
score2 = cross_val_score(GaussianNB(), wine.data, wine.target, cv=40)

In [60]:
sum(score1)/len(score1)

0.8550000000000001

In [61]:
sum(score2)/len(score2)

0.9787500000000001