In [16]:
#Necessary Imports
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [17]:
#Reading The Data From File
data = pd.read_csv('SMSSpam', sep='\t', names=['Status','Message'])
data.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
#Length Of The DataFrame
len(data)

5572

In [19]:
#No Of Spams
sum(data.Status == 'spam')

747

In [20]:
#Converting The Status Variables To 0 And 1 For More Fesability
data.loc[data.Status == 'spam', 'Status'] = 0
data.loc[data.Status == 'ham', 'Status'] = 1
data.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
#Separating The DataFrame To Represent Features And Labels.
data_x = data['Message']
data_y = data['Status']

#Splitting The Data For Training And Testing
x_train, x_text, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=100)

In [22]:
#Example - #Implementing The Count Vectorizer
cv = CountVectorizer()
lst = ['Hi, How are you, what are you doing', 'Hey, Whats Up, How is it Going', 'Count Vectorizer is cool', 
       'Text learning is great']
lst_cv = cv.fit_transform(lst)
array = lst_cv.toarray()
print(array)

[[2 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 2]
 [0 0 0 0 1 0 1 0 1 1 1 0 0 1 0 0 1 0]
 [0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0]]


In [23]:
cv.inverse_transform(array[0])

[array(['are', 'doing', 'hi', 'how', 'what', 'you'], 
       dtype='<U10')]

In [24]:
#List OF Features Identified By CountVectorizer
print(cv.get_feature_names())

['are', 'cool', 'count', 'doing', 'going', 'great', 'hey', 'hi', 'how', 'is', 'it', 'learning', 'text', 'up', 'vectorizer', 'what', 'whats', 'you']


In [25]:
#Implementing The Above Count Vectorizer To SMSData
cv1 = CountVectorizer()
x_trainCV = cv1.fit_transform(x_train)
array = x_trainCV.toarray()
print(array)

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [26]:
#Total Number Of Features Identified
print(len(cv1.get_feature_names()))

7764


In [27]:
x_train.iloc[0]

'K da:)how many page you want?'

In [28]:
#Applying Inverse transform
print(cv1.inverse_transform(array[0]))

[array(['da', 'how', 'many', 'page', 'want', 'you'], 
      dtype='<U34')]


In [35]:
#Example - Implementing The TdIdfVectorizer
cv2 = TfidfVectorizer(stop_words = 'english')
lst_cv2 = cv2.fit_transform(lst)
array = lst_cv2.toarray()
print(array)

[[ 0.          0.          0.70710678  0.          0.          0.
   0.70710678  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.57735027  0.          0.57735027
   0.          0.          0.          0.          0.57735027]
 [ 0.57735027  0.57735027  0.          0.          0.          0.          0.
   0.          0.          0.57735027  0.        ]
 [ 0.          0.          0.          0.          0.57735027  0.          0.
   0.57735027  0.57735027  0.          0.        ]]


In [36]:
print(cv2.get_feature_names())

['cool', 'count', 'doing', 'going', 'great', 'hey', 'hi', 'learning', 'text', 'vectorizer', 'whats']


In [37]:
print(cv2.inverse_transform(array[0]))

[array(['doing', 'hi'], 
      dtype='<U10')]


In [38]:
print(lst[0])

Hi, How are you, what are you doing


In [45]:
#Implementing The Above TfIdf Vectorizer To SMSData
cv3 = TfidfVectorizer(stop_words='english')
x_train_TfIdf = cv3.fit_transform(x_train)
array = x_train_TfIdf.toarray()
print(array)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [46]:
#Total Number Of Features Identified.
print(len(cv3.get_feature_names()))

7499


In [47]:
x_train.iloc[0]

'K da:)how many page you want?'

In [50]:
#Applying The inverse Transform
print(cv3.inverse_transform(array[0]))

[array(['da', 'page', 'want'], 
      dtype='<U34')]


In [71]:
#Now Lets Apply The Above Created Vectorizer To Naive Bayes Classifier
y_train = y_train.astype('int')
y_test = np.array(y_test)
x_test_Tfidf = cv3.transform(x_text)
cls = MultinomialNB()
cls.fit(x_train_TfIdf,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
pred = cls.predict(x_test_Tfidf)
pred

array([1, 1, 0, ..., 0, 1, 1])

In [73]:
from sklearn.metrics import accuracy_score
print(accuracy_score(pred,y_test))

ValueError: Can't handle mix of binary and unknown

In [69]:
y_test

array([1, 1, 0, ..., 0, 1, 1], dtype=object)