In [1]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Prefer the
# current location and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Each is a different implementation of a text transform tool: Bag of Words & TF-IDF
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#### Let's first play with the Yelp data. Earlier, we performed sentiment analysis on this dataset using Random Forest. For this practice project, refer to our earlier code, i.e. [notebook 1](https://github.com/ga-students/DS-SF-24/blob/master/Code/Lecture13.ipynb) and [notebook 2](https://github.com/ga-students/DS-SF-24/blob/master/Code/Lecture13-Practice-Solution.ipynb)

In [2]:
# Read the tab-separated Yelp reviews into a DataFrame with explicit column
# names, discard every row that contains a missing value, then preview it.
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/yelp_labelled.txt"
Yelp_data = pd.read_csv(url, sep="\t", names=['text', 'sentiment']).dropna()
Yelp_data.head()

Unnamed: 0,text,sentiment
0,Wow... Loved this place.,1.0
3,Crust is not good.,0.0
4,Not tasty and the texture was just nasty.,0.0
10,Stopped by during the late May bank holiday of...,1.0
11,The selection on the menu was great and so wer...,1.0


#### Split the data into an 80% training set and a 20% test set. (Use random_state = 24)

In [3]:
# Features are the raw review strings; the target is the sentiment column.
X, y = Yelp_data['text'], Yelp_data['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=24
)

#### Here are a few libraries we need from here on

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `GridSearchCV` now lives in `sklearn.model_selection`. Prefer the current
# location and fall back only on very old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

#### Use Pipeline and define CountVectorizer() as 'vect' and Multinomial Naive Bayes as your 'clf' (classifier). Then set your parameters to:

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [5]:
# Two-step pipeline: bag-of-words counts ('vect') feeding a multinomial
# Naive Bayes classifier ('clf').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Candidate values for the search; each key is prefixed with the name of the
# pipeline step ('vect' or 'clf') whose parameter it sets.
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 150, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, .8, 1],
}

#### Using GridSearchCV, find the best parameters and use them to calculate the test error. Did you beat Random Forest?

In [7]:
# Search every combination in `parameters` with 10-fold cross-validation
# (n_jobs=-1 runs the fits in parallel), then score on the held-out test split.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)
fit_grid = gs_clf.fit(X_train, y_train)
fit_grid.score(X_test, y_test)

0.755

In [None]:
# RandomForest score = 0.78500000000000003

#### Use Pipeline and define CountVectorizer() as 'vect' and Bernoulli Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [9]:
# Two-step pipeline: bag-of-words counts ('vect') feeding a Bernoulli
# Naive Bayes classifier ('clf').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', BernoulliNB()),
])

# Candidate values for the search; each key is prefixed with the name of the
# pipeline step ('vect' or 'clf') whose parameter it sets.
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 150, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, .8, 1],
}

#### Using GridSearchCV, find the best parameters and use them to calculate the test error. Did you beat Random Forest?


In [10]:
# Search every combination in `parameters` with 10-fold cross-validation
# (n_jobs=-1 runs the fits in parallel), then score on the held-out test split.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)
fit_grid = gs_clf.fit(X_train, y_train)
fit_grid.score(X_test, y_test)

0.72999999999999998

#### What parameters are chosen by GridSearchCV?

In [11]:
# Display the parameter combination selected by the most recently fitted grid search
fit_grid.best_params_

{'clf__alpha': 0.2, 'vect__max_df': 200, 'vect__min_df': 2}

#### Now it's time for a new dataset! Let's play with the SMS dataset. We would like to develop a model that filters spam/ham text messages. Let's explore this dataset first.

In [12]:
import pandas as pd

# Load the tab-separated SMS collection and label its two columns.
# header=0 combined with names= makes pandas treat the file's first line as a
# header row and replace it with `col_names`.
# NOTE(review): confirm the TSV really starts with a header line; otherwise the
# first message is silently dropped.
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/SMSSpamCollection.tsv"
col_names = ['label', 'message']
smsData = pd.read_csv(url, sep='\t', header=0, names=col_names)

# Preview the first five rows
smsData.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# (number of rows, number of columns) of the loaded SMS frame
smsData.shape

(5572, 2)

#### Repeat the procedure you applied to the Yelp data on the SMS data. Do you get better results with Bernoulli Naive Bayes or Multinomial Naive Bayes? What is the best score on the test set using the best tuning parameters?

In [16]:
# Features are the raw message strings; the target is the label column.
X, y = smsData['message'], smsData['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=24
)

In [19]:
# Bag-of-words counts ('vect') feeding a multinomial Naive Bayes classifier ('clf')
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Candidate values; each key is prefixed with the pipeline step it configures
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 150, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, .8, 1],
}

# 10-fold cross-validated search over the grid, then score on the test split
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)
fit_grid = gs_clf.fit(X_train, y_train)
fit_grid.score(X_test, y_test)

0.98475336322869955

In [20]:
# Display the parameter combination selected by the most recently fitted grid search
fit_grid.best_params_

{'clf__alpha': 0.8, 'vect__max_df': 500, 'vect__min_df': 1}

In [25]:
# Multinomial Naive Bayes pipeline, refitted with a one-point grid
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Single candidate per parameter, so the search evaluates exactly one combination
parameters = {
    'clf__alpha': [0.8],
    'vect__max_df': [500],
    'vect__min_df': [1],
}

# 10-fold cross-validation on that combination, then score on the test split
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)
fit_grid = gs_clf.fit(X_train, y_train)
fit_grid.score(X_test, y_test)

0.98475336322869955

In [21]:
# Bag-of-words counts ('vect') feeding a Bernoulli Naive Bayes classifier ('clf')
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', BernoulliNB()),
])

# Candidate values; each key is prefixed with the pipeline step it configures
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 150, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, .8, 1],
}

# 10-fold cross-validated search over the grid, then score on the test split
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)
fit_grid = gs_clf.fit(X_train, y_train)
fit_grid.score(X_test, y_test)

0.98475336322869955

In [22]:
# Display the parameter combination selected by the most recently fitted grid search
fit_grid.best_params_

{'clf__alpha': 0.1, 'vect__max_df': 500, 'vect__min_df': 2}

In [24]:
# Bernoulli Naive Bayes pipeline, refitted with a one-point grid
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', BernoulliNB()),
])

# Single candidate per parameter, so the search evaluates exactly one combination
parameters = {
    'clf__alpha': [0.1],
    'vect__max_df': [500],
    'vect__min_df': [2],
}

# 10-fold cross-validation on that combination, then score on the test split
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)
fit_grid = gs_clf.fit(X_train, y_train)
fit_grid.score(X_test, y_test)

0.98475336322869955

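Answer: they perform about the same. With their best parameters, both Multinomial and Bernoulli Naive Bayes reach a test-set score of about 0.985.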

#### Print out misclassified instances in your test set. 

In [26]:
# Misclassified instances
count  = range(len(y_test))
for i in count:
    if fit_grid.predict(X_test)[i] != y_test.values[i]:
        print (X_test.values[i])

2/2 146tf150p
SMS. ac sun0819 posts HELLO:"You seem cool, wanted to say hi. HI!!!" Stop? Send STOP to 62468
LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323.
Hi this is Amy, we will be sending you a free phone number in a couple of days, which will give you an access to all the adult parties...
You can donate £2.50 to UNICEF's Asian Tsunami disaster support fund by texting DONATE to 864233. £2.50 will be added to your next bill
Check Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo
Cps is causing the outages to conserve energy.
K:)eng rocking in ashes:)
Not heard from U4 a while. Call me now am here all night with just my knickers on. Make me beg for it like U did last time 01223585236 XX Luv Nikiyu4.net
For sale - arsenal dartboard. Good condition but no doubles or trebles!
Gettin rdy to ship comp
Sorry I missed your call let's talk when you have the time. I'm on 0