# Vectorizer Tuning

In [1]:
import pandas as pd

# Load the labelled movie reviews. The preview below shows an "Unnamed: 0"
# column next to 'target' and 'reviews' — presumably a row index that was
# saved into the CSV; verify before relying on it.
data = pd.read_csv("reviews.csv")

# Show the first five rows as a quick sanity check of the columns.
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [13]:
import re
def remove_punctuation(oldtext):
    """Replace every run of non-letter characters with a single space.

    Despite the name, more than punctuation is removed: digits, whitespace
    and any character outside ASCII ``A-Z`` / ``a-z`` are treated the same
    way. Letter case is left untouched.

    Args:
        oldtext: The raw text to clean.

    Returns:
        The text with each maximal run of non-letters collapsed to one space.
        A run at the start or end of the text therefore leaves a single
        leading or trailing space.
    """
    non_letter_runs = re.compile(r'[^A-Za-z]+')
    return non_letter_runs.sub(' ', oldtext)



# Derive the two columns used for modelling:
#   clean      - review text with non-letters collapsed to spaces, lowercased
#   num_target - integer label: 1 for 'pos', 0 for 'neg'
data['clean'] = data['reviews'].apply(remove_punctuation).str.lower()

# Any label other than 'pos'/'neg' maps to NaN, which makes astype(int) raise.
data['num_target'] = data['target'].map({'pos': 1, 'neg': 0}).astype(int)
data

Unnamed: 0,target,reviews,clean,num_target
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party dri...,0
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard s quick movie review damn th...,0
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...,0
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first featur...,0
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing ps...,0
...,...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow what a movie it s everything a movie can b...,1
1996,pos,"richard gere can be a commanding actor , but h...",richard gere can be a commanding actor but he ...,1
1997,pos,"glory--starring matthew broderick , denzel was...",glory starring matthew broderick denzel washin...,1
1998,pos,steven spielberg's second epic film on world w...,steven spielberg s second epic film on world w...,1


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Default settings for now; get_params() further down lists their values.
vectorizer = TfidfVectorizer()

In [4]:
# Learn the vocabulary and IDF weights from the cleaned reviews and build the
# TF-IDF document-term matrix in one step.
X = vectorizer.fit_transform(data['clean'])

In [6]:
# List the vectorizer's hyperparameters (all defaults here, since it was
# constructed without arguments).
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Two-step text classifier: TF-IDF features feeding a Multinomial Naive Bayes model.
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

In [12]:
from sklearn import set_config
# Ask scikit-learn to render estimators as an HTML diagram rather than a text repr.
set_config(display='diagram')
# Bare last expression: displays the pipeline using the setting above.
pipe

In [None]:
# Create Pipeline

# Set parameters to search (model and vectorizer)

# Perform grid search on pipeline

# v0

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from pprint import pprint
from time import time
import logging

# logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# First pass: bag-of-words counts -> TF-IDF weighting -> linear model trained with SGD.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        # Fixed seed: SGDClassifier shuffles the training data between epochs,
        # so without one the cross-validation scores change from run to run.
        ("clf", SGDClassifier(random_state=42)),
    ]
)

# Search space, keyed as "<step name>__<parameter>". Commented-out entries are
# further options that are not part of this search.
parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    "clf__max_iter": (20,),
    "clf__alpha": (0.00001, 0.000001),
    "clf__penalty": ("l2", "elasticnet"),
}
# n_jobs=-1 runs the fits on all available cores; verbose=1 reports the fit count.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)



In [18]:

    # NOTE(review): every line of this cell is indented one level. It ran in the
    # notebook (see the output below), but the same text is an IndentationError
    # as a plain Python script — consider dedenting.
    # Echo the search configuration before the slow fit starts.
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    # Wall-clock timing of the whole cross-validated search.
    t0 = time()
    grid_search.fit(data.clean, data.num_target)
    print("done in %0.3fs" % (time() - t0))
    print()

    # best_score_ is the mean cross-validated score of the winning candidate.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    # get_params() returns every pipeline parameter; only the searched ones are printed.
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (20,),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
done in 104.673s

Best score: 0.862
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


⚠️ Please push the exercise once you are done 🙃

## 🏁 

# V1

In [27]:
# Second pass: keep the vectorizer settings and penalty fixed at single values
# and explore larger regularisation strengths and several iteration budgets.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        # Fixed seed: SGDClassifier shuffles the training data between epochs,
        # so without one the cross-validation scores change from run to run.
        ("clf", SGDClassifier(random_state=42)),
    ]
)

# Search space, keyed as "<step name>__<parameter>".
# Each key must appear exactly once: a dict literal silently keeps only the
# last value given for a repeated key.
parameters = {
    "vect__max_df": (0.5,),
    "vect__ngram_range": ((1, 2),),  # unigrams + bigrams
    "clf__alpha": (0.0005, 0.0001),
    "clf__penalty": ("elasticnet",),
    "clf__max_iter": (10, 50, 80),
}
# n_jobs=-1 runs the fits on all available cores; verbose=1 reports the fit count.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# Echo the search configuration before the slow fit starts.
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Wall-clock timing of the whole cross-validated search.
t0 = time()
grid_search.fit(data.clean, data.num_target)
print("done in %0.3fs" % (time() - t0))

# best_score_ is the mean cross-validated score of the winning candidate.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
# get_params() returns every pipeline parameter; only the searched ones are printed.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.0005, 0.0001),
 'clf__max_iter': (10, 50, 80),
 'clf__penalty': ('elasticnet',),
 'vect__max_df': (0.5,),
 'vect__ngram_range': ((1, 2),)}
Fitting 5 folds for each of 6 candidates, totalling 30 fits




done in 50.713s
Best score: 0.873
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 10
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)




Fitting 5 folds for each of 16 candidates, totalling 80 fits
done in 180.194s
Best score: 0.871
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


Best score: 0.862
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


Best score: 0.872
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)

Best score: 0.873
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 10
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)

In [31]:
from sklearn.naive_bayes import MultinomialNB
# Same feature extraction as the SGD searches, with a Multinomial Naive Bayes
# classifier as the final step.
pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

# Only the classifier's alpha is searched; every other parameter keeps its default.
parameters = {"clf__alpha": (0.0005, 0.0001)}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# Echo the search configuration before fitting.
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Wall-clock timing of the whole cross-validated search.
t0 = time()
grid_search.fit(data["clean"], data["num_target"])
print("done in %0.3fs" % (time() - t0))

# Report the winning candidate, restricted to the parameters that were searched.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.0005, 0.0001)}
Fitting 5 folds for each of 2 candidates, totalling 10 fits
done in 4.861s
Best score: 0.758
Best parameters set:
	clf__alpha: 0.0005
