# Vectorizer Tuning

In [14]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the labelled movie reviews; the path is relative to the notebook's working directory.
data = pd.read_csv("reviews.csv")

# Preview the first rows to check which columns were read.
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [15]:
# Inspect the characters that the standard library's `string.punctuation` treats as punctuation.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
import string
import re
def no_punctuation(reviews):
    """Return `reviews` with every character listed in `string.punctuation` dropped."""
    kept_chars = []
    for symbol in reviews:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)
def removelines(value):
    """Collapse a multi-line string into a single line.

    Each line break is replaced by one space, so the last word of a line and
    the first word of the next line remain separate tokens instead of being
    fused together (e.g. "good\\nmovie" -> "good movie", not "goodmovie").

    Parameters
    ----------
    value : str
        Text that may contain line breaks (any boundary recognised by
        ``str.splitlines``).

    Returns
    -------
    str
        The lines of `value` joined by single spaces; an empty string stays empty.
    """
    return ' '.join(value.splitlines())
def lower_text(clean_text):
    """Return a lowercase copy of `clean_text`."""
    return clean_text.lower()
def remove_num(clean_text):
    """Delete every run of digits from `clean_text`; nothing is inserted in its place."""
    digit_run = r'\d+'
    return re.sub(digit_run, '', clean_text)
# Run the cleaning steps in sequence on the review text and store the result back in place.
data['reviews'] = (
    data['reviews']
    .apply(removelines)
    # lowercase, trim, and turn every non-alphanumeric character into a space
    .apply(lambda text: re.sub(r"[^a-zA-Z0-9]", " ", text.lower().strip()))
    .apply(no_punctuation)
    .apply(lower_text)
    .apply(remove_num)
)
data.head()

Unnamed: 0,target,reviews
0,neg,plot two teen couples go to a church party ...
1,neg,the happy bastard s quick movie review damn th...
2,neg,it is movies like these that make a jaded movi...
3,neg,quest for camelot is warner bros first...
4,neg,synopsis a mentally unstable man undergoing ...


In [17]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# NLTK's English stop words, followed by a few extra tokens we also want filtered out.
stop_words = [
    *stopwords.words("english"),
    'first', 'second', 'third', 'me', 'haha', 'lol', 'oof', 'cds',
]
lemmatizer = WordNetLemmatizer()

In [18]:
# Lemmatize
import nltk
from nltk.tokenize import word_tokenize
def lemmatizing_text(clean_text):
    """Drop stop words and lemmatize the remaining tokens of every entry.

    Parameters
    ----------
    clean_text : pandas.Series
        Series of strings (the function only relies on its ``.apply`` method).

    Returns
    -------
    The result of ``clean_text.apply``: for each entry, the tokens that are
    not in the module-level ``stop_words``, lemmatized with the module-level
    ``lemmatizer`` and re-joined with single spaces.
    """
    # Build the lookup once per call: a set gives constant-time membership
    # tests instead of scanning the whole stop-word list for every token.
    excluded = set(stop_words)

    def _lemmatize_review(review):
        # Tokenize one review, filter stop words, lemmatize what is left.
        tokens = nltk.word_tokenize(review)
        return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in excluded)

    return clean_text.apply(_lemmatize_review)

# Replace the reviews with the output of lemmatizing_text and preview the result.
data['reviews'] = data['reviews'].pipe(lemmatizing_text)

data.head()

Unnamed: 0,target,reviews
0,neg,plot two teen couple go church party drink dri...
1,neg,happy bastard quick movie review damn yk bug g...
2,neg,movie like make jaded movie viewer thankful in...
3,neg,quest camelot warner bros feature length fully...
4,neg,synopsis mentally unstable man undergoing psyc...


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [None]:
# Create Pipeline

# Set parameters to search (model and vectorizer)

# Perform grid search on pipeline
#tfidf

In [19]:
# Pipeline: token counts -> TF-IDF transform -> multinomial Naive Bayes classifier.
text_vect = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [20]:
# Hyper-parameter grid to search; each key is `<pipeline step name>__<parameter>`.
tuned_paramet = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=[1, 1e-1, 1e-2],
)

In [21]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, roc_auc_score, precision_score, balanced_accuracy_score, recall_score, f1_score, make_scorer

In [22]:
# Hold out 20% of the reviews for the final evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data['reviews'],
    data['target'],
    test_size=0.2,
    random_state=42,
)

In [23]:
# Name of the metric handed to GridSearchCV's `scoring` argument for model selection.
score = 'f1_macro'
# Background reading on the macro F1 score:
# https://peltarion.com/knowledge-center/documentation/evaluation-view/classification-loss-metrics/macro-f1-score

In [24]:
# Perform grid search on pipeline
#tfidf

from sklearn.metrics import classification_report
# 10-fold cross-validated search over the parameter grid, ranked by `score`.
clf = GridSearchCV(estimator=text_vect, param_grid=tuned_paramet, scoring=score, cv=10)
clf.fit(x_train, y_train)

# Per-class metrics on the held-out split, predicted by the fitted search object.
print(classification_report(y_true=y_test, y_pred=clf.predict(x_test), digits=4))

              precision    recall  f1-score   support

         neg     0.8187    0.7940    0.8061       199
         pos     0.8019    0.8259    0.8137       201

    accuracy                         0.8100       400
   macro avg     0.8103    0.8099    0.8099       400
weighted avg     0.8103    0.8100    0.8099       400



In [25]:
# Same report on the training split, for comparison with the held-out scores.
print(classification_report(y_true=y_train, y_pred=clf.predict(x_train), digits=4))

              precision    recall  f1-score   support

         neg     0.9863    0.9888    0.9875       801
         pos     0.9887    0.9862    0.9875       799

    accuracy                         0.9875      1600
   macro avg     0.9875    0.9875    0.9875      1600
weighted avg     0.9875    0.9875    0.9875      1600



In [26]:
# best_score_: mean cross-validated value of the `scoring` metric for the best parameter combination.
print("Best Score: ", clf.best_score_)
# best_params_: the parameter combination that achieved that score.
print("Best Params: ", clf.best_params_)

Best Score:  0.8248800931358703
Best Params:  {'clf__alpha': 1, 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


4. Conclusion
The best pipeline found by the grid search reached a mean cross-validated macro-F1 of about 82% on the training split,
and about 81% accuracy (macro-F1 0.81) on the held-out test split.



⚠️ Please push the exercise once you are done 🙃

## 🏁 