#### 1. Converting words or sentences into numeric vectors is fundamental when working with text data. To make sure that you have a solid handle on how these vectors work, generate the TF-IDF vectors for the last three sentences of the example from the beginning of this checkpoint (from the BoW revisited: TF-IDF section).



In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Sentences 1-3 of the example stay commented out for reference only;
# this exercise works with the last three sentences (s4-s6).
# s1 = "The best Monty Python sketch is the one about the dead parrot; I laughed so hard."
# s2 = "I laugh when I think about Python's Ministry of Silly Walks sketch; it is funny, funny, funny, the best!"
# s3 = "Chocolate is the best ice cream dessert topping, with a great taste."
s4, s5, s6 = (
    "The Lumberjack Song is the funniest Monty Python bit; I can't think of it without laughing.",
    "I would rather put strawberries on my ice cream for dessert; they have the best taste.",
    "The taste of caramel is a fantastic accompaniment to tasty mint ice cream.",
)

In [3]:
# Load spaCy's small English pipeline and parse the example sentences:
# s4 becomes the "comedy" document, s5 and s6 together the "ice cream" document.
nlp = spacy.load('en_core_web_sm')
com_doc = nlp(s4)
# Join with a space. Plain concatenation produces "...best taste.The taste...",
# which leaves the sentence boundary to how the tokenizer treats "taste.The".
ice_doc = nlp(" ".join([s5, s6]))

In [4]:
# Pair every sentence span with the topic label of the document it came from
com_sents = [[span, "comedy"] for span in com_doc.sents]
ice_sents = [[span, "ice cream"] for span in ice_doc.sents]

# Stack both labeled lists into a single two-column DataFrame
sentences = pd.DataFrame([*com_sents, *ice_sents], columns=["text", "type"])
sentences.head()

Unnamed: 0,text,type
0,"(The, Lumberjack, Song, is, the, funniest, Mon...",comedy
1,"(I, would, rather, put, strawberries, on, my, ...",ice cream
2,"(The, taste, of, caramel, is, a, fantastic, ac...",ice cream


In [5]:
def lemmatize_sentence(sentence):
    """Return the lemmas of ``sentence`` joined by single spaces.

    ``sentence`` is iterated token by token; each token must expose
    ``lemma_``, ``is_punct`` and ``is_stop``. Punctuation and stop-word
    tokens are dropped.

    A value that is already a ``str`` is returned unchanged, so running this
    cell a second time is a no-op instead of an ``AttributeError``.
    """
    if isinstance(sentence, str):
        return sentence
    return " ".join(
        token.lemma_
        for token in sentence
        if not token.is_punct and not token.is_stop
    )


# Get rid of stop words and punctuation, and lemmatize the tokens.
# One column-level assignment replaces the row-by-row .loc writes.
sentences["text"] = sentences["text"].map(lemmatize_sentence)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over single words, with smoothed idf and L2-normalized rows
vectorizer = TfidfVectorizer(use_idf=True, norm="l2", smooth_idf=True)

# Learn the vocabulary and idf weights, then vectorize every sentence
X = vectorizer.fit_transform(sentences["text"])

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
sentences = pd.concat([tfidf_df, sentences[["text", "type"]]], axis=1)

# With smooth_idf=True, scikit-learn uses idf = ln((1 + n) / (1 + df)) + 1,
# which is never 0. A TF-IDF score of 0 therefore means the term does not
# occur in that sentence at all.
sentences.head()

Unnamed: 0,accompaniment,bit,caramel,cream,dessert,fantastic,funniest,good,ice,laugh,...,mint,monty,python,song,strawberry,taste,tasty,think,text,type
0,0.0,0.353553,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.353553,...,0.0,0.353553,0.353553,0.353553,0.0,0.0,0.0,0.353553,Lumberjack Song funniest Monty Python bit thin...,comedy
1,0.0,0.0,0.0,0.349498,0.459548,0.0,0.0,0.459548,0.349498,0.0,...,0.0,0.0,0.0,0.0,0.459548,0.349498,0.0,0.0,strawberry ice cream dessert good taste,ice cream
2,0.385323,0.0,0.385323,0.293048,0.0,0.385323,0.0,0.0,0.293048,0.0,...,0.385323,0.0,0.0,0.0,0.0,0.293048,0.385323,0.0,taste caramel fantastic accompaniment tasty mi...,ice cream


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [9]:
# Target labels, and the feature matrix made of every column except the
# lemmatized text and the label itself
Y = sentences['type']
# Use the columns= keyword: the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.
X = np.array(sentences.drop(columns=['text', 'type']))

# Split the dataset into training and test sets.
# NOTE: with only three sentences the split leaves very few rows on each
# side, so the scores below are illustrative rather than meaningful.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=66)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

for model in (lr, rfc, gbc):
    model.fit(X_train, y_train)

# Report training and test accuracy for each fitted model
for title, model in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    print(f"----------------------{title} Scores----------------------")
    print('Training set score:', model.score(X_train, y_train))
    print('\nTest set score:', model.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Random Forest Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Gradient Boosting Scores----------------------
Training set score: 1.0

Test set score: 1.0


#### 2. In the 2-grams example above, you only used 2-grams as your features. This time, use both 1-grams and 2-grams together as your feature set. Run the same models as in the example and compare the results.

In [10]:
# TF-IDF over 2-grams only, with smoothed idf and L2-normalized rows
vectorizer = TfidfVectorizer(use_idf=True, norm="l2", smooth_idf=True, ngram_range=(2, 2))

# Learn the 2-gram vocabulary and idf weights, then vectorize every sentence
X = vectorizer.fit_transform(sentences["text"])

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Keep only "text" and "type" from the previous frame so the new features
# replace the earlier ones instead of piling up next to them.
sentences = pd.concat([tfidf_df, sentences[["text", "type"]]], axis=1)

# With smooth_idf=True, scikit-learn uses idf = ln((1 + n) / (1 + df)) + 1,
# which is never 0. A TF-IDF score of 0 therefore means the 2-gram does not
# occur in that sentence at all.
sentences.head()

Unnamed: 0,accompaniment tasty,bit think,caramel fantastic,cream dessert,dessert good,fantastic accompaniment,funniest monty,good taste,ice cream,lumberjack song,mint ice,monty python,python bit,song funniest,strawberry ice,taste caramel,tasty mint,think laugh,text,type
0,0.0,0.377964,0.0,0.0,0.0,0.0,0.377964,0.0,0.0,0.377964,0.0,0.377964,0.377964,0.377964,0.0,0.0,0.0,0.377964,Lumberjack Song funniest Monty Python bit thin...,comedy
1,0.0,0.0,0.0,0.467351,0.467351,0.0,0.0,0.467351,0.355432,0.0,0.0,0.0,0.0,0.0,0.467351,0.0,0.0,0.0,strawberry ice cream dessert good taste,ice cream
2,0.389888,0.0,0.389888,0.0,0.0,0.389888,0.0,0.0,0.29652,0.0,0.389888,0.0,0.0,0.0,0.0,0.389888,0.389888,0.0,taste caramel fantastic accompaniment tasty mi...,ice cream


In [11]:
# Target labels, and the feature matrix made of every column except the
# lemmatized text and the label itself
Y = sentences['type']
# Use the columns= keyword: the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.
X = np.array(sentences.drop(columns=['text', 'type']))

# Split the dataset into training and test sets.
# NOTE: with only three sentences the split leaves very few rows on each
# side, so the scores below are illustrative rather than meaningful.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=66)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

for model in (lr, rfc, gbc):
    model.fit(X_train, y_train)

# Report training and test accuracy for each fitted model
for title, model in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    print(f"----------------------{title} Scores----------------------")
    print('Training set score:', model.score(X_train, y_train))
    print('\nTest set score:', model.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Random Forest Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Gradient Boosting Scores----------------------
Training set score: 1.0

Test set score: 1.0


In [12]:
# TF-IDF over 1-grams and 2-grams together, with smoothed idf and
# L2-normalized rows
vectorizer = TfidfVectorizer(use_idf=True, norm="l2", smooth_idf=True, ngram_range=(1, 2))

# Learn the combined vocabulary and idf weights, then vectorize every sentence
X = vectorizer.fit_transform(sentences["text"])

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Keep only "text" and "type" from the previous frame so the new features
# replace the earlier ones instead of piling up next to them.
sentences = pd.concat([tfidf_df, sentences[["text", "type"]]], axis=1)

# With smooth_idf=True, scikit-learn uses idf = ln((1 + n) / (1 + df)) + 1,
# which is never 0. A TF-IDF score of 0 therefore means the n-gram does not
# occur in that sentence at all.
sentences.head()

Unnamed: 0,accompaniment,accompaniment tasty,bit,bit think,caramel,caramel fantastic,cream,cream dessert,dessert,dessert good,...,strawberry,strawberry ice,taste,taste caramel,tasty,tasty mint,think,think laugh,text,type
0,0.0,0.0,0.258199,0.258199,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.258199,0.258199,Lumberjack Song funniest Monty Python bit thin...,comedy
1,0.0,0.0,0.0,0.0,0.0,0.0,0.249204,0.327673,0.327673,0.327673,...,0.327673,0.327673,0.249204,0.0,0.0,0.0,0.0,0.0,strawberry ice cream dessert good taste,ice cream
2,0.274064,0.274064,0.0,0.0,0.274064,0.274064,0.208433,0.0,0.0,0.0,...,0.0,0.0,0.208433,0.274064,0.274064,0.274064,0.0,0.0,taste caramel fantastic accompaniment tasty mi...,ice cream


In [13]:
# Target labels, and the feature matrix made of every column except the
# lemmatized text and the label itself
Y = sentences['type']
# Use the columns= keyword: the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.
X = np.array(sentences.drop(columns=['text', 'type']))

# Split the dataset into training and test sets.
# NOTE: with only three sentences the split leaves very few rows on each
# side, so the scores below are illustrative rather than meaningful.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=66)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

for model in (lr, rfc, gbc):
    model.fit(X_train, y_train)

# Report training and test accuracy for each fitted model
for title, model in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    print(f"----------------------{title} Scores----------------------")
    print('Training set score:', model.score(X_train, y_train))
    print('\nTest set score:', model.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Random Forest Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Gradient Boosting Scores----------------------
Training set score: 1.0

Test set score: 1.0
