# Challenge: Build your own NLP model

Choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

- Data cleaning / processing / language parsing
- Create features using two different NLP methods: For example, BoW vs tf-idf.
- Use the features to fit supervised learning models for each feature set to predict the category outcomes.
- Assess your models using cross-validation and determine whether one model performed better.
- Pick one of the models and try to increase accuracy by at least 5 percentage points.


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import brown, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print('*** Categories:\n     ', brown.categories())
print('*** Adventure Words:\n     ', brown.words(categories='adventure'))
print('*** Romance and Sci-Fi Sentences:\n     ', brown.sents(categories=['romance', 'science_fiction']))

*** Categories:
      ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
*** Adventure Words:
      ['Dan', 'Morgan', 'told', 'himself', 'he', 'would', ...]
*** Romance and Sci-Fi Sentences:
      [['Now', 'that', 'he', 'knew', 'himself', 'to', 'be', 'self', 'he', 'was', 'free', 'to', 'grok', 'ever', 'closer', 'to', 'his', 'brothers', ',', 'merge', 'without', 'let', '.'], ["Self's", 'integrity', 'was', 'and', 'is', 'and', 'ever', 'had', 'been', '.'], ...]


## Data Cleaning and Language Parsing

In [2]:
nlp = spacy.load('en')

def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

all_category_texts = {}
for category in ['humor', 'science_fiction']:
    current_categorys_lemmatized_sents = []
    category_sentences = brown.sents(categories=category)
    for original_sent in category_sentences:
        sent = text_cleaner(' '.join(original_sent))
        sent_doc = nlp(sent)
        sent = [
            token.lemma_.lower()
            for token in sent_doc
            if not token.is_stop
            and not token.is_punct
        ]
        current_categorys_lemmatized_sents.append([' '.join(original_sent), sent])
    all_category_texts[category] = current_categorys_lemmatized_sents

for genre, sentences in all_category_texts.items():
    print('***', genre, '***\n     ', sentences[:2])

*** humor ***
      [['It was among these that Hinkle identified a photograph of Barco ! !', ['-pron-', 'hinkle', 'identify', 'photograph', 'barco']], ["For it seems that Barco , fancying himself a ladies' man ( and why not , after seven marriages ? ?", ['for', 'barco', 'fancy', 'lady', 'man', 'seven', 'marriage']]]
*** science_fiction ***
      [['Now that he knew himself to be self he was free to grok ever closer to his brothers , merge without let .', ['now', 'know', 'self', 'free', 'grok', 'closer', 'brother', 'merge', 'let']], ["Self's integrity was and is and ever had been .", ['self', "'s", 'integrity']]]


In [3]:
all_sentence_rows = []
for genre, genre_sentences in all_category_texts.items():
    sentence_row = [[sent[0], sent[1], genre] for sent in genre_sentences]
    all_sentence_rows = all_sentence_rows + sentence_row

sentences_df = pd.DataFrame(all_sentence_rows, columns=['original', 'lemmas', 'genre'])
print(sentences_df.shape)
print(sentences_df['genre'].value_counts())
sentences_df.head()

(2001, 3)
humor              1053
science_fiction     948
Name: genre, dtype: int64


Unnamed: 0,original,lemmas,genre
0,It was among these that Hinkle identified a ph...,"[-pron-, hinkle, identify, photograph, barco]",humor
1,"For it seems that Barco , fancying himself a l...","[for, barco, fancy, lady, man, seven, marriage]",humor
2,") , had listed himself for Mormon Beard roles ...","[list, mormon, beard, rol, instigation, fourth...",humor
3,Mills secured Barco's photograph from the gent...,"[mill, secure, barco, 's, photograph, gentlema...",humor
4,"On their way , they stopped at every gas stati...","[on, way, stop, gas, station, main, boulevard,...",humor


## NLP Feature Engineering

### Strategy 1 - Bag of Words

In [4]:
# List of 100 most common words
def bag_of_words(all_lemmas):
    return [item[0] for item in Counter(all_lemmas).most_common(50)]
    
# Create df with features for each word in our common word set
# Each value is the count of the times the word appears in each sentence
def bow_features(sentences_df, common_words):
    df = pd.DataFrame(columns=common_words)
    df['original'] = sentences_df['original']
    df['lemmas'] = sentences_df['lemmas']
    df['genre'] = sentences_df['genre']
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, lemmas in enumerate(df['lemmas']):
        lemmas = [lemma for lemma in lemmas if lemma in common_words]
        for lemma in lemmas:
            df.loc[i, lemma] += 1
        
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

all_bows = []

# Set up each genres bags
for genre, sentences in all_category_texts.items():
    print('*** Creating bag for', genre)
    all_lemmas = list(map(lambda sent: sent[1], sentences))
    all_lemmas = list(lemma for sent_of_lemmas in all_lemmas for lemma in sent_of_lemmas)
    bow = bag_of_words(all_lemmas)
    all_bows = all_bows + bow

# Combine bags for set of unique words
common_words = set(all_bows)
print(len(common_words), 'Common Words:', common_words)
word_counts = bow_features(sentences_df, common_words)
word_counts.head()

*** Creating bag for humor
*** Creating bag for science_fiction
73 Common Words: {'jack', 'old', 'good', 'letch', 'half', 'look', 'find', 'take', '-pron-', 'girl', 'welch', 'know', 'call', 'mother', 'head', 'thing', 'think', 'say', 'light', 'course', "'s", 'funny', 'a', 'mike', "b'dikkat", 'speak', 'ask', 'little', 'new', 'turn', 'go', 'shell', '``', 'grow', 'have', 'mr.', 'there', 'long', 'ship', 'time', 'day', 'people', 'year', 'feel', 'hal', 'room', 'tell', 'man', 'arlene', 'child', 'barco', 'non', 'and', 'mercer', 'earth', 'be', 'mind', 'body', 'word', 'the', 'but', 'what', 'ekstrohm', 'come', 'as', 'no', 'get', 'like', 'in', 'helva', 'not', 'work', 'way'}
Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000


Unnamed: 0,jack,old,good,letch,half,look,find,take,-pron-,girl,...,get,like,in,helva,not,work,way,original,lemmas,genre
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,It was among these that Hinkle identified a ph...,"[-pron-, hinkle, identify, photograph, barco]",humor
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"For it seems that Barco , fancying himself a l...","[for, barco, fancy, lady, man, seven, marriage]",humor
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,") , had listed himself for Mormon Beard roles ...","[list, mormon, beard, rol, instigation, fourth...",humor
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Mills secured Barco's photograph from the gent...,"[mill, secure, barco, 's, photograph, gentlema...",humor
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,"On their way , they stopped at every gas stati...","[on, way, stop, gas, station, main, boulevard,...",humor


### Strategy 2 - tf-idf

In [5]:
just_sentences = list(map(lambda sent: ' '.join(sent[1]), sentences))

vectorizer = TfidfVectorizer(max_df=0.1, min_df=0.01, stop_words='english', lowercase=True, use_idf=True, norm=u'l2', smooth_idf=True)
sentences_tfidf = vectorizer.fit_transform(just_sentences)
sentences_tfidf_csr = sentences_tfidf.tocsr()

num_sentences = sentences_tfidf_csr.shape[0]
tfidf_by_sent = [{} for _ in range(0,num_sentences)]

terms = vectorizer.get_feature_names()
print(len(terms), 'features:', terms)

for i, j in zip(*sentences_tfidf_csr.nonzero()):
    tfidf_by_sent[i][terms[j]] = sentences_tfidf_csr[i, j]

print('\nOriginal sentence:', just_sentences[:6])
print('Tf_idf vector:', tfidf_by_sent[:6])

65 features: ['angel', 'ask', 'body', 'brain', 'brother', 'central', 'come', 'course', 'day', 'dikkat', 'earth', 'ekstrohm', 'eye', 'face', 'feel', 'gabriel', 'good', 'grow', 'hal', 'half', 'hand', 'happen', 'happy', 'head', 'help', 'helva', 'hesperus', 'jack', 'jubal', 'know', 'lady', 'lie', 'light', 'like', 'little', 'live', 'long', 'look', 'man', 'mean', 'mercer', 'mike', 'mind', 'need', 'night', 'people', 'planet', 'power', 'ryan', 'say', 'shell', 'ship', 'sleep', 'sound', 'speak', 'tell', 'thing', 'think', 'time', 'turn', 'want', 'way', 'word', 'work', 'year']

Original sentence: ['now know self free grok closer brother merge let', "self 's integrity", 'mike stop cherish brother selv three fulfil mars corporate discorporate precious earth unknown power earth merge cherish long wait grokk cherish', 'mike remain trance', 'grok loose end puzzle fit grow see hear archangel foster tabernacle cusp digby come face face bishop senator boone warily uneasy miss dawn ardent taste like water 

In [6]:
tfidf_terms = list(map(lambda term: 'tfidf_' + term, terms))

tfidf_df = pd.DataFrame(columns=tfidf_terms)
tfidf_df['original'] = sentences_df['original']
tfidf_df['lemmas'] = sentences_df['lemmas']
tfidf_df['genre'] = sentences_df['genre']
tfidf_df.loc[:, tfidf_terms] = 0.0000000001

for idx, each_dict in enumerate(tfidf_by_sent):
    for keyword, val in each_dict.items():
        tfidf_df.loc[idx, 'tfidf_' + keyword] = tfidf_by_sent[idx][keyword]

tfidf_df.head()

Unnamed: 0,tfidf_angel,tfidf_ask,tfidf_body,tfidf_brain,tfidf_brother,tfidf_central,tfidf_come,tfidf_course,tfidf_day,tfidf_dikkat,...,tfidf_time,tfidf_turn,tfidf_want,tfidf_way,tfidf_word,tfidf_work,tfidf_year,original,lemmas,genre
0,1e-10,1e-10,1e-10,1e-10,0.799924,1e-10,1e-10,1e-10,1e-10,1e-10,...,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,It was among these that Hinkle identified a ph...,"[-pron-, hinkle, identify, photograph, barco]",humor
1,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,...,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,"For it seems that Barco , fancying himself a l...","[for, barco, fancy, lady, man, seven, marriage]",humor
2,1e-10,1e-10,1e-10,1e-10,0.37915,1e-10,1e-10,1e-10,1e-10,1e-10,...,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,") , had listed himself for Mormon Beard roles ...","[list, mormon, beard, rol, instigation, fourth...",humor
3,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,...,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,Mills secured Barco's photograph from the gent...,"[mill, secure, barco, 's, photograph, gentlema...",humor
4,1e-10,1e-10,1e-10,1e-10,0.370968,1e-10,0.337492,1e-10,1e-10,1e-10,...,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10,"On their way , they stopped at every gas stati...","[on, way, stop, gas, station, main, boulevard,...",humor


In [7]:
whole_df = word_counts.combine_first(tfidf_df)
print(whole_df.shape)
whole_df.head()

(2001, 141)


Unnamed: 0,'s,-pron-,``,a,and,arlene,as,ask,b'dikkat,barco,...,thing,think,time,turn,way,welch,what,word,work,year
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Supervised Learning Models

### BoW

In [8]:
bow_Y = word_counts['genre']
bow_X = np.array(word_counts.drop(['original','lemmas', 'genre'], 1))

bow_rfc = ensemble.RandomForestClassifier()
bow_lr = LogisticRegression()
bow_clf = ensemble.GradientBoostingClassifier()

### tf-idf

In [9]:
tfidf_Y = tfidf_df['genre']
tfidf_X = np.array(tfidf_df.drop(['original','lemmas', 'genre'], 1))

tfidf_rfc = ensemble.RandomForestClassifier()
tfidf_lr = LogisticRegression()
tfidf_clf = ensemble.GradientBoostingClassifier()

### Both BoW and tf-idf

In [10]:
whole_Y = whole_df['genre']
whole_X = np.array(whole_df.drop(['original','lemmas', 'genre'], 1))

whole_rfc = ensemble.RandomForestClassifier()
whole_lr = LogisticRegression()
whole_clf = ensemble.GradientBoostingClassifier()

## Model Evaluation

### BoW

In [11]:
print('Random Forest Classifier:', cross_val_score(bow_rfc, bow_X, bow_Y, cv=5))
print('Logistic Regression:', cross_val_score(bow_lr, bow_X, bow_Y, cv=5))
print('Gradient Boosting Classifier:', cross_val_score(bow_clf, bow_X, bow_Y, cv=5))

bow_rfc.fit(bow_X, bow_Y)

bow_cols_to_note = []
for col, feature_imp in zip(word_counts.columns, bow_rfc.feature_importances_):
    if feature_imp > 0.019:
        bow_cols_to_note.append(col)

Random Forest Classifier: [0.58104738 0.57356608 0.6084788  0.5914787  0.53634085]
Logistic Regression: [0.52618454 0.6159601  0.63341646 0.61654135 0.55889724]
Gradient Boosting Classifier: [0.54114713 0.6084788  0.58852868 0.63157895 0.55889724]


In [12]:
bow_cols_to_note

['jack',
 '-pron-',
 "'s",
 '``',
 'ship',
 'hal',
 'man',
 'mercer',
 'be',
 'the',
 'ekstrohm',
 'helva',
 'not']

### tf-idf

In [13]:
print('Random Forest Classifier:', cross_val_score(tfidf_rfc, tfidf_X, tfidf_Y, cv=5))
print('Logistic Regression:', cross_val_score(tfidf_lr, tfidf_X, tfidf_Y, cv=5))
print('Gradient Boosting Classifier:', cross_val_score(tfidf_clf, tfidf_X, tfidf_Y, cv=5))

Random Forest Classifier: [0.72319202 0.78553616 0.78304239 0.81453634 0.6641604 ]
Logistic Regression: [0.74064838 0.79551122 0.78553616 0.81453634 0.66917293]
Gradient Boosting Classifier: [0.69825436 0.73316708 0.72817955 0.77694236 0.65914787]


### Both BoW and tf-idf

In [14]:
print('Random Forest Classifier:', cross_val_score(whole_rfc, whole_X, whole_Y, cv=5))
print('Logistic Regression:', cross_val_score(whole_lr, whole_X, whole_Y, cv=5))
print('Gradient Boosting Classifier:', cross_val_score(whole_clf, whole_X, whole_Y, cv=5))

Random Forest Classifier: [0.6957606  0.72568579 0.75062344 0.76942356 0.68922306]
Logistic Regression: [0.73067332 0.74563591 0.77805486 0.8245614  0.66917293]
Gradient Boosting Classifier: [0.70074813 0.69077307 0.72069825 0.75689223 0.64411028]


## Increase Accuracy

I will tune the hyperparameters of my Random Forest Classifier for my tf-idf features to increase accuracy.

In [15]:
tfidf_rfc = ensemble.RandomForestClassifier(n_estimators=100, max_features=5)
print('Random Forest Classifier:', cross_val_score(tfidf_rfc, tfidf_X, tfidf_Y, cv=5))

Random Forest Classifier: [0.74064838 0.78553616 0.78553616 0.81704261 0.66666667]


In [17]:
new_df = tfidf_df.copy()

for col in bow_cols_to_note:
    new_df[col] = word_counts[col]
    
tfidf_Y = new_df['genre']
tfidf_X = np.array(new_df.drop(['original','lemmas', 'genre'], 1))

tfidf_rfc = ensemble.RandomForestClassifier(n_estimators=100, max_features=5)
print('Random Forest Classifier:', cross_val_score(tfidf_rfc, tfidf_X, tfidf_Y, cv=5))

Random Forest Classifier: [0.72319202 0.78054863 0.76558603 0.78947368 0.6641604 ]
