<a href="https://colab.research.google.com/github/zerotodeeplearning/ztdl-masterclasses/blob/master/notebooks/Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Learn with us: www.zerotodeeplearning.com

Copyright © 2021: Zero to Deep Learning ® Catalit LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Natural Language Processing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tqdm.notebook import tqdm_notebook
import gzip
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix

import nltk
from nltk.corpus import stopwords

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# Fetch the NLTK resources used in this notebook: the stop-word list, the
# tokenizer models behind `word_tokenize`, and the WordNet data used by
# `WordNetLemmatizer`.
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; both are requested so the notebook runs on old and new versions.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
# Base URL of the course's data folder on GitHub; file names are appended to it below.
url = "https://raw.githubusercontent.com/zerotodeeplearning/ztdl-masterclasses/master/data/"

In [None]:
# Download (and cache) the two review files.
# Each file is a single gzip-compressed text file, not a tar/zip archive, so
# `extract=True` has nothing to unpack and is omitted: `get_file` then simply
# returns the path of the downloaded, still-compressed file, which is what the
# `gzip.open` calls further down expect.
pos_path = tf.keras.utils.get_file(
    'rotten_tomatoes_positive_reviews.txt',
    url + 'rotten_tomatoes_positive_reviews.txt.gz')
neg_path = tf.keras.utils.get_file(
    'rotten_tomatoes_negative_reviews.txt',
    url + 'rotten_tomatoes_negative_reviews.txt.gz')

In [None]:
!head {pos_path}

In [None]:
# Read each compressed file in binary mode and decode it line by line.
# Every list entry is one line of the file, trailing newline included.
with gzip.open(pos_path) as fin:
  pos_rev = [raw_line.decode('utf-8') for raw_line in fin]

with gzip.open(neg_path) as fin:
  neg_rev = [raw_line.decode('utf-8') for raw_line in fin]

In [None]:
# Peek at the first three positive reviews.
pos_rev[:3]

In [None]:
# Peek at the first four negative reviews.
neg_rev[:4]

In [None]:
# Number of positive reviews loaded (one list entry per line of the file).
len(pos_rev)

In [None]:
# Number of negative reviews loaded (one list entry per line of the file).
len(neg_rev)

In [None]:
# Stack both review lists into a single array of documents, positives first.
docs = np.array([*pos_rev, *neg_rev])
# Labels in the same order: 1 for every positive review, 0 for every negative one.
y = np.array([1 for _ in pos_rev] + [0 for _ in neg_rev])

In [None]:
# Hold out 15% of the documents for testing; the fixed random_state makes the split repeatable.
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=0.15, random_state=0)

### TFIDF Classification

In [None]:
# TF-IDF features, with the vocabulary capped at 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

In [None]:
# Fit the vectorizer on the training documents only, then apply that same
# fitted transformation to the test documents.
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

In [None]:
# Logistic regression with the liblinear solver. C is the inverse of the
# regularization strength, so C=10 regularizes less than the default C=1.
model = LogisticRegression(solver='liblinear', C=10)

In [None]:
# Train the classifier on the TF-IDF training matrix.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training set.
model.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test set.
model.score(X_test, y_test)

### Exercise 1: Feature importances

What are the top words indicative of a positive or a negative review? Let's find out:

- get the feature names from the `vectorizer` using the `.get_feature_names_out` method (`.get_feature_names` in scikit-learn versions before 1.0)
- get the feature importances from the Logistic Regression using the `.coef_` attribute
- wrap the coefficients in a Pandas series, with the names as index and rank them by value
- select the top and bottom 20 features and print them
- combine these top features (the most positive and the most negative ones) into a single list of keywords and name it `top_features`; the dispersion plot in the next section uses it

### Text exploration with NLTK

In [None]:
# Concatenate all reviews of each class into one long string, separated by spaces.
positive_reviews_concat = ' '.join(pos_rev)
negative_reviews_concat = ' '.join(neg_rev)

In [None]:
# Join the two corpora with an explicit space so that the last token of the
# positive text and the first token of the negative text cannot run together
# when the string is later split on whitespace.
all_reviews = ' '.join([positive_reviews_concat, negative_reviews_concat])

In [None]:
# Wrap the whitespace-split tokens in an NLTK Text object, which provides the
# exploration helpers used below (dispersion_plot, concordance, plot).
all_text = nltk.text.Text(all_reviews.split())

In [None]:
# NOTE: `top_features` is not defined by any code cell shown above; it is the
# keyword list you are asked to build in Exercise 1, so complete that first.
plt.figure(figsize=(12, 8))
all_text.dispersion_plot(top_features)

In [None]:
# Show occurrences of the word 'enjoyable' together with their surrounding context.
all_text.concordance('enjoyable')

In [None]:
# Frequency plot of the 30 most common tokens in the raw (unfiltered) corpus.
plt.figure(figsize=(10, 7))
all_text.plot(30)

In [None]:
# NLTK's list of English stop words.
stop_words = stopwords.words('english')

In [None]:
# Lower-case the whole corpus and split it on whitespace.
tokens = all_reviews.lower().split()

# Testing membership against a list rescans the list for every token; a set
# makes each lookup constant-time. `stop_words` itself is left untouched
# (still a list) because a later cell joins it into a string.
stop_word_set = set(stop_words)
clean_tokens = [t for t in tqdm_notebook(tokens) if t not in stop_word_set]

In [None]:
# NLTK Text wrapper built from the lower-cased tokens with stop words removed.
all_text_lower = nltk.text.Text(clean_tokens)

In [None]:
# Frequency plot of the 30 most common tokens after lower-casing and stop-word removal.
plt.figure(figsize=(10, 7))
all_text_lower.plot(30)

In [None]:
class LemmaTokenizer:
    """Callable that tokenizes a document with NLTK and lemmatizes every token."""

    def __init__(self):
        # A single WordNetLemmatizer is created once and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`.

        `doc` is split with `word_tokenize`; each resulting token is passed to
        the lemmatizer's `lemmatize` method with its default arguments.
        """
        raw_tokens = word_tokenize(doc)
        to_lemma = self.wnl.lemmatize
        return list(map(to_lemma, raw_tokens))

In [None]:
# Instantiate the tokenizer; the resulting object is called directly on a document string.
lemma_tokenizer = LemmaTokenizer()

In [None]:
# Run the stop-word list through the same lemmatizing tokenizer and keep each
# distinct resulting token once (np.unique also returns them sorted).
lemma_stop_words = list(np.unique(lemma_tokenizer(' '.join(stop_words))))

### Exercise 2: Improve the TFIDF vectorizer

Armed with the knowledge acquired in the text analysis, try to improve the configuration of the `TfidfVectorizer`. 

```python
vectorizer = TfidfVectorizer(# YOUR CODE HERE
)
```

- Things you could consider:
    - increasing the number of features
    - enforcing lowercase
    - filtering stop words
    - increasing the ngram range
    - using the `lemma_tokenizer` defined above
- Use the vectorizer to fit and transform the documents
- Re-train the `LogisticRegression` model
- Did the score improve?
- Print out 10 false positives and 10 false negatives and see if you can spot any pattern