In [None]:
# !pip install gensim
# !pip install wordcloud
# !pip install pyldavis

In [None]:
from collections import Counter # Count most common words
import gensim # word2vec model
import matplotlib.pyplot as plt
%matplotlib inline
import nltk # natural language toolkit
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews # another dataset 
import numpy as np 
import os
import pandas as pd
import pyLDAvis.sklearn # visualize our topic models!
import re # regular expressions
import seaborn as sns
# CV (multiple train/test splitting)
from sklearn.model_selection import cross_val_score, train_test_split
# Algorithms (unsupervised)
from sklearn.decomposition import LatentDirichletAllocation, PCA
# (supervised)
from sklearn.linear_model import LogisticRegression
# Tools to create our DTMs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Speed up your machine learning setup
from sklearn.pipeline import Pipeline
# Mix up our training and test sets
from sklearn.utils import shuffle
# Super awesome NLP library
import spacy
# Visualize word clouds 
from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

# Learning objectives

1. Fit an LDA topic model and visualize it
2. Fit a word2vec model and visualize it
3. Build a classifier
4. Learn a little about BERT!

# Topic modeling

There are many topic modeling algorithms, but we'll start with [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation). This is a standard **unsupervised** machine learning text-mining tool that can be used to discover abstract "topics" contained within texts.

Like the rest of this class, the goal is not to learn everything you need to know about topic modeling. Instead, this will provide you some starter code to run some simple models with the idea that you can use this base of knowledge to explore further. Use the sklearn help files, Stack Overflow, and Google searching to review and learn more about what the code is doing and how to go further. 

Can you make this code work for your own data? Can you tweak the parameters to get better output?

# Create a dataframe from individual text files

You've gathered a bunch of text files, so now what? It is useful to get these files into a dataframe. Python does not make this terribly easy for the beginner, so use the boilerplate code below to help you.

Let's concatenate the eleven text files in the "Data/human-rights/" folder into a dataframe so we can manipulate that text like we have seen in the previous few notebooks.

In [None]:
# Where am I? Show the current working directory - the relative path used
# below ("../../Data/human-rights/") is resolved against it
%pwd

In [None]:
# Folder that holds the text files:
# go two directories up (../../), into the Data directory,
# then into the human-rights subdirectory.
# Despite its name, `dir_path` holds the list of file NAMES found in that
# folder (the read loop in the next cell iterates over it), not the path.
# - sorted(): os.listdir() returns entries in arbitrary order, so sort them
#   to get the same row order on every machine and on every run.
# - Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-directories
#   are skipped because they cannot be read as text documents.
dir_path = sorted(
    entry for entry in os.listdir("../../Data/human-rights/")
    if not entry.startswith(".")
    and os.path.isfile(os.path.join("../../Data/human-rights/", entry))
)

# View the contents of this directory
dir_path

In [None]:
# Map each file name to the full text of that file
for_dataframe = {}
for file_name in dir_path:
    file_path = "../../Data/human-rights/" + file_name
    with open(file_path, "r", encoding="utf-8") as handle:
        for_dataframe[file_name] = handle.read()

# Build the dataframe: file names start out as the index, get moved into a
# regular column, and the two columns are then named "File" and "Text"
# (index = str turns the new row labels into strings)
human_rights = (
    pd.DataFrame.from_dict(for_dataframe, orient = "index")
    .reset_index()
    .rename(index = str, columns = {"index": "File", 0: "Text"})
)

In [None]:
# Display the assembled dataframe: one row per file, columns "File" and "Text"
human_rights

# Review - manipulate and explore text

In [None]:
# Sanity check: pull the text of the first row out as a one-element list
# to make sure it looks okay...
human_rights["Text"].iloc[0:1].tolist()

# Basic preprocessing

Preprocess the text! What else might you want to do that is not included here? Lemmatization? Spacy stuff? 

In [None]:
# Remove punctuation (only commas, periods, exclamation and question marks).
# The pattern is a raw string so that it reaches the regex engine unchanged
# and Python never has to guess at backslash escapes.
human_rights["Text_processed"] = human_rights["Text"].map(lambda x: re.sub(r"[,.!?]", "", x))

# Convert to lowercase
human_rights["Text_processed"] = human_rights["Text_processed"].str.lower()

# Remove digits.
# regex = True is required: since pandas 2.0, .str.replace() treats the
# pattern as a literal string by default, so "\d+" would match nothing and
# the digits would silently stay in the text.
human_rights["Text_processed"] = human_rights["Text_processed"].str.replace(r"\d+", "", regex = True)

# Punctuation and digits are gone! ... ?
list(human_rights[0:1]["Text_processed"])

In [None]:
# Display the dataframe again - it now also has the "Text_processed" column
human_rights

In [None]:
# Save the "Text_processed" column as one long string.
# Documents are joined with a space rather than a comma: the text is later
# split on whitespace, and a comma separator would stay attached to a word at
# every document boundary (producing tokens such as ",word").
long_string = " ".join(human_rights["Text_processed"])
long_string

# A few more preprocessing steps...

In [None]:
# Tokenize long_string on whitespace
hr_tokens = long_string.split()

# Remove stopwords.
# The stop word list is loaded once and copied into a set, so each token costs
# a single constant-time lookup instead of a fresh stopwords.words() call
# followed by a scan of the whole list.
stop = stopwords.words("english")
stop_lookup = set(stop)
no_stops = [word for word in hr_tokens if word not in stop_lookup]
freq_hr = Counter(no_stops)

# Tabulate the 20 most common remaining words
hr_df = pd.DataFrame(freq_hr.most_common(20), columns = ["Word", "Frequency"])
hr_df

# Define a BOW model

In [None]:
# Define an empty bag (of words) and, in the same statement, use .fit() to
# tokenize the processed text and learn the vocabulary
# (.fit() hands back the fitted vectorizer itself)
vectorizer = CountVectorizer().fit(human_rights["Text_processed"])

# Show the learned vocabulary
vectorizer.vocabulary_

# Create the DTM

Recall that a [document term matrix](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) displays term frequencies or TFIDF scores that occur across a collection of documents. We want to encode the documents into a [sparse matrix](https://sebastianraschka.com/faq/docs/bag-of-words-sparsity.html#:~:text=By%20definition%2C%20a%20sparse%20matrix,as%20a%20word%2Dcount%20vector.&text=Thus%2C%20if%20most%20of%20your,most%20likely%20sparse%20as%20well!) to represent the frequencies or TFIDF scores of each vocabulary word across the documents.

Again, the column headers could read **(document number, term)   frequency**

In [None]:
# Encode the documents with the fitted vectorizer, then print - one per line -
# the encoded matrix, its shape and its type
vector = vectorizer.transform(human_rights["Text_processed"])
print(vector, vector.shape, type(vector), sep = "\n")

In [None]:
# View as a multidimensional array before converting to data frame
# (.toarray() gives the dense form of the encoded matrix)
# Rows are the documents
# Columns are the terms
print(vector.toarray())

In [None]:
# What are the terms?
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() - confirm the installed
# version before upgrading.
vectorizer.get_feature_names()

# Define a bigram bag of words

In [None]:
# Bag of words over unigrams AND bigrams; stop word removal and the token
# pattern are handed to the vectorizer as arguments.
# What other processing steps could you include here
# ... instead of doing them manually above?
bigram_vectorizer = CountVectorizer(
    token_pattern = r'\b\w+\b',
    stop_words = "english",
    ngram_range = (1,2),
    min_df = 1,
)
bigram_vectorizer

In [None]:
# Analyze long_string in the bigram bag of words:
# build_analyzer() returns the callable the vectorizer uses to turn raw text
# into terms; calling it on long_string yields the terms it extracts
# (unigrams and bigrams, given ngram_range = (1,2) above)
analyze = bigram_vectorizer.build_analyzer()
vocab = analyze(long_string)

In [None]:
# Count every term produced by the analyzer and tabulate the 20 most common
freq = Counter(vocab)
stop_df = pd.DataFrame.from_records(freq.most_common(20),
                                    columns = ["Word", "Frequency"])
stop_df

In [None]:
# Configure the word cloud and process long_string in one chained call
# (.generate() hands back the WordCloud object itself)
cloud = WordCloud(width = 600, height = 300,
                  background_color = "black",
                  max_words = 20,
                  contour_width = 5,
                  random_state = 5).generate(long_string)

# Visualize!
cloud.to_image()

Learn about using [custom colors here](https://amueller.github.io/word_cloud/auto_examples/a_new_hope.html)

In [None]:
# Horizontal bar chart of the 20 most common terms stored in stop_df:
# frequency on the x axis, one bar per word on the y axis
hr_barplot = sns.barplot(data = stop_df,
                         x = "Frequency",
                         y = "Word",
                         orient = "h")

# Finally! Fit the topic model

The input to LDA should be a DTM!

In [None]:
# How many topics? (passed to LatentDirichletAllocation as n_components below)
# The commented-out line is an alternative value to experiment with
n_topics = 5
# n_topics = 20

In [None]:
# TfidfVectorizer to create the DTM
# max_df = 0.90      -> ignore terms found in more than 90% of the documents
# max_features = 500 -> keep at most the 500 most frequent remaining terms
tfidf_vectorizer = TfidfVectorizer(max_df = 0.90,
                                   max_features = 500,
                                   stop_words = "english")

# Fit on the documents (one row per text file), NOT on hr_tokens:
# hr_tokens is a flat list of single words, so every word would be treated as
# its own one-word "document" and the resulting matrix would carry no
# information about which words occur together in a text.
tfidf = tfidf_vectorizer.fit_transform(human_rights["Text_processed"])

[Check out this question](https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer) to learn more about the `max_df` and `min_df` arguments. 

In [None]:
# Instantiate our LDA model and fit it to the DTM in one chained call
# (.fit() hands back the fitted model itself)
lda = LatentDirichletAllocation(random_state = 5,
                                n_components = n_topics,
                                max_iter = 20).fit(tfidf)

Below is a function to print out the top words for each topic in a pretty way:

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated one row per topic; each row must support ``.argsort()``
        and holds one weight per term.
    feature_names : sequence of str
        Maps a column index of ``components_`` to its term.
    n_top_words : int
        Number of words to print per topic.

    Returns
    -------
    None. For each topic a blank line, a ``Topic #k:`` header and one line of
    space-separated words are printed; a final blank line closes the output.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #{}:".format(topic_idx))
        # argsort() is ascending, so walk it backwards from the end to get
        # the n_top_words largest weights, highest first
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))
    # Closing blank line. It is part of the function body, so it is printed
    # on every call rather than once when the function is defined.
    print()

In [None]:
# Return the topics: look up the terms of the tf-idf vectorizer, then print
# the 20 highest-weighted terms of every LDA topic
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() - confirm the installed
# version before upgrading.
tf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)

In [None]:
# Prepare the interactive pyLDAvis panel from the fitted LDA model, the DTM
# and the vectorizer that produced the DTM, then display it in the notebook.
# NOTE(review): mds = "tsne" presumably selects t-SNE for placing the topics
# in the left-hand plot - confirm against the pyLDAvis documentation.
panel = pyLDAvis.sklearn.prepare(lda_model = lda, 
                                 dtm = tfidf, 
                                 vectorizer = tfidf_vectorizer, 
                                 mds = "tsne")
pyLDAvis.display(panel)

# Challenge 1

1. What is a topic? 
2. What is a "salient" term?
3. What is the relevance metric lambda?

4. What do you know about the eleven human rights documents we used to do this exercise? 
5. Why are all these topics similar in size in the left plot?
6. Why are the overall term frequencies (blue bar) and estimated term frequency, within the selected topic (red bar) similar in the right plot? 
7. Plug in your own data! You might see more distinct topics given the nature of these human rights documents. Why? 

# Challenge 2

Read up on LDA and its visualizations by clicking the below links:
- https://www.objectorientedsubject.net/2018/08/experiments-on-topic-modeling-pyldavis/
- http://www.cs.columbia.edu/~blei/papers/ChaneyBlei2012.pdf
- https://shravan-kuchkula.github.io/topic-modeling/#lda-results
- https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html
- http://vis.stanford.edu/files/2012-Termite-AVI.pdf

# word2vec

The word2vec family of algorithms use shallow neural networks to produce word embeddings, or ways to represent similar words similarly as numbers. We will explore neural network architecture in notebook 5-2. 

In [None]:
# First, copy the documents we want to explore into a separate dataframe
# with a single column called "Processed"
w2v_df = human_rights[["Text_processed"]].rename(columns = {"Text_processed": "Processed"})
w2v_df

In [None]:
# Split the text of every row on whitespace.
# The result is a list of lists - one inner list of tokens per document
split_rows = list(map(str.split, w2v_df['Processed']))
split_rows

In [None]:
# Define the word2vec model and train it on the tokenized documents.
# Parameter meanings below are as described in the gensim documentation:
#   min_count = 2 -> ignore words that occur fewer than 2 times
#   size = 12     -> length of each word vector
#   workers = 3   -> number of worker threads used for training
#   window = 3    -> maximum distance between a word and its context words
#   sg = 1        -> skip-gram training (0 would be CBOW)
# NOTE(review): `size` is the gensim 3.x name; gensim 4 renamed it to
# `vector_size` - confirm the installed version before upgrading.
model = gensim.models.Word2Vec(split_rows, 
                               min_count = 2,
                               size = 12, 
                               workers = 3, 
                               window = 3, 
                               sg = 1)

In [None]:
# Save the vocabulary of the trained model as a plain list of words
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 replaced it
# with `wv.key_to_index` / `wv.index_to_key` - confirm the installed version.
words = list(model.wv.vocab)

In [None]:
# Look up the vector learned for "human".
# The lookup goes through model.wv (the trained word vectors), like the
# most_similar() calls in this notebook; indexing the model object directly
# is deprecated in gensim 3.x and no longer available in gensim 4.
model.wv["human"]

In [None]:
# compare! Similarity of a word with itself - the baseline for the comparisons
# that follow. Called on model.wv: the model-level similarity() shortcut is
# deprecated in gensim 3.x and removed in gensim 4.
model.wv.similarity("the", "the")

In [None]:
# Similarity between "human" and "rights"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("human", "rights")

In [None]:
# Similarity between "human" and "the"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("human", "the")

In [None]:
# Similarity between "human" and "law"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("human", "law")

In [None]:
# Similarity between "country" and "law"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("country", "law")

In [None]:
# Similarity between "justice" and "law"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("justice", "law")

In [None]:
# Similarity between "international" and "law"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("international", "law")

In [None]:
# Similarity between "united" and "nations"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("united", "nations")

In [None]:
# Adjectives: which words does the model place closest to "human"?
model.wv.most_similar(positive = "human")

In [None]:
# Keep the 10 words most similar to "rights" and check what kind of object
# most_similar() hands back
example = model.wv.most_similar(positive = "rights", topn=10)
print(type(example))

In [None]:
# Turn the (word, similarity) pairs stored in `example` into a dataframe.
# pandas is imported as `pd`, so the constructor is pd.DataFrame, and the
# data is the `example` result of the most_similar() call for "rights".
df = pd.DataFrame(example, columns = ["Word", "Similarity"])
df

In [None]:
# Which words does the model place closest to "war"?
model.wv.most_similar(positive = "war")

In [None]:
# Similarity between "peace" and "human"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("peace", "human")

In [None]:
# Similarity between "war" and "human"
# (called on model.wv; the model-level similarity() shortcut is deprecated)
model.wv.similarity("war", "human")

# Plot words with PCA

[Principal component analysis](https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60) and related dimension-reduction algorithms are an excellent way to visualize multivariate data in reduced dimensional space - such as a 2D scatterplot. 

In [None]:
# Save the word2vec vectors of every vocabulary word as one array
# (one row per word, in the iteration order of model.wv.vocab).
# The lookup goes through model.wv; indexing the model object directly is
# deprecated in gensim 3.x and no longer available in gensim 4.
features = model.wv[model.wv.vocab]

In [None]:
# Define parameters of our PCA

# Just look at the first two dimensions - the X and Y axes
for_pca = PCA(n_components = 2)
# Fit the PCA on the word vectors and project them onto those two dimensions;
# the scatterplots below read column 0 as X and column 1 as Y
pca_out = for_pca.fit_transform(features)

In [None]:
# Plot! First PCA dimension on the X axis, second on the Y axis
plt.scatter(pca_out[:, 0], pca_out[:, 1])

# Annotate text labels

What if we want to label points with just certain words? 

In [None]:
# Scatter every word, but write a text label next to the first 20 only
plt.figure(figsize = (8,6))
plt.scatter(pca_out[:, 0], pca_out[:, 1])
words = list(model.wv.vocab)
# Row i of pca_out is paired with words[i], so walk both side by side
# over the first 20 entries [0:20]
for word, point in zip(words[0:20], pca_out[0:20]):
    plt.annotate(word, size = 20, xy = (point[0], point[1]))
plt.show()

In [None]:
# Or, plot AND label just the first 20 words?
plt.figure(figsize = (8,6))
plt.scatter(pca_out[0:20, 0], pca_out[0:20, 1])
words = list(model.wv.vocab)
# Pair each of the first 20 words with its (x, y) row in pca_out
for word, (x_pos, y_pos) in zip(words[0:20], pca_out[0:20]):
    plt.annotate(word, size = 20, xy=(x_pos, y_pos))
plt.show()

https://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.XuxYm2pKjOQ

https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

https://www.datacamp.com/community/blog/spacy-cheatsheet

https://code.google.com/archive/p/word2vec/

# Classification of Textual Data

How can we translate this simple model of binary classification to text? Let's look at a corpus from `nltk` and build your own classifier using sklearn's machine learning `Pipeline`

In [None]:
# Download the nltk built-in movie reviews dataset
# (the `movie_reviews` corpus imported at the top of the notebook reads it)
nltk.download("movie_reviews")

As you might expect, this is a corpus of IMDB movie reviews. Someone went through and read each review, labeling it as either "positive" or "negative". The task we have before us is to create a model that can accurately predict whether a never-before-seen review is positive or negative. 

From the `movie_reviews` object let's take out the reviews and the judgement:

In [None]:
# Extract our x (reviews) and y (judgements) variables:
# the raw text of every file, and the first category listed for every file
reviews = list(map(movie_reviews.raw, movie_reviews.fileids()))
judgements = [labels[0] for labels in map(movie_reviews.categories, movie_reviews.fileids())]

In [None]:
# Save in a dataframe: one row per review, paired with its judgement
movies = pd.DataFrame(list(zip(reviews, judgements)),
                      columns = ["Reviews", "Judgements"])

In [None]:
# Peek at the first rows: "Reviews" holds the text, "Judgements" the label
movies.head()

In [None]:
# We have 2000 movie reviews
movies.shape

Let's look at the first review and its judgement:

In [None]:
# Show the judgement of the review in row 0, then a blank line followed by
# the first 500 characters of that review
print("The human annotator's review was:", movies.loc[0, "Judgements"])
print("", movies.loc[0, "Reviews"][:500], sep = "\n")

So right now we have a dataframe with the movie reviews in the `Reviews` column and their corresponding judgements in the `Judgements` column. Awesome. What does this sound like to you? Independent and dependent variables? You'd be right!

`Reviews` is our x variable. `Judgements` is our y variable. Let's first reassign x and y for simplicity. While we're at it, we're going to set the random_state for our computer. Remember that this makes our result reproducible. We'll also `shuffle` so that we randomize the order of our observations, and when we split the testing and training data it won't be in a biased order. However, start learning about [stratified sampling](https://en.wikipedia.org/wiki/Stratified_sampling) and when you should use it instead of `shuffle`!

In [None]:
# Reassign to x / y and shuffle both arrays together so every review keeps
# its own judgement; random_state = 1 makes the order reproducible
x, y = shuffle(movies["Reviews"].to_numpy(),
               movies["Judgements"].to_numpy(),
               random_state = 1)

If you don't believe me that all we did is reassign and shuffle:

In [None]:
# Show the label of the first shuffled review, then the review itself.
# Both are printed explicitly rather than wrapping print() in a tuple, whose
# second element would be print's return value (None) in the cell output.
print("Human annotator's review was:", y[0])
print(x[0])

To get meaningful independent variables (words) we have to do some processing too (think DTM!). With `sklearn`'s text pipelines, we can quickly build a text classifier in only a few lines of Python: 

# Cross-validated pipeline

Remember training/test splitting? Let's do this `cv = 20` times! https://en.wikipedia.org/wiki/Cross-validation_(statistics)

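We should reasonably expect this to give a more reliable estimate of performance than a single training/test split (see below), because the score is averaged over 20 different splits rather than depending on one.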

In [None]:
# Cross-validated model!
# Three steps: unigram + bigram counts -> tf-idf weighting -> L2-penalized
# logistic regression
text_clf = Pipeline(steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(C = 1000, penalty = "l2", random_state = 0)),
])

# One score per fold (cv = 20 folds), printed together with their mean
scores = cross_val_score(text_clf, x, y, cv = 20)
print(scores, scores.mean())

***Whoa! What just happened?!?*** The pipeline tells us three things happened:

1. `CountVectorizer`

2. `TfidfTransformer`

3. `LogisticRegression`

Let's walk through this step by step.

1. `CountVectorizer` does the same as before. It quickly normalizes the texts into words, and then simply counts how often each word in the corpus occurs in each document. The feature array for each document at this point has one entry for every unique word in the corpus, holding the count of that word in the document. This is the most basic way to provide features for a classifier - a document term matrix!

2. Remember that tfidf (term frequency inverse document frequency) is an algorithm that aims to find words that are important to specific documents. It does this by taking the term frequency (tf) for a specific term in a specific document, and multiplying it by the term's inverse document frequency (idf), which is based on the total number of documents divided by the number of documents that contain the term at least once (in practice the logarithm of this ratio is used). `TfidfTransformer` transforms the count matrix produced by `CountVectorizer` into a tf-idf representation. 

A tfidf value is calculated for each term for each document. The feature array for a document is now made up of its tfidf values. 

> Remember! The tfidf matrix is similar to our document term matrix, only now the values have been weighted according to their distribution across documents.

The pipeline now sends these tfidf feature arrays to 

3. `LogisticRegression`, what we learned from notebook 4-3. We add in an l2 penalization parameter because we have many more independent variables from our `dtm` than observations. 

Check out the [pipeline documentation here](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

The code below breaks this down by each step, but combines the `CountVectorizer` and `TfidfTransformer` in the `TfidfVectorizer`.

In [None]:
# Standard training/test split (no cross validation): 30% held out for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 0)

# get tfidf values
# The vocabulary and idf weights are learned from the TRAINING reviews only
# and then applied unchanged to the test reviews. Fitting the vectorizer on
# all of x would let information about the test set leak into the features.
# (`tfidf` is rebound here: from this cell on it is the fitted vectorizer.)
tfidf = TfidfVectorizer()
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

# build and test logit
# (`model` is rebound here: from this cell on it is the fitted classifier.)
logit_class = LogisticRegression(random_state = 0, penalty = "l2", C = 1000)
model = logit_class.fit(x_train, y_train)
# Mean accuracy on the held-out test reviews
model.score(x_test, y_test)

# Important Features

After we train the model we can then index the tfidf matrix for the words with the most significant coefficients (remember independent variables!) to get the most helpful features:

In [None]:
# Terms of the tf-idf vectorizer; position j is looked up for coefficient j
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() - confirm the installed
# version before upgrading.
feature_names = tfidf.get_feature_names()

# argsort is ascending: the last 25 positions hold the largest coefficients,
# the first 25 positions the smallest ones
top25pos = np.argsort(model.coef_[0])[-25:]
top25neg = np.argsort(model.coef_[0])[:25]

print("Top features for positive reviews:")
print([feature_names[j] for j in top25pos])
print()
print("Top features for negative reviews:")
print([feature_names[j] for j in top25neg])

# Prediction

We can also use our model to classify new reviews, all we have to do is extract the tfidf features from the raw text and send them to the model as our features (independent variables):

In [None]:
# A made-up negative review the model has never seen
new_bad_review = "This was the most awful worst super bad movie ever!"

# Extract its tf-idf features with the already fitted vectorizer
# (transform() takes a list of documents, hence the one-element list)
features = tfidf.transform([new_bad_review])

# Ask the classifier for its predicted label
model.predict(features)

In [None]:
# A made-up positive review the model has never seen
new_good_review = "This movie was great, awesome and good!"

# Extract its tf-idf features with the already fitted vectorizer
# (transform() takes a list of documents, hence the one-element list)
features = tfidf.transform([new_good_review])

# Ask the classifier for its predicted label
model.predict(features)

# Bidirectional Encoder Representations from Transformers (BERT)

Want to go really crazy? The [BERT algorithmic family](https://www.blog.google/products/search/search-language-understanding-bert/) is the way to go!

https://github.com/google-research/bert  
https://github.com/google-research/bert#pre-trained-models  
http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/