# 1. Import

In [None]:
from __future__ import division
from collections import Counter
## for data
import json
import pandas as pd
import numpy as np
from sklearn import metrics, manifold

## for pre-processing
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords


## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for language detection
import langdetect
import spacy
from spacy_langdetect import LanguageDetector

## for w2v
import gensim
import gensim.downloader as gensim_api

## for bert
import transformers
from transformers import DistilBertTokenizer, DistilBertModel
import torch
  

## for predicting
from sklearn.metrics.pairwise import cosine_similarity

import re, nltk
import my_functions as func


In [None]:
# Load the tweet dataset from a CSV file in the working directory and display it.
df = pd.read_csv('tweetBERT.csv')
df

In [None]:
# Project helper from my_functions; it receives the frame and its row count
# (presumably the number of tweets to print - confirm in my_functions).
func.print_tweet(df, len(df))

# 2. Clean

In [None]:
# from unidecode import unidecode

def clean_BERT(text, isSentenceEmbed = True):
    '''Clean one raw tweet and return it as a single normalised string.

    Method inspired by eliorc's github repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Parameters
    ----------
    text : str
        Raw tweet text.
    isSentenceEmbed : bool, default True
        True  -> sentence-embedding mode, the hashtag symbol is kept.
        False -> word-embedding mode, the hashtag symbol is replaced by a space.

    Returns
    -------
    str
        The cleaned text. "+" becomes " plus ", commas, full stops and
        apostrophes become spaces, "!", "?" and ":" are padded with spaces,
        and runs of whitespace are collapsed to one space. A single leading
        or trailing space may remain; the result is not stripped.
    '''
    if isSentenceEmbed:
        # sentence embedding, we need hashtag
        # NOTE(review): inside the character class "!-?" is a RANGE (0x21-0x3F),
        # so besides "#" and "-" this mode also keeps " $ & ( ) * : ; < = >.
        # The pattern is left untouched so already-cleaned data stays comparable;
        # confirm whether only a literal hyphen was intended.
        disallowed = r"[^A-Za-z0-9^,!-?%.\/#'+]"
    else:
        # word embedding remove hashtag symbol
        disallowed = r"[^A-Za-z0-9^,!?%.\/'+]"
    text = re.sub(disallowed, " ", text)

    # Applied in order; the whitespace collapse must stay last because the
    # earlier replacements introduce extra spaces.
    substitutions = (
        (r"\+", " plus "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\?", " ? "),
        (r"'", " "),
        (r":", " : "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text



In [None]:
# Sentence-embedding cleaning (hashtags kept) applied to every tweet in df.
df['tweet'] = df['tweet'].apply(lambda raw_tweet: clean_BERT(raw_tweet, True))

In [None]:
# create new df that we need to create our word dictionary
# Slice first, then copy: only the 100 rows we keep are duplicated, and the
# result is an independent frame that is safe to assign into.
df_word = df.iloc[0:100].copy()
df_word = func.remove_end_hashtag(df_word)
# Clean the subset's own tweet column. Reading from df.tweet here would throw
# away whatever remove_end_hashtag changed and would clean the whole corpus
# just to keep 100 rows.
df_word['tweet'] = df_word['tweet'].apply(lambda x: clean_BERT(x, False))


In [None]:
# Same project helper as above, now on the word-level subset and its row count
# (presumably the number of tweets to print - confirm in my_functions).
func.print_tweet(df_word,len(df_word))

In [None]:
# Keep only the first occurrence of every distinct tweet text.
df_word = df_word.drop_duplicates(subset='tweet')

In [None]:
# Renumber rows 0..n-1 and discard the old index.
df_word = df_word.reset_index(drop=True)


In [None]:
# Display the word-level subset.
df_word

# 3. BERT

<span style='background-color:Teal'>Create word dictionary</span>

https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca<br>
https://medium.com/analytics-vidhya/bert-word-embeddings-deep-dive-32f6214f02bf<br>
https://dzlab.github.io/dltips/en/tensorflow/create-bert-vocab/<br>

# 4. Distil BERT

## 4.0 Load and Test

In [None]:
from transformers import DistilBertTokenizerFast

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = 'distilbert-base-uncased'

# 1. tokenizer - the Fast variant is used to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained(
    checkpoint,
    pad_token='[PAD]',
    model_max_length=80,
)

# 2. word embedder (encoder), configured to also output its hidden states
model = transformers.TFDistilBertModel.from_pretrained(
    checkpoint,
    output_hidden_states=True,
)



In [None]:
# We want to convert our id into 2D numpy array by using [None,:]
# labels=np.array([1,3,5])
# print('2D array rows: \n',labels[None,:])
# print('2D array cols: \n',labels[:,None])

In [None]:
txt = 'Dear Line Managers Appraisal your subordinate based on their Job performance and not sentiment blood-line religious group or tribe'

# Step 1: text -> token ids
ids = tokenizer.encode(txt, padding = True)
print("\nIDs   :\n", ids)

# Step 2: ids -> token strings (includes the special tokens added by encode)
tokens = tokenizer.convert_ids_to_tokens(ids)
print("\nTokens with special:\n", tokens)

# Step 3: show each token next to its id
for token, token_id in zip(tokens, ids):
    print('{:<12} {:>6,}'.format(token, token_id))


In [None]:
# We will convert our sentence to 1 vector of 768D and store the result vector to our list
# embedded_text_list = []
# for i in range(5):
#     text = df.iloc[i].tweet
    
#     #ids
#     ids = tokenizer.encode(text)
    
#     #tokens
#     tokens = tokenizer.convert_ids_to_tokens(ids)

#     #array
#     ids_arr = np.array(ids)[None,:]
    
#     #embed
#     embedding = model(ids_arr)
    
#     embedded_text_list.append(embedding)
# # we have 5 samples x 768 

# # total number of tweet
# print(len(embedded_text_list))

# # shape of each embedded tweet
# print(embedded_text_list[1][0].shape)



# 4.1 Tokenize

In [None]:
# 1. get max sentence length
# NOTE: .str.len() counts characters per tweet, so the value shown here is the
# longest tweet in characters, not in tokens.
# The disabled loop below measured the longest id list instead; it needs
# ids_np, which is only created in a later cell.
# max_len = 0
# for i in ids_np.values:
#     if len(i) > max_len:
#         max_len = len(i)

# max_len
df_word.tweet.str.len().max()

## To ID

In [None]:
# lets try a small subset first
# Encode every tweet into a fixed-length list of token ids.
# - padding='max_length' replaces the deprecated pad_to_max_length=True.
# - truncation=True cuts tweets that are too long, so every row has the same
#   length and the lists can later be stacked into a 2-D array.
# No max_length is passed, so both use the tokenizer's model_max_length.
ids_np = df_word.tweet.apply(lambda x: tokenizer.encode(x,
                                                        add_special_tokens=True,
                                                        padding='max_length',  # Add [PAD]s
                                                        truncation=True))
ids_np[0]

## To tokens words

In [None]:
# Derive the token strings from the ids computed above instead of tokenizing
# again with different settings (the previous max_length=75 did not match the
# id lists), so tokens[i] always corresponds to token_ids[i] for every tweet.
tokens_np = ids_np.apply(tokenizer.convert_ids_to_tokens)
tokens_np[0]


# 4.2 Save IDs and Tokens as DF

In [None]:
# One-column frame of the token lists. rename() returns a new frame, so its
# result has to be assigned for the new column name to take effect.
tokens_df = pd.DataFrame(tokens_np).rename(columns = {'tweet': 'tweet_tokens'})
# Assign the Series itself (aligned on the index) rather than a DataFrame.
df_word['tokens'] = tokens_np
# .copy() makes the 100-row subset an independent frame, so adding columns to
# it later does not trigger SettingWithCopyWarning.
df_word = df_word.iloc[0:100].copy()
df_word

In [None]:
# Store the id lists next to their tweets; Series assignment aligns on the index.
# (The former placeholder column sized with len(df) is dropped: it was
# overwritten immediately and its length did not match df_word.)
df_word['token_ids'] = ids_np

id_list = df_word['token_ids'].tolist()

# `id_np` is read by this and the following cells but was never assigned in the
# notebook. Build it here as a 2-D array (one row per tweet); this requires all
# id lists to have the same length.
id_np = np.array(id_list)

df_word

# 4.3 Masking

<span style ='background-color:Teal'>If we send the padded input directly to BERT, the padding would slightly confuse it. We need another variable that tells the model to ignore (mask) the padding we added when it processes its input. That is what attention_mask is for.</span>

In [None]:
# Positions whose id is non-zero get mask value 1; id 0 (the padding) gets 0.
is_real_token = id_np != 0
attention_mask = np.where(is_real_token, 1, 0)
attention_mask.shape

# 4.4 Modelling

In [None]:
# 1. convert 'padded ids' and 'attention mask' to tensors
# input_ids = torch.tensor(padded)  
# attention_mask1 = torch.tensor(attention_mask)

In [None]:
# 2. get embedded last hidden states
# Calls the encoder on the id array together with the padding mask.
# NOTE(review): the name suggests a tensor, but later cells index the result
# with [0] before picking a tweet, so this holds the whole model output -
# confirm and consider renaming.
last_hidden_states = model(id_np, attention_mask=attention_mask, )

In [None]:
# Display the raw object returned by the model call.
last_hidden_states

# 4.5 Get embedding for each word

In [None]:
# For one tweet, go from the matrix of all its token vectors down to one vector.
# The cell used to print the same shape twice; it now shows both levels.
first_tweet_embeddings = last_hidden_states[0][0]
token_position = 73  # position inspected by the original cell

# all token vectors of the first tweet
print(first_tweet_embeddings.shape)
# one token vector (expected to be 768-dimensional - TODO confirm)
print(first_tweet_embeddings[token_position].shape)

# 5. Clustering

<span style='background-color:Teal'>We want documents with similar sentiments to be clustered together so that we can find the topics within these clusters. Before doing so, we first need to lower the dimensionality of the embeddings, as many clustering algorithms handle high dimensionality poorly.</span>

## 5.1 Reduce Dimensionality using UMAP

<span style="background-color:Teal">Out of the few dimensionality reduction algorithms, UMAP is arguably the best performing as it keeps a significant portion of the high-dimensional local structure in lower dimensionality.</span>

In [None]:
import umap
# NOTE(review): `embedded_text_list` is only assigned inside a commented-out
# cell earlier in this notebook, so this loop raises NameError on a fresh
# kernel. Define it (or re-enable that cell) before running this one.
umap_tweet_list = []
# A separate UMAP model is fitted for every element of the list; each element
# is reduced to 5 components using the cosine metric.
for embedded_tweet in embedded_text_list:
    umap_tweet = umap.UMAP(n_neighbors=15,
                           n_components=5,
                           metric='cosine').fit_transform(embedded_tweet)
    umap_tweet_list.append(umap_tweet)

In [None]:
# total number of tweets
print(len(umap_tweet_list))

# shape of one reduced tweet
# NOTE(review): 6268 is a hard-coded position; this raises IndexError unless
# the list holds at least 6269 entries.
print(umap_tweet_list[6268].shape)



## 5.2 Clustering: K-Means?
Question: I see HDBSCAN used here; can I just use K-Means instead?

<span style="background-color:Teal">After having reduced the dimensionality of the document embeddings to 5, we can cluster the documents with HDBSCAN. HDBSCAN is a density-based algorithm that works quite well with UMAP, since UMAP maintains a lot of local structure even in lower-dimensional space. Moreover, HDBSCAN does not force every data point into a cluster; points that fit no cluster are treated as outliers.</span>

In [None]:
# !pip install hdbscan --no-build-isolation --no-binary :all:

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit a 2-cluster K-Means on `word_vectors.vectors` and keep both centres.
# NOTE(review): `word_vectors` is not defined anywhere in the visible notebook;
# on a fresh kernel this line raises NameError.
# NOTE(review): this rebinds `model`, which earlier held the DistilBERT
# encoder, so re-running the embedding cells after this one will fail.
# NOTE(review): random_state receives a bool; presumably an explicit integer
# seed was intended - confirm.
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
# Centre 0 is labelled positive and centre 1 negative purely by position;
# nothing in this cell checks which cluster is which.
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]