# 1. Import

In [None]:
from __future__ import division
from collections import Counter
## for data
import json
import pandas as pd
import numpy as np
from sklearn import metrics, manifold

## for pre-processing
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords


## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for language detection
import langdetect
import spacy
from spacy_langdetect import LanguageDetector

## for w2v
import gensim
import gensim.downloader as gensim_api

## for bert
import transformers
from transformers import DistilBertTokenizer, DistilBertModel
import torch
  

## for predicting
from sklearn.metrics.pairwise import cosine_similarity

import re, nltk
import my_functions as func


In [None]:
df = pd.read_csv('tweetBERT.csv')
df

In [None]:
func.print_tweet(df, len(df))

# 2. Clean

In [None]:
# from unidecode import unidecode

def clean_BERT(text, isSentenceEmbed = True):
    """Normalise a raw tweet before it is handed to the BERT tokenizer.

    Method inspired by eliorc's MaLSTM notebook:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Args:
        text: Raw tweet text (a ``str``).
        isSentenceEmbed: When truthy (sentence embedding) the hyphen, the
            colon and the hashtag symbol are kept. Otherwise (word
            embedding) they are replaced by a space like every other
            character outside the whitelist.

    Returns:
        The cleaned text as a single string. Runs of whitespace are
        collapsed to one space; leading/trailing spaces are not stripped.
    """
    if isSentenceEmbed:
        # The hyphen is escaped on purpose: an unescaped "!-?" is a character
        # *range* (0x21-0x3F) that also whitelists quotes, brackets, "$",
        # "&", "*", ";", "<", "=" and ">". The colon stays whitelisted
        # because it is handled explicitly below.
        text = re.sub(r"[^A-Za-z0-9^,!\-?%.\/#'+:]", " ", text)
    else:
        # Word embedding: same whitelist without "-", "#" and ":".
        text = re.sub(r"[^A-Za-z0-9^,!?%.\/'+]", " ", text)

    # Applied in order; the whitespace collapse must stay last.
    replacements = (
        (r"\+", " plus "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\?", " ? "),
        (r"'", " "),
        (r":", " : "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    return text



In [None]:
# Clean every tweet in sentence-embedding mode (clean_BERT's default branch).
df['tweet'] = df['tweet'].apply(clean_BERT, isSentenceEmbed=True)

In [None]:
# Create the frame used to build the word dictionary from the first 200 tweets.
# Slice first, then copy, so only those rows are duplicated and df_word owns
# its data.
df_word = df.iloc[:200].copy()
df_word = func.remove_end_hashtag(df_word)
# Clean df_word's own tweets in word-embedding mode. Reading from df here
# would overwrite the result of remove_end_hashtag above.
df_word['tweet'] = df_word['tweet'].apply(lambda x: clean_BERT(x, False))


In [None]:
func.print_tweet(df_word,len(df_word))

In [None]:
# Keep only the first occurrence of each distinct tweet text.
df_word = df_word.drop_duplicates(subset='tweet')

In [None]:
df_word.reset_index(drop = True, inplace = True)


In [None]:
df_word

# DistilBERT Word Embedding 

<span style='background-color:Teal'>Create the word dictionary</span>

https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca<br>
    https://medium.com/analytics-vidhya/bert-word-embeddings-deep-dive-32f6214f02bf<br>
https://dzlab.github.io/dltips/en/tensorflow/create-bert-vocab/<br>

# 3. Load and Test DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast

# 1. Instantiate the DistilBERT tokenizer (the "Fast" variant, chosen here to
#    optimize runtime) from the pretrained 'distilbert-base-uncased'
#    checkpoint, with '[PAD]' as padding token and a default maximum length
#    of 80.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', pad_token='[PAD]', model_max_length = 80)

# 2. Load the TensorFlow DistilBERT encoder from the same checkpoint.
#    output_hidden_states=True is requested at load time.
#    NOTE(review): presumably this makes the model return every layer's
#    hidden states in addition to the last one - confirm against the
#    transformers documentation.
model = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased',output_hidden_states=True)



In [None]:
txt = 'Dear Line Managers Appraisal your subordinate based on their Job performance and not sentiment blood-line religious group or tribe'

# 1. Encode to ids: add [CLS]/[SEP], truncate to 75 tokens and pad up to 75.
#    padding='max_length' replaces the deprecated pad_to_max_length=True.
ids = tokenizer.encode(txt, max_length=75, padding='max_length', add_special_tokens=True, truncation=True)
print("\nIDs   :\n", ids)

# 2. Map the ids back to tokens (special tokens included).
tokens = tokenizer.convert_ids_to_tokens(ids)
print("\nTokens with special:\n", tokens)

# 3. Display the tokens next to their ids.
for token, token_id in zip(tokens, ids):
    print('{:<12} {:>6,}'.format(token, token_id))


In [None]:
# We will convert our sentence to 1 vector of 768D and store the result vector to our list
# embedded_text_list = []
# for i in range(5):
#     text = df.iloc[i].tweet
    
#     #ids
#     ids = tokenizer.encode(text)
    
#     #tokens
#     tokens = tokenizer.convert_ids_to_tokens(ids)

#     #array
#     ids_arr = np.array(ids)[None,:]
    
#     #embed
#     embedding = model(ids_arr)
    
#     embedded_text_list.append(embedding)
# # we have 5 samples x 768 

# # total number of tweet
# print(len(embedded_text_list))

# # shape of each embedded tweet
# print(embedded_text_list[1][0].shape)



# 4. Tokenize and Encoding

## 4.1 Encoding and Tokenize Sample

<span style='background-color:Teal'>We first need the maximum tweet length (in tokens) so that every tweet can be padded to the same length.</span>

In [None]:
# Encode every tweet once and remember the length of the longest id
# sequence; later cells use max_len as the padding target.
ids_np = df_word['tweet'].apply(tokenizer.encode)
max_len = 0
for ids in ids_np.values:
    max_len = max(max_len, len(ids))


<span style='background-color:Teal'>We want the token IDs as a 2-D NumPy array with one row per tweet, each row padded to max_len:</span>

In [None]:
# Encode each tweet (special tokens included) into a plain list of ids.
id_ = df_word['tweet'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Right-pad every sequence to max_len with the tokenizer's own [PAD] id
# instead of a hard-coded 0, then stack into a (n_tweets, max_len) array.
pad_id = tokenizer.pad_token_id
idJ_np = np.array([seq + [pad_id] * (max_len - len(seq)) for seq in id_.values])

print('\nshape:\n', idJ_np.shape)
print(idJ_np[0])

In [None]:
# Attention mask: 1 where the id is a real token, 0 where it is padding.
# Compare against the tokenizer's [PAD] id rather than a hard-coded 0.
attention_maskJ = np.where(idJ_np != tokenizer.pad_token_id, 1, 0)
print('\nshape:\n',attention_maskJ.shape)
print(attention_maskJ[0])

## 4.2 Encoding to ID

In [None]:
# 1. Encoding
# Run encode_plus on every tweet; the keyword arguments below are forwarded
# unchanged to the tokenizer for each element of the Series.
ids_attn = df_word['tweet'].apply(
    tokenizer.encode_plus,
    add_special_tokens=True,
    max_length=max_len,       # maximum length of a sentence
    truncation=True,
    return_tensors='np',
    padding='max_length',     # Add [PAD]s
)
print(ids_attn)
# print(type(ids_attn[0]['input_ids']))
# print(ids_attn[0]['attention_mask'])

In [None]:
# 2. get IDs numpy
# Iterate over the encodings themselves rather than ids_attn[i]: the latter
# is a label lookup and raises KeyError when the index has gaps.
# Each 'input_ids' entry is indexed with [0] to take its single row.
input_ids_list = [encoding['input_ids'][0] for encoding in ids_attn]

input_ids_np = np.array(input_ids_list)



## 4.3 Encoding to Attention_mask

In [None]:
# get Attention_mask numpy
# Iterate over the encodings themselves rather than ids_attn[i]: the latter
# is a label lookup and raises KeyError when the index has gaps.
# Each 'attention_mask' entry is indexed with [0] to take its single row.
attention_mask_list = [encoding['attention_mask'][0] for encoding in ids_attn]

attention_mask_np = np.array(attention_mask_list)



In [None]:
# Sanity check on the first tweet: the last attended position should hold
# [SEP] and the position right after it (if there is one) should be [PAD].
# The positions are derived from the attention mask instead of being
# hard-coded, so the check also works for other data / other max_len values.
# NOTE(review): assumes padding is appended on the right - confirm the
# tokenizer's padding side.
print("input id's shape: ", input_ids_np.shape)
print("attention mask's shape: ", attention_mask_np.shape)

print('\n',input_ids_np[0])

sep_idx = int(attention_mask_np[0].sum()) - 1
check_positions = [sep_idx]
if sep_idx + 1 < input_ids_np.shape[1]:
    # Only present when the first tweet is shorter than max_len.
    check_positions.append(sep_idx + 1)

print()
for pos in check_positions:
    print(f'At index {pos} input id is: ', input_ids_np[0][pos])
print()
for pos in check_positions:
    print(f'At index {pos} attention is: ', attention_mask_np[0][pos])


## 4.4 Tokenize to words

In [None]:
# Tokenize every tweet into word pieces, with [CLS]/[SEP] added and [PAD]
# appended up to max_len.
# padding='max_length' replaces the deprecated pad_to_max_length=True;
# return_tensors is dropped because only the token strings are used here.
tokens_np = df_word['tweet'].apply(
    lambda x: tokenizer.tokenize(x,
                                 add_special_tokens=True,
                                 max_length=max_len,
                                 truncation=True,
                                 padding='max_length'))
tokens_np[0]


# 5. Save IDs, attention_mask, and tokens as DF

## 5.1 Save tokens in Df

In [None]:
# rename() returns a new frame, so keep its result; the original call
# discarded it and tokens_df kept the 'tweet' column name.
tokens_df = pd.DataFrame(tokens_np).rename(columns={'tweet': 'tweet_tokens'})
# Assign the Series itself (same index as df_word) rather than a
# one-column DataFrame.
df_word['tokens'] = tokens_np
df_word

## 5.2 Save Ids as Df

In [None]:
# Store each tweet's padded id array (length max_len) in df_word.
# No placeholder column is needed: assigning the list creates the column.
df_word['token_ids'] = input_ids_list

df_word

## 5.3 Save attention mask as DF

In [None]:
# Store each tweet's attention mask (length max_len) in df_word.
# No placeholder column is needed: assigning the list creates the column.
df_word['attention_mask'] = attention_mask_list

df_word

In [None]:
# print(df_word.iloc[3].tokens)
# print(df_word.iloc[3].attention_mask)

# 6. Modelling

In [None]:
# Run all padded tweets through DistilBERT in one call, passing the
# attention mask alongside the ids. Later cells read the embeddings as
# last_hidden_states[0][tweet, token, :].
# NOTE(review): the whole batch goes through the model at once; presumably
# fine for the ~200 tweets used here, but confirm memory use before scaling.
last_hidden_states = model(input_ids_np, attention_mask=attention_mask_np)

In [None]:
last_hidden_states

# 7. Check our last hidden states

![title](last_hidden.png)

<span style='background-color:Teal'>The first [0] selects the tensor <br>
the second [0] selects the tweet <br>
the third [0] selects the word <br>
the fourth [0] selects one of the 768 values of that word's embedding <br></span>

In [None]:
# Let's look at the embedding of the last position of the 1st and 2nd tweet
# (a [PAD] unless the tweet is as long as max_len). Index -1 is used instead
# of a hard-coded 63 so the cell does not depend on the value of max_len.
print(last_hidden_states[0][0, -1, :5])
print(last_hidden_states[0][1, -1, :5])

In [None]:
# Look at one token embedding: tweet at row index 1 (the second tweet),
# token position 1.
# NOTE(review): the original note said "the word con in the first tweet",
# but the first tweet is row index 0 - confirm which row was intended.
con_embed = last_hidden_states[0][1,1]
print(type(con_embed))

# Same embedding converted to a NumPy array.
con_embed_np = np.array(last_hidden_states[0][1,1])
print(type(con_embed_np))

In [None]:
print(con_embed_np)

# 8. Test Get embedding for each word in a tweet

In [None]:
sentence_idx = 0
tokens_tweets = df_word.iloc[sentence_idx].tokens
embedding_tweets = last_hidden_states[0][sentence_idx]

print(tokens_tweets)
print(embedding_tweets.shape)

## 8.1 Get Words Function

In [None]:
# def create_word_df(the_tokens_list):   
#     # Part A: words df
#     the_clean_tokens_list = []
#     the_tokens_idx = []
#     merged_tokens_idx = []
#     max_len = len(the_tokens_list)
#     i = 0
#     word = the_tokens_list[i]
#     while(i < max_len):
#         add = True
#         word = the_tokens_list[i]
#         # 1. join ## words first
#         # if its not the last word, 
#         if(i <= (max_len-2)):
#             next_word = the_tokens_list[i+1]
#             pattern = r"(^##)"
#             # if the next word is ##
#             while (bool(re.search(pattern,next_word))):
#                 add = False
#                 print('index of word with ##: ',i+1)
#                 word = word + next_word[2:]
#                 if(i) not in merged_tokens_idx:
#                     merged_tokens_idx.append(i)
#                 merged_tokens_idx.append(i+1)
#                 print('word after merge:', word)
#                 # skip the next word becos we have merged it
#                 i+=1
#                 next_word = the_tokens_list[i+1]
        
#         if(merged_tokens_idx) not in the_tokens_idx:
#             the_tokens_idx.append(merged_tokens_idx) 
#         # 2. get tokens that are not padding, CLS, PAD, SEP, and not in stopwords
#         if(word!= '[CLS]') and (word!= '[PAD]') and (word!= '[SEP]') and (word not in stopwords.words('english')):     
#             the_clean_tokens_list.append(word)
#             if(add == True):
#                 the_tokens_idx.append(i)
                
        
#         # 3. move to next word
#         i+=1
    
#     the_df_dict= pd.DataFrame(the_clean_tokens_list)
#     the_df_dict.columns = ['words']
#     print(merged_tokens_idx)
#     return [the_df_dict ,the_tokens_idx, the_clean_tokens_list]
    
    

In [None]:
def create_word_df(the_tokens_list, stop_words=None):
    """Filter one tweet's tokens down to the words kept for the dictionary.

    Special tokens ([CLS], [PAD], [SEP]) and stop words are dropped.

    Args:
        the_tokens_list: Sequence of token strings for one tweet.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stop-word list.

    Returns:
        A three-element list ``[df, idx, words]`` where ``df`` is a DataFrame
        with a single ``words`` column, ``idx`` holds the positions of the
        kept tokens inside ``the_tokens_list`` and ``words`` is the list of
        kept tokens. An empty input yields an empty frame and empty lists.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')

    # Build the exclusion set once: the original reloaded the stop-word list
    # and scanned it linearly for every single token.
    excluded = set(stop_words) | {'[CLS]', '[PAD]', '[SEP]'}

    the_tokens_idx = [
        i for i, word in enumerate(the_tokens_list) if word not in excluded
    ]
    the_clean_tokens_list = [the_tokens_list[i] for i in the_tokens_idx]

    # Naming the column in the constructor also works when nothing is kept
    # (assigning .columns on an empty frame raises a length mismatch).
    the_df_dict = pd.DataFrame(the_clean_tokens_list, columns=['words'])
    return [the_df_dict, the_tokens_idx, the_clean_tokens_list]
    
    

In [None]:
df_dict,tokens_idx,clean_tokens_list  = create_word_df(tokens_tweets)

In [None]:
print(clean_tokens_list)

In [None]:
df_dict

## 8.2 Get Embedding Functions

In [None]:
def create_embedding_df(the_df_dict,the_embeddings_list, the_idx_list):
    """Attach the embeddings of the kept tokens to a word frame.

    Args:
        the_df_dict: Word DataFrame with one row per kept token. It is
            modified in place: an ``embeddings`` column is added.
        the_embeddings_list: Per-token embeddings of one tweet, indexable by
            token position.
        the_idx_list: Token positions to pick from ``the_embeddings_list``,
            one per row of ``the_df_dict``.

    Returns:
        A two-element list ``[the_df_dict, embeddings]`` where ``embeddings``
        is a NumPy array stacking the selected vectors.
    """
    # One NumPy vector per kept token, in the order given by the_idx_list.
    clean_embedding_list = [
        np.array(the_embeddings_list[embed_idx]) for embed_idx in the_idx_list
    ]
    the_embeddings_np = np.array(clean_embedding_list)

    # Assigning the list creates the column directly; no placeholder with a
    # hard-coded embedding size (768) is needed.
    the_df_dict['embeddings'] = clean_embedding_list

    return [the_df_dict, the_embeddings_np]
    

In [None]:
df_dict, embeddings_np = create_embedding_df(df_dict, embedding_tweets, tokens_idx)
df_dict

## 8.3 Get word and embedding for all tweets Functions

In [None]:
def get_words_and_embedding_from_df(the_df, hidden_states=None):
    """Build the word/embedding dictionary for every tweet of a frame.

    Args:
        the_df: DataFrame with a ``tokens`` column holding each tweet's
            token list; row position ``i`` must match ``hidden_states[i]``.
        hidden_states: Optional per-tweet token embeddings, indexable as
            ``hidden_states[tweet][token]``. Defaults to the notebook-level
            ``last_hidden_states[0]``.

    Returns:
        A DataFrame with the columns ``words``, ``embeddings`` and
        ``sentence_idx`` (row position of the source tweet). Each tweet's
        rows keep their own 0-based index, so callers usually reset it.
        An empty ``the_df`` yields an empty frame with those columns.
    """
    if hidden_states is None:
        hidden_states = last_hidden_states[0]

    frames = []
    for sentence_idx in range(len(the_df)):
        # Tokens and embeddings of this tweet. The original read the first
        # tweet's tokens from the global `tokens_tweets` instead of the_df.
        tokens_list = the_df.iloc[sentence_idx].tokens
        embedding_list = hidden_states[sentence_idx]

        sentence_df, tokens_idx, _ = create_word_df(tokens_list)
        sentence_df, _ = create_embedding_df(sentence_df, embedding_list, tokens_idx)
        sentence_df['sentence_idx'] = sentence_idx
        frames.append(sentence_df)

    if not frames:
        return pd.DataFrame(columns=['words', 'embeddings', 'sentence_idx'])

    # Concatenate once: repeated pd.concat inside the loop copies all rows
    # accumulated so far on every iteration.
    return pd.concat(frames)


In [None]:
df_dict = get_words_and_embedding_from_df(df_word)

In [None]:
df_dict.reset_index(drop = True, inplace = True)
df_dict

## 8.4 Double check if the embedding created is correct

In [None]:
# Spot-check one token of the last tweet in df_word. The row is derived from
# the frame's length (a hard-coded 999 is out of range for this frame) and
# the token position is capped at the last available token.
check_row = len(df_word) - 1
check_token = min(52, len(df_word.iloc[check_row].tokens) - 1)
print(df_word.iloc[check_row].tokens[check_token])

# get the embedding: [tensor][sentence][words][nth value]
last_hidden_states[0][check_row][check_token][:5]

## 8.6 Convert our embedding column to numpy array

In [None]:
# convert Pandas series to numpy array
# convert first to list, then convert to array
embeddings_np = np.array(df_dict.embeddings.tolist())
embeddings_np.shape

## 8.7 Remove Duplicates

In [None]:
# Question. There are no duplicates, its normal?
# np.unique(axis=0) returns the unique rows in *sorted* order, which no
# longer matches the row order of df_dict. Keep only the first-occurrence
# indices, sort them, and select the rows in their original order so that
# row i here corresponds to the i-th distinct embedding of df_dict
# (first occurrence kept).
_, unique_index = np.unique(embeddings_np, axis=0, return_index=True)
unique_index = np.sort(unique_index)
embeddings_unique_np = embeddings_np[unique_index]


In [None]:
print(embeddings_unique_np.shape)
unique_index

<span style='background-color:Teal'>There are 22,071 duplicates. Let's remove them from df_dict.</span>

In [None]:
# 1. convert each embedding array to a tuple (arrays are not hashable)
# 2. apply duplicated() to the tuples: True for every repeat after the first
# 3. keep the rows that are not flagged
# 4. copy, so df_dict owns its data and later column assignments do not
#    operate on a slice of the old frame
df_dict = df_dict[~df_dict['embeddings'].apply(tuple).duplicated()].copy()

In [None]:
df_dict.reset_index(drop = True, inplace = True)

# 9. Clustering

<span style='background-color:Teal'>We want to make sure that documents with similar sentiments are clustered together so that we can find the topics within these clusters. Before doing so, we first need to lower the dimensionality of the embeddings, as many clustering algorithms handle high dimensionality poorly.</span>

## 9.1 Reduce Dimensionality using UMAP

<span style="background-color:Teal">Of the few dimensionality reduction algorithms available, UMAP is arguably the best performing, as it keeps a significant portion of the high-dimensional local structure in lower dimensionality.</span>

In [None]:
embeddings_unique_np

In [None]:
# `umap` was used without ever being imported; import it next to its use,
# as the other clustering cells do (it ships in the umap-learn package).
import umap

# Reduce the unique embeddings to 100 components using cosine distance.
embeddings_reduced_np = umap.UMAP(n_neighbors=15,
                                  n_components=100,
                                  metric='cosine').fit_transform(embeddings_unique_np)


In [None]:
embeddings_reduced_np

## 9.2 Clustering with KMeans

<span style="background-color:Teal">After reducing the dimensionality of the embeddings with UMAP (n_components=100 in the cell above), we can cluster them. We start with KMeans in this section and try HDBSCAN in section 9.4. HDBSCAN is a density-based algorithm that works quite well with UMAP, since UMAP maintains a lot of local structure even in lower-dimensional space. Moreover, HDBSCAN does not force data points into clusters: points that do not fit anywhere are treated as outliers.</span>

In [None]:
# !pip install hdbscan --no-build-isolation --no-binary :all:

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Two clusters (intended as positive / negative). random_state is an integer
# seed; the original passed True, which is just the integer 1 in disguise.
model_KMeans = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50)
labels = model_KMeans.fit_predict(X=embeddings_reduced_np)
# positive_cluster_center = model_KMeans.cluster_centers_[0]
# negative_cluster_center = model_KMeans.cluster_centers_[1]

In [None]:
cluster_df = pd.DataFrame(labels, columns =['cluster_label'])

In [None]:
cluster_df

In [None]:
# Assign the label array positionally. Assigning a DataFrame aligns on the
# index instead and silently fills NaN where the indexes differ; an array
# of the wrong length fails loudly.
df_dict['cluster_label'] = labels


In [None]:
df_dict

In [None]:
# Words of each cluster, re-indexed from 0.
dict_0 = df_dict.loc[df_dict['cluster_label'] == 0, 'words'].reset_index(drop=True)
dict_1 = df_dict.loc[df_dict['cluster_label'] == 1, 'words'].reset_index(drop=True)


print('cluster 0 counts : ',len(dict_0))
print('cluster 1 counts : ',len(dict_1))

# List every word of cluster 1, one per line.
for word in dict_1:
    print(word)


In [None]:
# List every word of cluster 0, one per line.
for word in dict_0:
    print(word)

In [None]:
# List every word of cluster 1, one per line.
for word in dict_1:
    print(word)

<span style="background-color:Teal">Question:</span><br>
<span style="background-color:Teal">- There is a mix of positive and negative words in the same cluster: "fever" is negative while "happy" is positive.</span><br>
<span style="background-color:Teal">- Subwords are separated from their original word.</span><br>

## 9.3 Get the top 20 words closest to each centroid

<span style="background-color:Teal">- Question: how can this be done? The article I saw uses gensim similarity.</span>

<span style="background-color:Teal">- Question: how do we find the distance between the centroids and the words?</span>

In [None]:
# Distance of every word to each centroid, shape (n_words, n_clusters).
# transform() reuses the model fitted above; fit_transform() would run the
# whole clustering (n_init=50) a second time.
all_distance = model_KMeans.transform(embeddings_reduced_np)

In [None]:
totalDistance = np.min(all_distance, axis=1).sum()

# 9.4 HDBScan Clustering

In [None]:
import hdbscan

In [None]:
# Fit HDBSCAN on the UMAP-reduced embeddings with Euclidean distance and
# the 'eom' cluster selection method.
# NOTE(review): min_cluster_size=2 lets any pair of points form a cluster,
# which presumably explains the very large number of clusters seen below;
# try a larger value and confirm against the hdbscan documentation.
cluster = hdbscan.HDBSCAN(min_cluster_size=2,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(embeddings_reduced_np)

In [None]:
# Number of clusters found. HDBSCAN labels noise points -1, which is not a
# cluster, so it is left out of the count.
len(set(cluster.labels_) - {-1})

<span style="background-color:Teal">Question:</span><br>
<span style="background-color:Teal">- Why are there so many clusters? This is not good for sentiment clustering.</span>
    