# 1. Import

In [1]:
from __future__ import division
from collections import Counter
## for data
import json
import pandas as pd
import numpy as np
from sklearn import metrics, manifold

## for pre-processing
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for language detection
import langdetect
import spacy
from spacy_langdetect import LanguageDetector

## for w2v
import gensim
import gensim.downloader as gensim_api

## for bert
import transformers
from transformers import DistilBertTokenizer, DistilBertModel
import torch
  

## for predicting
from sklearn.metrics.pairwise import cosine_similarity

import re, nltk
import my_functions as func




In [2]:
# Load the tweet corpus (the CSV's saved index shows up as "Unnamed: 0").
tweets_csv = 'tweetBERT.csv'
df = pd.read_csv(tweets_csv)
df

Unnamed: 0,tweet
0,Real Estate Market would crash if there is no ...
1,"Concur. My company was 100% ""you MUST work in ..."
2,Why not ask if we really need that thing? I th...
3,"Dear Line Managers, Appraisal your subordinate..."
4,I have had more opportunities to work cross-fu...
...,...
6270,"Development work is inherently remote, we migh..."
6271,Employers should train your employees in #Cybe...
6272,Washingtonian staff goes on strike after CEO's...
6273,Happy Mother's Day to all the wonderful moms o...


In [None]:
func.print_tweet(df, len(df))

0
Real Estate Market would crash if there is no demand for commercial space. Hybrid work / Remote work works . If we design for it. For decades, Office space worked as space to socialize with fellow human beings. 

1
Concur. My company was 100% "you MUST work in the office" and now they have said that is gone. More importantly many of our leaders have moved remote and we have hired remotely. That is a genie that is REALLY hard to put back in the bottle. 

2
Why not ask if we really need that thing? I think it would be fair that anyone that could work remote was privileged to do so, considering how many essential workers and small businesses got fucked over. Coming to to the office now feels like a teacher telling me to learn cursive 

3
Dear Line Managers, Appraisal your subordinate based on their Job performance and not sentiment ,blood-line , religious group or tribe. #HR #Career #peformanceappraisal #EmployeeExperience #remotework #employees 

4
I have had more opportunities to work

# 2. Clean

In [None]:
# from unidecode import unidecode

def clean_BERT(text, isSentenceEmbed = True):
    '''Normalise a raw tweet before it is tokenised for DistilBERT.

    Cleaning approach adapted from eliorc's MaLSTM notebook:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Parameters
    ----------
    text : str
        Raw tweet text.
    isSentenceEmbed : bool, default True
        True  -> sentence embedding: hashtags (``#``), hyphens and colons
        are kept.
        False -> word embedding: the ``#`` symbol is dropped (hyphens and
        colons are dropped too, exactly as before).

    Returns
    -------
    str
        The text with every disallowed character replaced by a space, ``+``
        spelled out as " plus ", commas / periods / apostrophes removed,
        ``!``, ``?`` and ``:`` padded with spaces, and runs of whitespace
        collapsed to a single space. Leading/trailing spaces are NOT stripped.
    '''
    if isSentenceEmbed:
        # sentence embedding, we need the hashtag.
        # The hyphen is escaped on purpose: an unescaped "!-?" is a character
        # *range* (0x21-0x3F) that silently whitelists " $ & ( ) * ; < = >.
        # ":" is listed explicitly because it is spaced out further down.
        text = re.sub(r"[^A-Za-z0-9^,!\-?%.\/#'+:]", " ", text)
    else:
        # word embedding: remove the hashtag symbol
        text = re.sub(r"[^A-Za-z0-9^,!?%.\/'+]", " ", text)

    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    # One pass is enough: the result never contains two adjacent whitespace chars.
    text = re.sub(r"\s{2,}", " ", text)

    return text



In [None]:
# Sentence-embedding clean-up (hashtags kept), written back onto the tweet column.
df['tweet'] = df['tweet'].apply(clean_BERT, isSentenceEmbed=True)

In [None]:
# Create the smaller frame used to build our word dictionary (first 100 tweets).
df_word = df.iloc[:100].copy()
# presumably strips trailing hashtags from each tweet; see my_functions -- TODO confirm
df_word = func.remove_end_hashtag(df_word)
# Clean df_word's OWN tweets: reading from `df.tweet` here would throw away the
# result of remove_end_hashtag and needlessly clean every row of the full frame.
df_word['tweet'] = df_word['tweet'].apply(lambda x: clean_BERT(x, False))


In [None]:
func.print_tweet(df_word,len(df_word))

In [None]:
# Keep only the first occurrence of each distinct tweet text.
df_word = df_word.drop_duplicates(subset='tweet')

In [None]:
# Renumber rows 0..n-1 after de-duplication (later cells look rows up by integer label).
df_word = df_word.reset_index(drop=True)


In [None]:
df_word

# DistilBERT Word Embedding 

<span style='background-color:Teal'>Create word dictionary</span>

https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca<br>
    https://medium.com/analytics-vidhya/bert-word-embeddings-deep-dive-32f6214f02bf<br>
https://dzlab.github.io/dltips/en/tensorflow/create-bert-vocab/<br>

# 3. Load and Test DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast

PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'

# 1. tokenizer -- the Fast variant is used to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained(
    PRETRAINED_CHECKPOINT,
    pad_token='[PAD]',
    model_max_length=80,
)

# 2. word embedder (encoder) -- TensorFlow DistilBERT that also returns hidden states
model = transformers.TFDistilBertModel.from_pretrained(
    PRETRAINED_CHECKPOINT,
    output_hidden_states=True,
)



In [None]:
txt = 'Dear Line Managers Appraisal your subordinate based on their Job performance and not sentiment blood-line religious group or tribe'

# 1. encode -- `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
#    and truncation is requested explicitly so `max_length` is actually enforced.
ids = tokenizer.encode(txt,
                       max_length=75,
                       padding='max_length',
                       truncation=True,
                       add_special_tokens=True)
print("\nIDs   :\n", ids)

# 2. map the ids back to tokens (includes [CLS], [SEP] and the [PAD] filler)
tokens = tokenizer.convert_ids_to_tokens(ids)
print("\nTokens with special:\n", tokens)

# 3. Display the words with their IDs.
for token, token_id in zip(tokens, ids):
    print('{:<12} {:>6,}'.format(token, token_id))


In [None]:
# We will convert our sentence to 1 vector of 768D and store the result vector to our list
# embedded_text_list = []
# for i in range(5):
#     text = df.iloc[i].tweet
    
#     #ids
#     ids = tokenizer.encode(text)
    
#     #tokens
#     tokens = tokenizer.convert_ids_to_tokens(ids)

#     #array
#     ids_arr = np.array(ids)[None,:]
    
#     #embed
#     embedding = model(ids_arr)
    
#     embedded_text_list.append(embedding)
# # we have 5 samples x 768 

# # total number of tweet
# print(len(embedded_text_list))

# # shape of each embedded tweet
# print(embedded_text_list[1][0].shape)



# 4. Tokenize and Encoding

## 4.1 Encoding and Tokenize Sample

<span style='background-color:Teal'>We first need to get the maximum length of the tweets so we can pad them properly</span>

In [None]:
# Encode every tweet once, then track the longest id sequence seen.
ids_np = df_word.tweet.apply(tokenizer.encode)
max_len = 0
for ids in ids_np.values:
    max_len = max(max_len, len(ids))


<span style='background-color:Teal'>We want our IDs as a NumPy array in this format</span>

In [None]:
# Encode each tweet (with [CLS]/[SEP]) ...
id_ = df_word.tweet.apply(lambda tweet: tokenizer.encode(tweet, add_special_tokens=True))

# ... then right-pad every id list with 0 up to max_len and stack into one 2-D array
padded_rows = []
for token_ids in id_.values:
    n_missing = max_len - len(token_ids)
    padded_rows.append(token_ids + [0] * n_missing)
idJ_np = np.array(padded_rows)

print('\nshape:\n', idJ_np.shape)
print(idJ_np[0])

In [None]:
# 1 for real tokens, 0 wherever the id is the padding value 0
is_padding = (idJ_np == 0)
attention_mask = np.where(is_padding, 0, 1)
print('\nshape:\n', attention_mask.shape)
print(attention_mask[0])

## 4.2 Encoding to ID

In [None]:
# 1. Encoding: ids + attention mask for every tweet, padded/truncated to max_len
def _encode_tweet(tweet_text):
    """Encode one tweet with `encode_plus`, returning NumPy arrays of length max_len."""
    return tokenizer.encode_plus(
        tweet_text,
        add_special_tokens=True,
        max_length=max_len,       # maximum length of a sentence
        truncation=True,
        return_tensors='np',
        padding='max_length',     # Add [PAD]s
    )


ids_attn = df_word.tweet.apply(_encode_tweet)
print(ids_attn)
# print(type(ids_attn[0]['input_ids']))   
# print(ids_attn[0]['attention_mask'])   

In [None]:
# 2. Pull the id row out of each encoding and stack the rows into one array
input_ids_list = [ids_attn[row]['input_ids'][0] for row in range(len(ids_attn))]

input_ids_np = np.array(input_ids_list)
print(input_ids_np[0])
print('\ninput_ids_np.shape:\n', input_ids_np.shape)

## 4.3 Encoding to Attention_mask

## 4.4 Tokenize to words

In [None]:
# Tokenize every tweet to its WordPiece strings, padded/truncated to max_len.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
# matching the encode_plus call used for the ids.
tokens_np = df_word.tweet.apply(lambda x: tokenizer.tokenize(x,
                                                             add_special_tokens = True,
                                                             max_length = max_len,
                                                             truncation=True,
                                                             return_tensors = 'np',
                                                             padding = 'max_length'))
tokens_np[0]


# 5. Save IDs, attention_mask, and tokens as DF

## 5.1 Save Ids as Df

In [None]:
input_ids_np

In [None]:
# One Python list of token ids per tweet (a row of input_ids_np each).
id_list = input_ids_np.tolist()

# Assign directly: pre-filling the column with np.empty() only wrote
# uninitialised values that were overwritten on the very next line.
df_word['token_ids'] = id_list

df_word

## 5.2 Save tokens in Df

In [None]:
# `rename` returns a new frame, so its result has to be kept for the rename to take effect.
tokens_df = pd.DataFrame(tokens_np).rename(columns = {'tweet': 'tweet_tokens'})
# Assign the single column (a Series) rather than a whole DataFrame; rows align on the index.
df_word['tokens'] = tokens_df['tweet_tokens']
df_word = df_word[0:100]
df_word

# 6. Modelling

In [None]:
# get embedded last hidden states
# NOTE(review): `input_ids_np` comes from the encode_plus path while `attention_mask`
# was derived from the manually padded `idJ_np`; both are padded to max_len, so the
# shapes presumably agree -- confirm, or use the attention_mask returned by encode_plus.
last_hidden_states = model(input_ids_np, attention_mask=attention_mask)

In [None]:
last_hidden_states

# 7. Check our last hidden states

![title](last_hidden.png)

<span style='background-color:Teal'> first  [0] is the Tensor <br>
second [0] is the tweet <br>
third [0] is the word  <br></span>

In [None]:
# Let's see the embedding of the last position (PAD) of the 1st and 2nd sentence.
# Index -1 always addresses the final position; a hard-coded 63 is only valid
# when max_len happens to be 64.
print(last_hidden_states[0][0, -1, :5])
print(last_hidden_states[0][1, -1, :5])

<span style='background-color:Teal'> Shouldn't all padding have the same embedding? </span>

# 8. Test Get embedding for each word in a tweet

In [None]:
# Pick one tweet: its token list and the matching slice of the hidden states
sentence_idx, word_idx = 0, 0
tokens_tweets = df_word.iloc[sentence_idx]['tokens']
embedding_tweets = last_hidden_states[0][sentence_idx]

print(len(tokens_tweets))
print(embedding_tweets.shape)

In [None]:
def create_df_from_tweet_tokens(the_embeddings_list, the_tokens_list):
    '''Rebuild whole words from WordPiece tokens and return them as a DataFrame.

    Continuation pieces (tokens starting with ``##``) are glued onto the word
    that precedes them, and the special tokens ``[CLS]``, ``[SEP]`` and
    ``[PAD]`` are dropped. The input list is left untouched.

    Parameters
    ----------
    the_embeddings_list : any
        Embeddings of the same tweet. Currently unused; kept so existing
        callers keep working.
    the_tokens_list : list of str
        Tokens of one tweet, as produced by the tokenizer.

    Returns
    -------
    pandas.DataFrame
        A single column ``words`` with one reconstructed word per row
        (empty when no regular token is present).
    '''
    special_tokens = ('[CLS]', '[PAD]', '[SEP]')
    the_clean_tokens_list = []
    # True only while the previously seen token was (part of) a regular word,
    # so a "##" piece is never glued across a special token.
    previous_was_word = False

    for token in the_tokens_list:
        if token in special_tokens:
            previous_was_word = False
            continue

        if token.startswith('##'):
            piece = token[2:]
            if previous_was_word:
                # Extend the word in progress; handles any number of pieces.
                the_clean_tokens_list[-1] += piece
                continue
            # Orphan continuation piece: treat it as the start of a new word.
            token = piece

        the_clean_tokens_list.append(token)
        previous_was_word = True

    return pd.DataFrame({'words': the_clean_tokens_list})
    
    

In [None]:
df_dict = create_df_from_tweet_tokens(embedding_tweets, tokens_tweets)

In [None]:
df_dict

# 9. Clustering

<span style='background-color:Teal'> We want to make sure that documents with similar sentiments are clustered together so that we can find the topics within these clusters. Before doing so, we first need to lower the dimensionality of the embeddings, as many clustering algorithms handle high dimensionality poorly. </span>

## 9.1 Reduce Dimensionality using UMAP

<span style="background-color:Teal">Out of the few dimensionality reduction algorithms, UMAP is arguably the best performing as it keeps a significant portion of the high-dimensional local structure in lower dimensionality.</span>

In [None]:
import umap
# NOTE(review): `embedded_text_list` is only assigned inside a commented-out cell in
# the visible notebook, so this cell presumably fails on a fresh kernel -- confirm
# where the embeddings are meant to come from.
umap_tweet_list = []
for embedded_tweet in embedded_text_list:
    # A separate UMAP model is fitted for every tweet: 15 neighbours,
    # 5 output components, cosine distance.
    umap_tweet = umap.UMAP(n_neighbors=15,
                           n_components=5,
                           metric='cosine').fit_transform(embedded_tweet)
    umap_tweet_list.append(umap_tweet)

In [None]:
# total number of tweets
print(len(umap_tweet_list))

# shape of one reduced tweet; index -1 (the last one) avoids a hard-coded
# position (6268) that breaks as soon as fewer tweets are embedded
print(umap_tweet_list[-1].shape)



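## 9.2 Clustering: KMeans or HDBSCAN?
Question: the reference approach uses HDBSCAN; can I just use KMeans instead? (Note: KNN is a nearest-neighbour classifier, not a clustering algorithm; the code below uses KMeans.)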

<span style="background-color:Teal"> After having reduced the dimensionality of the document embeddings to 5, we can cluster the documents with HDBSCAN. HDBSCAN is a density-based algorithm that works quite well with UMAP since UMAP maintains a lot of local structure even in lower-dimensional space. Moreover, HDBSCAN does not force data points into clusters, as it can label them as outliers.</span>

In [None]:
# !pip install hdbscan --no-build-isolation --no-binary :all:

In [None]:
from sklearn.cluster import KMeans

In [None]:
# NOTE(review): `word_vectors` is not defined anywhere in the visible notebook -- confirm its source.
# The estimator gets its own name so it no longer overwrites `model`, the DistilBERT
# encoder loaded in section 3. random_state is an explicit integer seed
# (True is the integer 1 in Python, so the seed value is unchanged).
kmeans_model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors)
positive_cluster_center = kmeans_model.cluster_centers_[0]
negative_cluster_center = kmeans_model.cluster_centers_[1]

# TEST BERT JALAMAAR

# TEST BERT JALAMAAR

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings

In [None]:
# SST2 training split: tab-separated, no header row
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)



In [None]:
# Work on the first 100 rows only.
batch_1 = df[:100]
# Only a cell's last expression is displayed, so print the counts of column 1 explicitly.
print(batch_1[1].value_counts())
batch_1

In [None]:
# For DistilBERT: model class, tokenizer class and checkpoint name
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# Encode each sentence of column 0 (with special tokens) using the tokenizer already in scope
tokenized = batch_1[0].apply(lambda sentence: tokenizer.encode(sentence, add_special_tokens=True))

In [None]:
# padding
# NOTE(review): `max_lenJal` is initialised but never used; the loop below reads and
# updates the global `max_len` computed earlier for the tweets, so the padded width
# is at least that earlier value and `max_len` may be changed as a side effect.
# Presumably `max_lenJal` was intended throughout -- confirm before changing, since
# later cells index a fixed position of the padded sequence.
max_lenJal = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

# Right-pad every id list with 0 up to max_len and stack into a 2-D array.
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
# masking: 1 for real tokens, 0 where the id is the padding value 0
attention_maskJal = np.where(padded == 0, 0, 1)
attention_maskJal.shape



In [None]:
# modelling
# Instantiate the PyTorch DistilBERT selected above (`model_class` / `pretrained_weights`)
# instead of calling whatever object happens to be bound to `model` in the kernel.
bert_model = model_class.from_pretrained(pretrained_weights)

# A PyTorch model expects tensors rather than NumPy arrays.
input_ids = torch.tensor(padded)
attention_mask_pt = torch.tensor(attention_maskJal)

# Inference only: no gradients are needed.
with torch.no_grad():
    last_hidden_states = bert_model(input_ids, attention_mask=attention_mask_pt)

In [None]:
# First 5 embedding values at sequence position 53 for sentences 5 and 1.
# NOTE(review): position 53 is hard-coded and only valid when the padded length
# is at least 54 -- confirm which position is intended.
print(last_hidden_states[0][5][53][:5])
print(last_hidden_states[0][1][53][:5])

In [None]:
print(batch_1.iloc[5][0])
print(batch_1.iloc[1][0])