# 1. Import

In [1]:
from __future__ import division
from collections import Counter
## for data
import json
import pandas as pd
import numpy as np
from sklearn import metrics, manifold

## for pre-processing
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for language detection
import langdetect
import spacy
from spacy_langdetect import LanguageDetector

## for w2v
import gensim
import gensim.downloader as gensim_api

## for bert
import transformers
from transformers import DistilBertTokenizer, DistilBertModel
import torch
  

## for predicting
from sklearn.metrics.pairwise import cosine_similarity

import re, nltk
import my_functions as func




In [2]:
df = pd.read_csv('tweetBERT.csv')
df

Unnamed: 0,tweet
0,Real Estate Market would crash if there is no ...
1,"Concur. My company was 100% ""you MUST work in ..."
2,Why not ask if we really need that thing? I th...
3,"Dear Line Managers, Appraisal your subordinate..."
4,I have had more opportunities to work cross-fu...
...,...
6270,"Development work is inherently remote, we migh..."
6271,Employers should train your employees in #Cybe...
6272,Washingtonian staff goes on strike after CEO's...
6273,Happy Mother's Day to all the wonderful moms o...


In [3]:
func.print_tweet(df, len(df))

0
Real Estate Market would crash if there is no demand for commercial space. Hybrid work / Remote work works . If we design for it. For decades, Office space worked as space to socialize with fellow human beings. 

1
Concur. My company was 100% "you MUST work in the office" and now they have said that is gone. More importantly many of our leaders have moved remote and we have hired remotely. That is a genie that is REALLY hard to put back in the bottle. 

2
Why not ask if we really need that thing? I think it would be fair that anyone that could work remote was privileged to do so, considering how many essential workers and small businesses got fucked over. Coming to to the office now feels like a teacher telling me to learn cursive 

3
Dear Line Managers, Appraisal your subordinate based on their Job performance and not sentiment ,blood-line , religious group or tribe. #HR #Career #peformanceappraisal #EmployeeExperience #remotework #employees 

4
I have had more opportunities to work

# 2. Clean

In [4]:
# from unidecode import unidecode

def clean_BERT(text, isSentenceEmbed = True):

    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''

    if(isSentenceEmbed == True):
        # sentence embedding, we need hashatag
        text = re.sub(r"[^A-Za-z0-9^,!-?%.\/#'+]", " ", text)
    else:
        # word embedding remove hashtag symbol
        text = re.sub(r"[^A-Za-z0-9^,!?%.\/'+]", " ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text



In [5]:
df.tweet = df.tweet.apply(lambda x: clean_BERT(x, True))

In [6]:
# create new df that we need to create our word dictionary 
df_word = df.copy()[0:100]
df_word = func.remove_end_hashtag(df_word)
df_word.tweet = df.tweet.apply(lambda x: clean_BERT(x, False))


In [7]:
func.print_tweet(df_word,len(df_word))

0
Real Estate Market would crash if there is no demand for commercial space Hybrid work / Remote work works If we design for it For decades Office space worked as space to socialize with fellow human beings  

1
Concur My company was 100% you MUST work in the office and now they have said that is gone More importantly many of our leaders have moved remote and we have hired remotely That is a genie that is REALLY hard to put back in the bottle  

2
Why not ask if we really need that thing ? I think it would be fair that anyone that could work remote was privileged to do so considering how many essential workers and small businesses got fucked over Coming to to the office now feels like a teacher telling me to learn cursive 

3
Dear Line Managers Appraisal your subordinate based on their Job performance and not sentiment blood line religious group or tribe HR Career peformanceappraisal EmployeeExperience remotework employees 

4
I have had more opportunities to work cross functionally an

In [8]:
# remove duplicates
df_word = df_word.drop_duplicates(subset=['tweet'], keep='first')

In [9]:
df_word.reset_index(drop = True, inplace = True)


In [10]:
df_word

Unnamed: 0,tweet
0,Real Estate Market would crash if there is no ...
1,Concur My company was 100% you MUST work in th...
2,Why not ask if we really need that thing ? I t...
3,Dear Line Managers Appraisal your subordinate ...
4,I have had more opportunities to work cross fu...
...,...
95,Amid the COVID19 outbreak the challenges of Re...
96,RemoteWork and GlobalTalentMobility Stay on to...
97,The work I am doing is either bar tending or w...
98,I am with you on commuting and I extend that t...


# DistilBERT Word Embedding 

<span style='background-color:Teal'>Create word dictionary<span>

https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca<br>
    https://medium.com/analytics-vidhya/bert-word-embeddings-deep-dive-32f6214f02bf<br>
https://dzlab.github.io/dltips/en/tensorflow/create-bert-vocab/<br>

# 3. Load and Test DistilBERT

In [11]:
from transformers import DistilBertTokenizerFast

# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', pad_token='[PAD]', model_max_length = 80)

# 2. get word embedder (encoder)
model = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased',output_hidden_states=True)



Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [12]:
txt = 'Dear Line Managers Appraisal your subordinate based on their Job performance and not sentiment blood-line religious group or tribe'

# 1. encode
ids = tokenizer.encode(txt, max_length = 75,pad_to_max_length=True,add_special_tokens = True)
print("\nIDs   :\n", ids)

# 2. tokenize with CLS and SEP
tokens = tokenizer.convert_ids_to_tokens(ids)
print("\nTokens with special:\n", tokens)

# 3. Display the words with their IDs.
for tup in zip(tokens, ids ):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



IDs   :
 [101, 6203, 2240, 10489, 10439, 14995, 12002, 2115, 15144, 2241, 2006, 2037, 3105, 2836, 1998, 2025, 15792, 2668, 1011, 2240, 3412, 2177, 2030, 5917, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Tokens with special:
 ['[CLS]', 'dear', 'line', 'managers', 'app', '##rai', '##sal', 'your', 'subordinate', 'based', 'on', 'their', 'job', 'performance', 'and', 'not', 'sentiment', 'blood', '-', 'line', 'religious', 'group', 'or', 'tribe', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P



In [13]:
# We will convert our sentence to 1 vector of 768D and store the result vector to our list
# embedded_text_list = []
# for i in range(5):
#     text = df.iloc[i].tweet
    
#     #ids
#     ids = tokenizer.encode(text)
    
#     #tokens
#     tokens = tokenizer.convert_ids_to_tokens(ids)

#     #array
#     ids_arr = np.array(ids)[None,:]
    
#     #embed
#     embedding = model(ids_arr)
    
#     embedded_text_list.append(embedding)
# # we have 5 samples x 768 

# # total number of tweet
# print(len(embedded_text_list))

# # shape of each embedded tweet
# print(embedded_text_list[1][0].shape)



# 4. Tokenize and Encoding

## 4.1 Encoding and Tokenize Sample

<span style='background-color:Teal'>We first need to get the maximum length of the tweet so we can pad it properly<span>

In [14]:
ids_np = df_word.tweet.apply(lambda x: tokenizer.encode(x))
max_len = 0
for ids in ids_np.values:
    if len(ids) > max_len:
        max_len = len(ids)


<span style='background-color:Teal'>WE want our IDs numpy to be in this format<span>

In [15]:
# get ids as numpy array
id_ = df_word.tweet.apply(lambda x: tokenizer.encode(x, add_special_tokens = True)) # Add [PAD]s

idJ_np = np.array([i + [0]*(max_len-len(i)) for i in id_.values])

print('\nshape:\n', idJ_np.shape)
print(idJ_np[0])


shape:
 (100, 67)
[ 101 2613 3776 3006 2052 5823 2065 2045 2003 2053 5157 2005 3293 2686
 8893 2147 1013 6556 2147 2573 2065 2057 2640 2005 2009 2005 5109 2436
 2686 2499 2004 2686 2000 2591 4697 2007 3507 2529 9552  102    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


In [16]:
attention_mask = np.where(idJ_np != 0, 1, 0)
print('\nshape:\n',attention_mask.shape)
print(attention_mask[0])


shape:
 (100, 67)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## 4.2 Encoding to ID

In [17]:
# 1. Encoding
ids_attn = df_word.tweet.apply(lambda x: tokenizer.encode_plus(x,
                                                        add_special_tokens = True,
                                                        max_length =max_len,# maximum length of a sentence
                                                        truncation=True,
                                                        return_tensors = 'np',
                                                        padding = 'max_length')) # Add [PAD]s
print(ids_attn)
# print(type(ids_attn[0]['input_ids']))   
# print(ids_attn[0]['attention_mask'])   

0     [input_ids, attention_mask]
1     [input_ids, attention_mask]
2     [input_ids, attention_mask]
3     [input_ids, attention_mask]
4     [input_ids, attention_mask]
                 ...             
95    [input_ids, attention_mask]
96    [input_ids, attention_mask]
97    [input_ids, attention_mask]
98    [input_ids, attention_mask]
99    [input_ids, attention_mask]
Name: tweet, Length: 100, dtype: object


In [18]:
# 2. get IDs numpy
input_ids_list = []
for i in range(len(ids_attn)):
    input_ids_list.append(ids_attn[i]['input_ids'][0])

    
input_ids_np = np.array(input_ids_list)
print(input_ids_np[0])
print('\ninput_ids_np.shape:\n', input_ids_np.shape)

[ 101 2613 3776 3006 2052 5823 2065 2045 2003 2053 5157 2005 3293 2686
 8893 2147 1013 6556 2147 2573 2065 2057 2640 2005 2009 2005 5109 2436
 2686 2499 2004 2686 2000 2591 4697 2007 3507 2529 9552  102    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]

input_ids_np.shape:
 (100, 67)


## 4.3 Encoding to Attention_mask

## 4.4 Tokenize to words

In [19]:
tokens_np = df_word.tweet.apply(lambda x: tokenizer.tokenize(x,
                                                             add_special_tokens = True,
                                                             max_length = max_len,
                                                             truncation=True,
                                                             return_tensors = 'np',
                                                             pad_to_max_length=True))
tokens_np[0]


['[CLS]',
 'real',
 'estate',
 'market',
 'would',
 'crash',
 'if',
 'there',
 'is',
 'no',
 'demand',
 'for',
 'commercial',
 'space',
 'hybrid',
 'work',
 '/',
 'remote',
 'work',
 'works',
 'if',
 'we',
 'design',
 'for',
 'it',
 'for',
 'decades',
 'office',
 'space',
 'worked',
 'as',
 'space',
 'to',
 'social',
 '##ize',
 'with',
 'fellow',
 'human',
 'beings',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

# 5. Save IDs, attention_mask, and tokens as DF

## 5.1 Save Ids as Df

In [20]:
input_ids_np

array([[  101,  2613,  3776, ...,     0,     0,     0],
       [  101,  9530, 10841, ...,     0,     0,     0],
       [  101,  2339,  2025, ...,     0,     0,     0],
       ...,
       [  101,  1996,  2147, ...,     0,     0,     0],
       [  101,  1045,  2572, ...,     0,     0,     0],
       [  101,  7668,  5973, ...,     0,     0,     0]])

In [21]:
id_list = input_ids_np.tolist()

# # assign empty list to token ids column
df_word['token_ids'] = np.empty((len(df_word), max_len)).tolist()

# assign token ids 
df_word['token_ids'] = id_list

df_word

Unnamed: 0,tweet,token_ids
0,Real Estate Market would crash if there is no ...,"[101, 2613, 3776, 3006, 2052, 5823, 2065, 2045..."
1,Concur My company was 100% you MUST work in th...,"[101, 9530, 10841, 2099, 2026, 2194, 2001, 253..."
2,Why not ask if we really need that thing ? I t...,"[101, 2339, 2025, 3198, 2065, 2057, 2428, 2342..."
3,Dear Line Managers Appraisal your subordinate ...,"[101, 6203, 2240, 10489, 10439, 14995, 12002, ..."
4,I have had more opportunities to work cross fu...,"[101, 1045, 2031, 2018, 2062, 6695, 2000, 2147..."
...,...,...
95,Amid the COVID19 outbreak the challenges of Re...,"[101, 13463, 1996, 2522, 17258, 16147, 8293, 1..."
96,RemoteWork and GlobalTalentMobility Stay on to...,"[101, 6556, 6198, 1998, 3795, 22059, 3372, 530..."
97,The work I am doing is either bar tending or w...,"[101, 1996, 2147, 1045, 2572, 2725, 2003, 2593..."
98,I am with you on commuting and I extend that t...,"[101, 1045, 2572, 2007, 2017, 2006, 4012, 2812..."


## 5.2 Save tokens in Df

In [22]:
tokens_df = pd.DataFrame(tokens_np)
tokens_df.rename(columns = {'tweet': 'tweet_tokens'})
df_word['tokens'] = tokens_df
df_word = df_word[0:100]
df_word

Unnamed: 0,tweet,token_ids,tokens
0,Real Estate Market would crash if there is no ...,"[101, 2613, 3776, 3006, 2052, 5823, 2065, 2045...","[[CLS], real, estate, market, would, crash, if..."
1,Concur My company was 100% you MUST work in th...,"[101, 9530, 10841, 2099, 2026, 2194, 2001, 253...","[[CLS], con, ##cu, ##r, my, company, was, 100,..."
2,Why not ask if we really need that thing ? I t...,"[101, 2339, 2025, 3198, 2065, 2057, 2428, 2342...","[[CLS], why, not, ask, if, we, really, need, t..."
3,Dear Line Managers Appraisal your subordinate ...,"[101, 6203, 2240, 10489, 10439, 14995, 12002, ...","[[CLS], dear, line, managers, app, ##rai, ##sa..."
4,I have had more opportunities to work cross fu...,"[101, 1045, 2031, 2018, 2062, 6695, 2000, 2147...","[[CLS], i, have, had, more, opportunities, to,..."
...,...,...,...
95,Amid the COVID19 outbreak the challenges of Re...,"[101, 13463, 1996, 2522, 17258, 16147, 8293, 1...","[[CLS], amid, the, co, ##vid, ##19, outbreak, ..."
96,RemoteWork and GlobalTalentMobility Stay on to...,"[101, 6556, 6198, 1998, 3795, 22059, 3372, 530...","[[CLS], remote, ##work, and, global, ##tale, #..."
97,The work I am doing is either bar tending or w...,"[101, 1996, 2147, 1045, 2572, 2725, 2003, 2593...","[[CLS], the, work, i, am, doing, is, either, b..."
98,I am with you on commuting and I extend that t...,"[101, 1045, 2572, 2007, 2017, 2006, 4012, 2812...","[[CLS], i, am, with, you, on, com, ##mut, ##in..."


# 6. Modelling

In [23]:
# get embedded last hidden states
last_hidden_states = model(input_ids_np, attention_mask=attention_mask)

In [24]:
last_hidden_states

TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(100, 67, 768), dtype=float32, numpy=
array([[[-0.02339539, -0.09013493, -0.07003146, ..., -0.56661355,
          0.5897922 ,  0.06696324],
        [ 0.8093122 ,  0.25729024,  0.32817924, ..., -0.54350966,
          0.6168786 , -0.05259355],
        [ 0.5150185 , -0.2593718 ,  0.7684823 , ..., -0.42708883,
          0.41538167, -0.27274   ],
        ...,
        [ 0.34820083,  0.02191923,  0.41176152, ..., -0.4235657 ,
          0.10091731,  0.00427072],
        [ 0.3804726 , -0.11843493,  0.41450268, ..., -0.3025887 ,
          0.0675649 ,  0.05992321],
        [ 0.38783982, -0.12908062,  0.35594252, ..., -0.3052296 ,
          0.03858719,  0.01343088]],

       [[ 0.23057267,  0.09537684,  0.06643021, ..., -0.25237364,
          0.57970953,  0.15485445],
        [ 0.35936505, -0.1498798 ,  0.34110406, ..., -0.281134  ,
          0.644292  ,  0.16273354],
        [-0.4610675 , -0.5297161 ,  0.54784894, ..., -0.30162263,
          0

# 7. Check our last hidden states

![title](last_hidden.png)

<span style='background-color:Teal'> first  [0] is the Tensor <br>
second [0] is the tweet <br>
third [0] is the word  <br><span>

In [25]:
# Lets see the embedding of the last word(PAD) of 1st and second sentence
print(last_hidden_states[0][0,63,:5])
print(last_hidden_states[0][1,63,:5])

tf.Tensor([ 0.22503561 -0.12309138  0.07410068  0.23669322  0.11658055], shape=(5,), dtype=float32)
tf.Tensor([ 0.12682685  0.10939362  0.12900117  0.22593094 -0.03126201], shape=(5,), dtype=float32)


<span style='background-color:Teal'> Shouldn't all padding have the same embedding? <span>

# 8. Test Get embedding for each word in a tweet

In [26]:
sentence_idx = 1
tokens_tweets = df_word.iloc[sentence_idx].tokens
embedding_tweets = last_hidden_states[0][sentence_idx]

print(tokens_tweets)
print(embedding_tweets.shape)

['[CLS]', 'con', '##cu', '##r', 'my', 'company', 'was', '100', '%', 'you', 'must', 'work', 'in', 'the', 'office', 'and', 'now', 'they', 'have', 'said', 'that', 'is', 'gone', 'more', 'importantly', 'many', 'of', 'our', 'leaders', 'have', 'moved', 'remote', 'and', 'we', 'have', 'hired', 'remotely', 'that', 'is', 'a', 'genie', 'that', 'is', 'really', 'hard', 'to', 'put', 'back', 'in', 'the', 'bottle', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
(67, 768)


In [28]:
from nltk.corpus import stopwords

# filtered_words = [word for word in word_list if word not in stopwords.words('english')]

In [36]:
def create_word_df(the_tokens_list):   
    # Part A: words df
    the_clean_tokens_list = []
    the_tokens_idx = []
    merged_tokens_idx = []
    max_len = len(the_tokens_list)
    i = 0
    word = the_tokens_list[i]
    while(i < max_len):
        add = True
        word = the_tokens_list[i]
        # 1. join ## words first
        # if its not the last word, 
        if(i <= (max_len-2)):
            next_word = the_tokens_list[i+1]
            pattern = r"(^##)"
            # if the next word is ##
            while (bool(re.search(pattern,next_word))):
                add = False
                print('index of word with ##: ',i+1)
                word = word + next_word[2:]
                if(i) not in merged_tokens_idx:
                    merged_tokens_idx.append(i)
                merged_tokens_idx.append(i+1)
                print('word after merge:', word)
                # skip the next word becos we have merged it
                i+=1
                next_word = the_tokens_list[i+1]
        
        if(merged_tokens_idx) not in the_tokens_idx:
            the_tokens_idx.append(merged_tokens_idx) 
        # 2. get tokens that are not padding, CLS, PAD, SEP, and not in stopwords
        if(word!= '[CLS]') and (word!= '[PAD]') and (word!= '[SEP]') and (word not in stopwords.words('english')):     
            the_clean_tokens_list.append(word)
            if(add == True):
                the_tokens_idx.append(i)
                
        
        # 3. move to next word
        i+=1
    
    the_df_dict= pd.DataFrame(the_clean_tokens_list)
    the_df_dict.columns = ['words']
    print(merged_tokens_idx)
    return [the_df_dict ,the_tokens_idx, the_clean_tokens_list]
    
    

In [37]:
df_dict,tokens_idx,clean_tokens_list  = create_word_df(tokens_tweets)

index of word with ##:  2
word after merge: concu
index of word with ##:  3
word after merge: concur
[1, 2, 3]


In [38]:
print(len(tokens_idx))
tokens_idx

22


[[1, 2, 3],
 5,
 7,
 8,
 10,
 11,
 14,
 19,
 22,
 24,
 25,
 28,
 30,
 31,
 35,
 36,
 40,
 43,
 44,
 46,
 47,
 50]

In [39]:
df_dict

Unnamed: 0,words
0,concur
1,company
2,100
3,%
4,must
5,work
6,office
7,said
8,gone
9,importantly


In [None]:
def create_embedding_df(the_embeddings_list, the_idx):
    clean_embedding_list = []
    # 1. loop through tokens_idx 
    for i in range(len(the_idx)):
        embed_idx = the_idx[i]
        
        # 2. if this is a list, loop thorough this idx, to get the index of the embedding vector
        
        
        # get the average of these vectors
        
        # append to a list of embedding
        
        
        # 3. else ( not a list):
        
        # get the embedding from emebedding_list

# 9. Clustering

<span style='background-color:Teal'> We want to make sure that documents with similar sentiments are clustered together such that we can find the topics within these clusters. Before doing so, we first need to lower the dimensionality of the embeddings as many clustering algorithms handle high dimensionality poorly. <span>

## 9.1 Reduce Dimensionality using UMAP

<span style="background-color:Teal">Out of the few dimensionality reduction algorithms, UMAP is arguably the best performing as it keeps a significant portion of the high-dimensional local structure in lower dimensionality.<span>

In [None]:
import umap
umap_tweet_list = []
for embedded_tweet in embedded_text_list:
    umap_tweet = umap.UMAP(n_neighbors=15,
                           n_components=5,
                           metric='cosine').fit_transform(embedded_tweet)
    umap_tweet_list.append(umap_tweet)

In [None]:
# total number of tweet
print(len(umap_tweet_list))

# shape of each embedded tweet
print(umap_tweet_list[6268].shape)



## 9.2 Clustering KNN?
Question : I see HDBSCAN, can I just use KNN?

<span style="background-color:Teal"> After having reduced the dimensionality of the documents embeddings to 5, we can cluster the documents with HDBSCAN. HDBSCAN is a density-based algorithm that works quite well with UMAP since UMAP maintains a lot of local structure even in lower-dimensional space. Moreover, HDBSCAN does not force data points to clusters as it considers them outliers.<span>

In [None]:
# !pip install hdbscan --no-build-isolation --no-binary :all:

In [None]:
from sklearn.cluster import KMeans

In [None]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

# TEST BERT JALAMAAR

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings

In [None]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)



In [None]:
batch_1 = df[:100]
batch_1[1].value_counts()
batch_1

In [None]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
# padding
max_lenJal = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
# masking
attention_maskJal = np.where(padded != 0, 1, 0)
attention_maskJal.shape



In [None]:
# modelling
with torch.no_grad():
    last_hidden_states = model(padded, attention_mask=attention_maskJal)

In [None]:
print(last_hidden_states[0][5][53][:5])
print(last_hidden_states[0][1][53][:5])

In [None]:
print(batch_1.iloc[5][0])
print(batch_1.iloc[1][0])