# 0 - Import & Load Data

In [1]:
## for data
import json
import pandas as pd
import numpy as np

# to compute cosine similarity
from sklearn import metrics, manifold


In [2]:
df_own_dict = pd.read_json('df_own_dict.json')
df_data_dict = pd.read_json('df_data_dict.json')
df_sentence = pd.read_json('df_sentence.json')
df_vader = pd.read_csv('df_vader.csv')

In [3]:
print(type(df_own_dict.iloc[0].embedding))
df_own_dict

<class 'list'>


Unnamed: 0,sentiment,embedding
0,positive_own,"[-0.23684903980000002, 0.11508918550000001, 0...."
1,negative_own,"[-0.45195508, 0.0403723456, 0.5418859124000001..."


In [4]:
df_data_dict

Unnamed: 0,sentiment,embedding
0,positive_data,"[-0.6036096215, 0.11336755750000001, 0.6376053..."
1,negative_data,"[-0.7820361257, 0.20127013330000001, 0.5900069..."


In [36]:
df_sentence.head(10)

Unnamed: 0,tweet,tweet_embedding,data_predicted,own_predicted
0,Real Estate Market would crash if there is no ...,"[0.1891789287, -0.0161942858, 0.626776576, -1....",POSITIVE,NEGATIVE
1,"Concur. My company was 100% ""you MUST work in ...","[-0.3543173969, -0.3487285674, 0.2804761231000...",POSITIVE,NEGATIVE
2,Why not ask if we really need that thing? I th...,"[-0.7401492596, -0.3735173047, 0.1090161577, -...",NEGATIVE,NEGATIVE
3,"Dear Line Managers, Appraisal your subordinate...","[-0.6507265568, 0.33605423570000004, 0.3439847...",NEGATIVE,NEGATIVE
4,I have had more opportunities to work cross-fu...,"[-0.2743485272, -0.29202288390000003, 0.711124...",NEGATIVE,POSITIVE
5,Study reveals growing cybersecurity risks driv...,"[-1.0207954645, -0.6177105904, 0.7027013302, -...",NEGATIVE,POSITIVE
6,"As a remote employee, you may be tempted to ch...","[-0.6124552488, 0.043601572500000005, 0.210215...",POSITIVE,NEGATIVE
7,"I am lucky, mine is moving to a hybrid model. ...","[-0.7892742157, -0.9139534831, 0.4314154088, -...",POSITIVE,POSITIVE
8,Shifting to a #remotework environment created ...,"[-0.4753146172, -0.188119173, 0.6137900352, -1...",NEGATIVE,NEGATIVE
9,professionals from a range of industries who n...,"[-0.7931020856000001, -0.4555655718, 0.8284254...",NEGATIVE,POSITIVE


In [7]:
df_vader = df_vader.drop(['VaderScore'],1)

In [8]:
df_vader.head(10)

Unnamed: 0,tweet,VaderSentiment
0,Real Estate Market would crash if there is no ...,NEGATIVE
1,"Concur. My company was 100% ""you MUST work in ...",NEGATIVE
2,Why not ask if we really need that thing? I th...,POSITIVE
3,"Dear Line Managers, Appraisal your subordinate...",POSITIVE
4,I have had more opportunities to work cross-fu...,POSITIVE
5,Study reveals growing cybersecurity risks driv...,NEGATIVE
6,"As a remote employee, you may be tempted to ch...",NEGATIVE
7,"I am lucky, mine is moving to a hybrid model. ...",POSITIVE
8,Shifting to a #remotework environment created ...,POSITIVE
9,professionals from a range of industries who n...,POSITIVE


<span style="background-color:Teal"> We need to compute the distance between: <span><br>
<span style="background-color:Teal"> 1. Using +ve and -ve words from our data: <br>
<span style="background-color:Teal"> a) tweet -> positive average embedding <span><br>
<span style="background-color:Teal"> b) tweet -> negative average embedding <span><br><span style="background-color:Teal"> b) tweet -> negative average embedding <span><br>

<span style="background-color:Teal"> 2. Using our own +ve and -ve words  <span><br>
<span style="background-color:Teal"> a) tweet -> positive average embedding <span><br>
<span style="background-color:Teal"> b) tweet -> negative average embedding <span><br>

# 1 - Distance between tweets and words clusters from our data

## 1.1 Convert sentence and word embedding as list

In [9]:
# 1. get sentence embedding as a list
sentence_embedding_list = df_sentence.tweet_embedding.to_list()

In [10]:
print(len(sentence_embedding_list[0]))
print(len(sentence_embedding_list))

768
3000


In [11]:
# 2. get word embedding as a list
data_embedding_list = df_data_dict.embedding.to_list()

In [12]:
print(len(data_embedding_list[0]))
print(len(data_embedding_list))

768
2


## 1.2 Loop through both list and compute distance

In [13]:
# 3. loop through the sentence embedding list and compute distance

data_distances_all_sentences = []

# 1. For each text:
for sentence_embedding in sentence_embedding_list:
    # create empty list of distance for each text
    distance_single_sentence = []
    
    for category_embedding in data_embedding_list:  
        # 2. compute distance to each of the 2 sentiments (positive + negative)
        # cosine_similarity takes two 2D array as an argument
        distance = metrics.pairwise.cosine_similarity([sentence_embedding], [category_embedding])
        # 3. insert distance to a list
        distance_single_sentence.append(distance[0][0])
    data_distances_all_sentences.append(distance_single_sentence)
        

In [14]:
print('number of distances: ',len(data_distances_all_sentences))

print('\nfirst distance: ', data_distances_all_sentences[0])

df_data_distance = pd.DataFrame(data_distances_all_sentences, columns=['POSITIVE', 'NEGATIVE'])



number of distances:  3000

first distance:  [0.6652183343769393, 0.6391052061015284]


In [15]:
df_data_distance

Unnamed: 0,POSITIVE,NEGATIVE
0,0.665218,0.639105
1,0.720249,0.662964
2,0.685392,0.719752
3,0.556406,0.677702
4,0.596768,0.638734
...,...,...
2995,0.271691,0.292918
2996,0.669600,0.599127
2997,0.664723,0.624095
2998,0.643832,0.713236


## 1.3 Adjust and Rescale

<span style="background-color:Teal">to make each row's sum of similarity to be equal to 1<span>

In [16]:
# create random number from 0 to 2
labels = ['POSITIVE', 'NEGATIVE']
print(labels)
print([0]*2)

# create random number 0-2
np.random.choice(range(2))

['POSITIVE', 'NEGATIVE']
[0, 0]


0

In [17]:
# for each row
for i in range(len(data_distances_all_sentences)):
    # sum this row 
    sum_row = sum(data_distances_all_sentences[i])
    
    # 1. if no similarity to any of the category assign randomly
    if(sum_row == 0):
        # assign 0 to each category
        data_distances_all_sentences[i] = [0]*len(labels)
        
        # get random index between 0-2
        random_index = np.random.choice(range(2))
        data_distances_all_sentences[random_index] = 1
        
    # 2. if not rescale so they sum == 1
    data_distances_all_sentences[i] = data_distances_all_sentences[i]/ sum_row

In [18]:
# check if each row no sums to 1
df_distance = pd.DataFrame(data_distances_all_sentences, columns=['POSITIVE', 'NEGATIVE'])
df_distance.head(7)

Unnamed: 0,POSITIVE,NEGATIVE
0,0.51001,0.48999
1,0.520707,0.479293
2,0.487773,0.512227
3,0.450857,0.549143
4,0.483016,0.516984
5,0.491955,0.508045
6,0.500046,0.499954


## 1.4 Classify tweet with highest similarity score

In [19]:
predicted =[]

# for each distance pair:
for row in data_distances_all_sentences:
    # 1. get the index with highest score . 
    # index 1 means negative, index 0 means positive
    prediction_index = np.argmax(row)
    print(prediction_index)
    # 2. get the name of our prediction
    prediction_name = labels[prediction_index]
    
    predicted.append(prediction_name)

0
0
1
1
1
1
0
0
1
1
1
1
0
1
1
0
1
1
1
1
1
1
1
0
0
0
0
0
0
1
0
0
0
1
0
1
1
1
0
0
0
0
0
0
1
1
0
0
1
1
1
1
1
1
0
0
1
0
1
1
1
0
0
1
0
0
1
0
1
0
0
1
0
1
1
0
0
0
0
0
1
1
1
1
1
1
0
0
0
1
1
1
1
1
1
1
1
1
0
0
1
1
0
0
1
1
0
0
0
0
0
0
1
1
1
0
0
0
0
0
1
0
1
0
1
0
0
1
1
1
0
1
1
1
0
0
1
1
0
1
1
1
0
0
0
1
1
0
0
0
0
1
1
1
0
1
1
0
0
1
0
1
1
0
1
0
1
1
0
0
1
1
1
0
1
0
0
0
0
0
1
1
0
0
0
0
0
1
1
0
1
1
0
0
1
0
0
0
1
1
1
0
1
1
0
0
0
0
1
0
0
1
0
1
1
0
0
0
1
0
0
0
1
0
1
1
1
0
0
0
1
1
1
1
1
0
1
0
1
0
0
1
1
1
0
1
1
0
0
1
1
0
1
1
1
1
0
1
1
0
1
0
1
1
0
0
0
0
0
1
0
1
1
1
0
0
1
0
1
1
1
1
0
1
0
0
1
0
0
0
1
1
1
0
0
0
0
0
0
1
1
1
1
1
1
0
0
1
0
0
0
1
1
1
1
1
1
1
0
1
1
0
1
0
1
0
0
0
0
0
0
0
1
0
1
0
0
1
1
0
1
1
0
0
1
1
1
1
0
0
1
1
0
1
1
1
0
0
1
1
1
0
0
1
1
1
1
1
0
1
1
0
1
0
0
1
0
1
0
1
0
0
0
1
1
0
1
1
0
0
1
0
1
1
1
0
1
1
0
1
1
0
1
1
0
1
1
0
0
1
0
0
0
1
1
1
1
1
0
1
1
0
1
1
1
0
0
1
0
0
1
0
1
0
1
0
0
1
1
1
1
0
0
0
0
1
0
1
0
1
0
0
0
0
0
0
1
0
1
1
1
1
1
1
0
1
1
0
0
0
1
1
1
0
1
1
0
1
0
1
0
1
0
0
0
0
1
1
1
0
0
1
1
1
0
1
0
0
1
1


In [20]:
predicted[0:6]

['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE']

In [21]:
# convert to dataframe
predicted_data_df = pd.DataFrame(predicted)

In [22]:
df_sentence['data_predicted'] = predicted_data_df.values

In [23]:
df_sentence

Unnamed: 0,tweet,tweet_embedding,data_predicted
0,Real Estate Market would crash if there is no ...,"[0.1891789287, -0.0161942858, 0.626776576, -1....",POSITIVE
1,"Concur. My company was 100% ""you MUST work in ...","[-0.3543173969, -0.3487285674, 0.2804761231000...",POSITIVE
2,Why not ask if we really need that thing? I th...,"[-0.7401492596, -0.3735173047, 0.1090161577, -...",NEGATIVE
3,"Dear Line Managers, Appraisal your subordinate...","[-0.6507265568, 0.33605423570000004, 0.3439847...",NEGATIVE
4,I have had more opportunities to work cross-fu...,"[-0.2743485272, -0.29202288390000003, 0.711124...",NEGATIVE
...,...,...,...
2995,I am available if a remote work opportunity ar...,"[-0.7216980457000001, -0.13284055890000002, -0...",NEGATIVE
2996,It made almost everyone work with limited reso...,"[-0.2861507535, -0.3813780248, 0.8702679873, 0...",POSITIVE
2997,I started moving things into my new office ups...,"[-0.5724875927, 0.11444271360000001, 0.7665416...",POSITIVE
2998,DYK that we have spent more than 5.5 billion m...,"[-0.5358973145, -0.4356870949, 0.8506852984000...",NEGATIVE


# 2 - Distance between tweets and words clusters from our data

## 2.1 Convert own word embedding as list

In [24]:
own_embedding_list = df_own_dict.embedding.to_list()

In [25]:
len(own_embedding_list)

2

## 2.2 Loop through sentence and word list to compute distance

In [26]:
# 3. loop through the sentence embedding list and compute distance
own_distances_all_sentences = []

# 1. For each text:
for sentence_embedding in sentence_embedding_list:
    # create empty list of distance for each text
    distance_single_sentence = []
    
    for category_embedding in own_embedding_list:  
        # 2. compute distance to each of the 2 sentiments (positive + negative)
        # cosine_similarity takes two 2D array as an argument
        distance = metrics.pairwise.cosine_similarity([sentence_embedding], [category_embedding])
 
        # 3. insert distance to a list
        distance_single_sentence.append(distance[0][0])
    own_distances_all_sentences.append(distance_single_sentence)

In [27]:
own_distances_all_sentences

[[0.31764481636364583, 0.4461235460916147],
 [0.26059102251646576, 0.3697676735380847],
 [0.3370278571966943, 0.41393971738077395],
 [0.1650824934743209, 0.16923247266765018],
 [0.4635582893048704, 0.21014460672214585],
 [0.3106496122648842, 0.27604817293322537],
 [0.28161835226741083, 0.4068274823572376],
 [0.421867817447115, 0.1535374762765921],
 [0.28709499471074007, 0.30969846949502583],
 [0.40488109797231575, 0.09917539884660083],
 [0.1289141322865683, 0.2892959098002108],
 [0.4905684911318522, 0.2369096311703943],
 [0.20392658984242654, 0.28941752317900027],
 [0.123433855745184, 0.4747190317173344],
 [0.09810009956112711, 0.4135555396226889],
 [0.2483667345745799, 0.4466253506951488],
 [0.19605134739070473, 0.4914112847772433],
 [0.1512908728105473, 0.2572071755763475],
 [0.31015814754068727, 0.2674129935317172],
 [0.5603697649657289, 0.189122090775567],
 [0.3122801440006717, 0.47531260603126924],
 [0.04166190679848971, 0.2849522642951665],
 [0.05098210853995948, 0.24540786469150

## 2.3 Adjust and Rescale

In [28]:
# for each row
for i in range(len(own_distances_all_sentences)):
    # sum this row 
    sum_row = sum(own_distances_all_sentences[i])
    
    # 1. if no similarity to any of the category assign randomly
    if(sum_row == 0):
        # assign 0 to each category
        own_distances_all_sentences[i] = [0]*len(labels)
        
        # get random index between 0-2
        random_index = np.random.choice(range(2))
        own_distances_all_sentences[random_index] = 1
        
    # 2. if not rescale so they sum == 1
    own_distances_all_sentences[i] = own_distances_all_sentences[i]/ sum_row

## 2.4 Classify tweet with highest similarity score

In [29]:
own_predicted =[]

# for each distance pair:
for row in own_distances_all_sentences:
    # 1. get the index with highest score . 
    # index 1 means negative, index 0 means positive
    prediction_index = np.argmax(row)
    print(prediction_index)
    # 2. get the name of our prediction
    prediction_name = labels[prediction_index]
    
    own_predicted.append(prediction_name)

1
1
1
1
0
0
1
0
1
0
1
0
1
1
1
1
1
1
0
0
1
1
1
0
1
0
1
0
0
1
1
0
1
1
0
1
0
1
1
0
1
0
1
0
1
1
1
1
0
1
0
1
0
0
0
1
1
0
0
0
1
1
1
0
0
0
0
1
1
1
1
0
0
0
1
1
1
0
1
1
1
0
0
1
1
1
0
0
1
1
0
1
1
0
1
1
0
1
0
0
1
0
0
0
1
1
1
1
0
1
1
1
0
1
0
0
0
0
1
1
1
1
1
1
1
0
0
1
1
0
1
1
0
1
0
0
1
0
0
1
0
1
0
0
0
0
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
1
1
0
1
0
1
1
0
1
0
0
1
0
1
1
1
1
0
0
0
1
1
0
1
1
1
1
0
0
0
0
1
1
1
0
1
1
0
1
0
0
1
0
0
0
0
1
0
0
1
0
0
1
0
1
1
1
0
1
0
1
0
1
1
1
1
0
1
0
0
1
0
1
1
0
0
1
0
1
1
0
0
1
0
0
1
0
1
1
0
0
0
1
1
1
0
0
1
0
1
0
1
0
1
0
0
0
0
1
0
1
1
1
0
1
0
1
1
1
1
1
1
0
0
0
1
1
1
1
1
0
0
1
1
1
0
1
0
1
1
1
0
0
0
0
1
1
1
1
1
1
1
0
1
0
1
1
1
1
1
1
1
0
1
1
1
1
1
0
1
1
0
1
1
1
0
1
0
1
1
1
1
0
1
1
0
0
1
0
0
1
0
0
1
1
0
0
0
0
1
0
0
1
0
1
1
0
0
1
1
0
0
0
1
1
0
0
0
1
1
0
0
1
0
0
1
1
1
1
1
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
1
1
0
0
0
1
1
1
0
1
1
0
1
0
0
1
1
0
1
0
0
0
0
1
0
1
0
0
1
1
1
1
0
0
1
0
1
0
1
0
0
1
0
1
0
0
1
0
1
0
1
1
0
0
0
1
1
0
1
1
1
1
1
0
1
1
1
0
0
0
1
1
1
1
1
1
1
1
1
1
0
1
1
1
0
0
1
0
0
1
1
1


In [30]:
# convert to dataframe
predicted_own_df = pd.DataFrame(own_predicted)

In [31]:
df_sentence['own_predicted'] = predicted_own_df.values

In [32]:
df_sentence

Unnamed: 0,tweet,tweet_embedding,data_predicted,own_predicted
0,Real Estate Market would crash if there is no ...,"[0.1891789287, -0.0161942858, 0.626776576, -1....",POSITIVE,NEGATIVE
1,"Concur. My company was 100% ""you MUST work in ...","[-0.3543173969, -0.3487285674, 0.2804761231000...",POSITIVE,NEGATIVE
2,Why not ask if we really need that thing? I th...,"[-0.7401492596, -0.3735173047, 0.1090161577, -...",NEGATIVE,NEGATIVE
3,"Dear Line Managers, Appraisal your subordinate...","[-0.6507265568, 0.33605423570000004, 0.3439847...",NEGATIVE,NEGATIVE
4,I have had more opportunities to work cross-fu...,"[-0.2743485272, -0.29202288390000003, 0.711124...",NEGATIVE,POSITIVE
...,...,...,...,...
2995,I am available if a remote work opportunity ar...,"[-0.7216980457000001, -0.13284055890000002, -0...",NEGATIVE,POSITIVE
2996,It made almost everyone work with limited reso...,"[-0.2861507535, -0.3813780248, 0.8702679873, 0...",POSITIVE,POSITIVE
2997,I started moving things into my new office ups...,"[-0.5724875927, 0.11444271360000001, 0.7665416...",POSITIVE,POSITIVE
2998,DYK that we have spent more than 5.5 billion m...,"[-0.5358973145, -0.4356870949, 0.8506852984000...",NEGATIVE,NEGATIVE


# 3 - Save Vader score into the df

In [33]:
df_merge = df_sentence.merge(df_vader, how = 'inner' ,indicator=False)

In [39]:
df_merge

Unnamed: 0,tweet,tweet_embedding,data_predicted,own_predicted,VaderSentiment
0,Real Estate Market would crash if there is no ...,"[0.1891789287, -0.0161942858, 0.626776576, -1....",POSITIVE,NEGATIVE,NEGATIVE
1,"Concur. My company was 100% ""you MUST work in ...","[-0.3543173969, -0.3487285674, 0.2804761231000...",POSITIVE,NEGATIVE,NEGATIVE
2,Why not ask if we really need that thing? I th...,"[-0.7401492596, -0.3735173047, 0.1090161577, -...",NEGATIVE,NEGATIVE,POSITIVE
3,"Dear Line Managers, Appraisal your subordinate...","[-0.6507265568, 0.33605423570000004, 0.3439847...",NEGATIVE,NEGATIVE,POSITIVE
4,I have had more opportunities to work cross-fu...,"[-0.2743485272, -0.29202288390000003, 0.711124...",NEGATIVE,POSITIVE,POSITIVE
...,...,...,...,...,...
2948,I am available if a remote work opportunity ar...,"[-0.7216980457000001, -0.13284055890000002, -0...",NEGATIVE,POSITIVE,POSITIVE
2949,It made almost everyone work with limited reso...,"[-0.2861507535, -0.3813780248, 0.8702679873, 0...",POSITIVE,POSITIVE,POSITIVE
2950,I started moving things into my new office ups...,"[-0.5724875927, 0.11444271360000001, 0.7665416...",POSITIVE,POSITIVE,POSITIVE
2951,DYK that we have spent more than 5.5 billion m...,"[-0.5358973145, -0.4356870949, 0.8506852984000...",NEGATIVE,NEGATIVE,POSITIVE


# 4 - Print tweets and compare predictions

In [65]:
listString = ['POSITIVE', 'POSITIVE', 'POSITIVE']
listString.count(listString[0]) == len(listString)

True

In [69]:
count = 0
for i in range(len(df_merge)):
    sentiment_all_models = []
    sentiment_all_models.append(df_merge.iloc[i].data_predicted)
    sentiment_all_models.append(df_merge.iloc[i].own_predicted)
    sentiment_all_models.append(df_merge.iloc[i].VaderSentiment)

    are_same = sentiment_all_models.count(sentiment_all_models[0]) == len(sentiment_all_models)  
    if are_same:
        print(i)
        # 1. print tweets
        print(df_merge.iloc[i].tweet)
    
        # 2. print prediciton by data
#         print('data prediction: ', df_merge.iloc[i].data_predicted)
    
        # 3. print prediciton by own cluster
        print('own prediction: ', df_merge.iloc[i].own_predicted)
        
         # 3. print Vader Prediction
        print('Vader prediction: ', df_merge.iloc[i].VaderSentiment)
    
        print('\n')
        count+=1

0
Real Estate Market would crash if there is no demand for commercial space. Hybrid work / Remote work works . If we design for it. For decades, Office space worked as space to socialize with fellow human beings.
own prediction:  NEGATIVE
Vader prediction:  NEGATIVE


1
Concur. My company was 100% "you MUST work in the office" and now they have said that is gone. More importantly many of our leaders have moved remote and we have hired remotely. That is a genie that is REALLY hard to put back in the bottle.
own prediction:  NEGATIVE
Vader prediction:  NEGATIVE


4
I have had more opportunities to work cross-functionally and engage with company leadership in the past year than in the Before Times. Remote has flattened the org and led to better collaboration. But, sure, keep doing things the old way.
own prediction:  POSITIVE
Vader prediction:  POSITIVE


6
As a remote employee, you may be tempted to check work emails or do work tasks during off hours. it is important to NOT do this. Just

<span style="background-color:Teal">There are 1500 tweets out of 3000 with unmatched sentiment.<span>


<span style="background-color:Teal">Own prediction is better at:
#1332
#1333 
#1331
#1328
#1325
#8
<span>

<span style="background-color:Teal">Data prediction is better at:
#1330 #1326
<span>
<span style="background-color:Teal">Overall using our own embedded cluster gives better result<span>   
<span style="background-color:Teal">Let's save the negative tweets and use BERT again to extract topic<span>    

There are many negative tweet about remote work but are actually advocates of remote work. The tweets are negative because they are criticising compaies who don't allow them to do remote work