In [150]:
import torch
import string
import numpy as np
import itertools as it
from transformers import BertTokenizer, BertModel, BertConfig
import pandas as pd
from misinformation_classifier import read_data, process_docs
from scipy.spatial.distance import cosine as cossim

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [145]:
!python -m spacy download en_core_web_lg
import spacy
import en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 13.2 MB/s eta 0:00:01     |█████████████████████████▊      | 665.4 MB 14.9 MB/s eta 0:00:11
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180940 sha256=9cb50473fe5d9b553b99947d7848092a6b7fd9bf33ab1456fd1c8b430714dcb3
  Stored in directory: /private/var/folders/tn/2pq1hztd2lb_n9llwz1gjy2h0000gn/T/pip-ephem-wheel-cache-_gocgcng/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load(

In [181]:
class Embedding(object):
    """
    Extend this class and overide get_embedding method with whatever is used to
    generate the embedding
    """
    def get_embedding(self, sentence):
        pass

class BERTEmbedding(Embedding):

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def get_embedding(self, sentence, max_len=64):
        """
        Input formtting for BERT
        """
        encoded_dict = self.tokenizer.encode_plus(
            sentence, 
            add_special_tokens = True,    # Add '[CLS]' and '[SEP]'
            max_length = max_len,         # Pad & truncate all sentences.
            pad_to_max_length = False,
            return_attention_mask = False, # Construct attn. masks.
            return_tensors = 'pt',        # Return pytorch tensors.
        )
        
        """
        Creating word/sentence vectors
        """
        # Predict hidden states features for each layer
        with torch.no_grad():
          hidden_states = self.model(encoded_dict['input_ids'])[2]

        token_embeddings = torch.stack(hidden_states, dim=0)   # (12, batch_size, token_length, embedding_size)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)  # (token_length, 12, embedding_size (768))

        """
        Sentence vectors
        """
        token_vecs = hidden_states[-4:][0]
        sentence_embedding = torch.mean(token_vecs, dim=1)
        sentence_embedding = torch.mean(sentence_embedding, dim=0)

        return sentence_embedding.numpy()
    
class Word2Vec(Embedding):
    def __init__(self, model):
        self.model = model.load()

    def get_embedding(self, sentence):
        wordvecs = []
        for token in self.model(sentence):
            wordvecs.append(token.vector)

        vec_arr = np.array(wordvecs)
        avg_vec = np.mean(vec_arr, axis=0)
        assert len(wordvecs[0]) == avg_vec.shape[0]

        return avg_vec


In [182]:
config = BertConfig.from_pretrained('model_save/', output_hidden_states=True)
model = BertModel.from_pretrained('model_save/', config=config)
tokenizer = BertTokenizer.from_pretrained('model_save/')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [183]:
embedding = BERTEmbedding(model, tokenizer)
# embedding = Word2Vec(en_core_web_lg)

In [184]:
remove = string.punctuation
remove = remove.replace('/', '')

with open('latest_ideas_misinformation.csv', 'r') as f:
    lines = f.readlines()
lines = [line.split('\t')[-1].strip() for line in lines[1:]]
misinformation = pd.DataFrame({'sentence': lines})
misinformation.sentence = misinformation.apply(lambda x: x.sentence.translate({ord(c): None for c in remove}), axis=1)
misinformation['embedding'] = misinformation.apply(lambda x: embedding.get_embedding(x.sentence), axis=1)
misinformation.head()

Unnamed: 0,sentence,embedding
0,gargling with bleach will prevent/cure – also ...,"[-0.39152414, -0.6630647, 0.34272134, -0.05833..."
1,drinking corona beer will prevent/cure – also ...,"[-0.32701516, -0.5514859, 0.13181137, -0.17519..."
2,taking acetic acid will prevent/cure,"[-0.32734972, -0.20340192, 0.24712618, -0.0114..."
3,taking steroids will prevent/cure,"[-0.22360928, -0.2570151, 0.10327525, -0.15645..."
4,taking colloidal silver will cure,"[-0.38702664, -0.42426723, 0.11841856, 0.00288..."


In [188]:
with open('quorona.txt', 'r') as f:
    lines = f.readlines()
lines = [line.strip() for line in lines]
df = pd.DataFrame({'sentence': lines})
df = df.sample(100)
df.sentence = df.apply(lambda x: x.sentence.translate({ord(c): None for c in remove}), axis=1)
df['embedding'] = df.apply(lambda x: embedding.get_embedding(x.sentence), axis=1)
df.head()

Unnamed: 0,sentence,embedding
3845,will coronavirus survive in high temperature,"[-0.12987584, -0.28084862, 0.28179958, -0.0215..."
3362,what is coronavirus and what happens now it is...,"[-0.13366845, -0.3387423, 0.07689589, -0.05812..."
1895,does coronavirus cause a sore throat,"[-0.18242365, -0.4832562, 0.08941638, 0.153969..."
3808,will coronavirus end with warm weather,"[-0.3298868, -0.33108774, 0.52107453, 0.268209..."
6085,how long does coronavirus live on surfaces chart,"[0.37919047, -0.55202806, -0.16150321, 0.05773..."


In [189]:
def get_match(embedding):
    sims = [cossim(embedding, e) for e in misinformation.embedding]
    return '{},{}'.format(misinformation.loc[np.argmax(sims), 'sentence'], max(sims))

In [190]:
df['temp'] = df.apply(lambda x: get_match(x.embedding), axis=1)
df['match'] = df.apply(lambda x: x.temp.split(',')[0], axis=1)
df['score'] = df.apply(lambda x: x.temp.split(',')[1], axis=1)
df.drop(columns=['temp', 'embedding'], inplace=True)
df = df.sort_values(by=['score'], ascending=False)
df

Unnamed: 0,sentence,match,score
1873,does coronavirus affect children,Covid 19 is a normal flu and is no more danger...,0.6653649210929871
1167,coronavirus medicine vaccine,Elderly people in Brazil who are caught wander...,0.6392353773117065
6874,what is the cause for coronaviruses,Covid 19 is a normal flu and is no more danger...,0.6323185861110687
6230,influenza a coronavirus,This is “fake news” invented by Trump to stren...,0.6240950226783752
4726,coronavirus is there a vaccine,It was caused by an infected rat biting a stud...,0.6180782616138458
6244,is coronavirus a pandemic now,This is “fake news” invented by Trump to stren...,0.6047084331512451
2220,how coronavirus effects the body,Covid 19 is a normal flu and is no more danger...,0.5944984257221222
2649,how the coronavirus is caused,Covid 19 is a normal flu and is no more danger...,0.5934285819530487
3569,what states have coronavirus in the usa,Covid 19 is a normal flu and is no more danger...,0.5929408669471741
5858,how coronavirus transmitted in humans,Covid 19 is a normal flu and is no more danger...,0.587576150894165
