In [1]:
# Import libraries

import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from sklearn.manifold import TSNE
import scipy
from gensim.models.word2vec import Word2Vec
import re
import random
import plotly.express as px
import plotly.graph_objects as go



#sotpwords removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Prepare data

In [2]:
# Import data
claims_df = pd.read_csv('data/postprocessed/claims.csv')
# delete nans in the claims and answer column
claims_df = claims_df.dropna(subset=['claim'])
# build a vector for the claims
claims_str = np.array(claims_df['claim'])
claims_str

array(['Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.',
       'Donald Trump delivered the largest tax cuts in American history.',
       'In Nigeria … in terms of revenue share, 20% goes to the local government.',
       ...,
       'Matt Hancock owns the company responsible for supplying the NHS.',
       '£1 billion was given to 13 companies for PPE but no PPE has been supplied.',
       'Mixing one tablespoon of 2% iodine and one cup of baby oil will create a substance that removes body hair.'],
      dtype=object)

In [3]:
# remove newline character
claims_str = [claim.replace('\n', '') for claim in claims_str]

In [4]:
# download stopwords
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Gloria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Gloria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# we have to remove the possessives since we noticed they appeared frequently in the claims
def remove_stopwords_and_possessive(tokens):
    stop_words = set(stopwords.words('english'))
    processed_tokens = []
    
    for token in tokens:
        # Remove trailing "'s"
        token = re.sub(r"'s\b", '', token)
        
        if token.lower() not in stop_words:
            processed_tokens.append(token)
    
    return processed_tokens

In [6]:
processed_sentences = []
    
for sentence in claims_str:
       
    word_tokens = word_tokenize(sentence)
    word_tokens = [word.lower() for word in word_tokens if word.isalnum()]
    processed_tokens = remove_stopwords_and_possessive(word_tokens)
    
    processed_sentence = ' '.join(processed_tokens)
    processed_sentences.append(processed_sentence)
processed_sentences

['hunter biden experience ukraine energy sector joined board burisma',
 'donald trump delivered largest tax cuts american history',
 'nigeria terms revenue share 20 goes local government',
 'biden pledged stop border wall construction give amnesty health care illegal immigrants',
 'police shooting jacob blake gov tony evers gov mandela barnes call peace encourage calm',
 'common law admission test clat 2020 conducted september 7 2020 planned',
 '35 revenue goes states nigeria',
 'margaret sanger racist believed eugenics goal founding planned parenthood eradicate minorities',
 'joe biden voted irag war supported war serbia syria libya',
 'biden take away second amendment',
 'biden pledged defund police',
 '45 nigeria revenue goes federal government',
 'donald trump inherited stagnant economy rebuilt',
 'joe biden voted iraq war supported wars serbia syria libya',
 'president muhammadu buhari sign new police bill allow officers arrest without warrant',
 'plant seeds restructure dna human

In [7]:
"""
Remove non letter chatacters
Lowercase
Tkenize based on lower space
Remove sentences with less than 2 tokens
"""
tokenized_sentences = [re.sub('\W', ' ', sentence).lower().split() for sentence in processed_sentences]
# remove sentences that are only 1 word long
tokenized_sentences = [sentence for sentence in tokenized_sentences if len(sentence) > 1]

for sentence in tokenized_sentences[:10]:
    print(sentence)

['hunter', 'biden', 'experience', 'ukraine', 'energy', 'sector', 'joined', 'board', 'burisma']
['donald', 'trump', 'delivered', 'largest', 'tax', 'cuts', 'american', 'history']
['nigeria', 'terms', 'revenue', 'share', '20', 'goes', 'local', 'government']
['biden', 'pledged', 'stop', 'border', 'wall', 'construction', 'give', 'amnesty', 'health', 'care', 'illegal', 'immigrants']
['police', 'shooting', 'jacob', 'blake', 'gov', 'tony', 'evers', 'gov', 'mandela', 'barnes', 'call', 'peace', 'encourage', 'calm']
['common', 'law', 'admission', 'test', 'clat', '2020', 'conducted', 'september', '7', '2020', 'planned']
['35', 'revenue', 'goes', 'states', 'nigeria']
['margaret', 'sanger', 'racist', 'believed', 'eugenics', 'goal', 'founding', 'planned', 'parenthood', 'eradicate', 'minorities']
['joe', 'biden', 'voted', 'irag', 'war', 'supported', 'war', 'serbia', 'syria', 'libya']
['biden', 'take', 'away', 'second', 'amendment']


# Train W2V
Vector size usually ranges from 100 to 300, we choose 100 to reduce the model complexity and speed up the training. We reduce the window size to 5 
since we have short sentences. 

In [8]:
model = Word2Vec(tokenized_sentences, vector_size=100, min_count=5, window=5)

In [9]:
term = 'covid'

model.wv.most_similar(term)

[('million', 0.9903276562690735),
 ('state', 0.9902217388153076),
 ('nigeria', 0.9900514483451843),
 ('country', 0.9899704456329346),
 ('2018', 0.9899702668190002),
 ('years', 0.9899058938026428),
 ('government', 0.9898313283920288),
 ('launched', 0.989829421043396),
 ('world', 0.989748477935791),
 ('house', 0.9896952509880066)]

## Visualization with t-SNE

In [27]:
# sample of 50 words most close to the term 'covid'
term_1 = 'covid'
term_2='nigeria'
term_3='trump'
covid_sample=model.wv.most_similar(term_1, topn=100)
covid_terms= [tup[0] for tup in covid_sample]

nigeria_sample=model.wv.most_similar(term_2, topn=100)
nigeria_terms= [tup[0] for tup in nigeria_sample]

trump_sample=model.wv.most_similar(term_3, topn=100)
trump_terms= [tup[0] for tup in trump_sample]


In [28]:
covid_vectors = model.wv[covid_terms]
nigeria_vectors = model.wv[nigeria_terms]
trump_vectors = model.wv[trump_terms]


In [29]:
tsne = TSNE(n_components=3, n_iter=2000)
tsne_embedding_covid = tsne.fit_transform(covid_vectors)
tsne_embedding_nigeria = tsne.fit_transform(nigeria_vectors)
tsne_embedding_trump = tsne.fit_transform(trump_vectors)

In [30]:
combined_embeddings = [
    (tsne_embedding_covid, 'covid'),
    (tsne_embedding_nigeria, 'nigeria'),
    (tsne_embedding_trump, 'trump')
]
x_data = []
y_data = []
z_data = []
colors = []

for embedding, label in combined_embeddings:
    x_data.extend(embedding[:, 0])
    y_data.extend(embedding[:, 1])
    z_data.extend(embedding[:, 2])
    if label == 'covid':
        colors.extend(['red'] * len(embedding))
    elif label == 'nigeria':
        colors.extend(['green'] * len(embedding))
    elif label == 'trump':
        colors.extend(['blue'] * len(embedding))

In [31]:
fig = go.Figure(data=[go.Scatter3d(
    x=x_data,
    y=y_data,
    z=z_data,
    mode='markers',
    marker=dict(
        size=5,
        color=colors,  # assign colors based on identifiers
        opacity=0.8
    )
)])

fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

fig.show()