In [1]:
import pandas as pd
import numpy as np
from laservec import LASER
import html
import pickle
from mediawiki import MediaWiki
from unidecode import unidecode

# Address handed to `LASER(url)` in the embedding cell further down.
url = "localhost:8100"

# Shared MediaWiki client; get_author_summary() reads it as a global.
wikipedia = MediaWiki()

In [2]:
data_file = "sample_data/political_social_media.csv"
df = pd.read_csv(data_file)

# Decode HTML entities *before* transliterating to ASCII. Doing it the other
# way round lets html.unescape() re-introduce non-ASCII characters
# (e.g. "&eacute;" or "&nbsp;") after unidecode() has already run.
df["author"] = df["label"].map(
    lambda lbl: unidecode(html.unescape(lbl)).replace("From:", "").strip()
)
df["objective"] = df["message"]

# Prefix every post with its (already cleaned) author and its source, then
# normalise the combined string the same way as the author label.
df["text"] = "Author: " + df["author"] + "\nSource: " + df["source"] + "\n" + df["text"]
df["text"] = df["text"].map(lambda t: unidecode(html.unescape(t)))

# Keep only the columns used by the rest of the notebook.
df = df[["author", "text", "objective", "bias"]]
df.head()

Unnamed: 0,author,text,objective,bias
0,Trey Radel (Representative from Florida),Author: Trey Radel (Representative from Florid...,policy,partisan
1,Mitch McConnell (Senator from Kentucky),Author: Mitch McConnell (Senator from Kentucky...,attack,partisan
2,Kurt Schrader (Representative from Oregon),Author: Kurt Schrader (Representative from Ore...,support,neutral
3,Michael Crapo (Senator from Idaho),Author: Michael Crapo (Senator from Idaho)\nSo...,policy,neutral
4,Mark Udall (Senator from Colorado),Author: Mark Udall (Senator from Colorado)\nSo...,policy,partisan


In [4]:
def get_author_summary(author: str, client=None, retries: int = 3) -> "str | None":
    """Look up `author` and return the summary of the top search hit.

    Args:
        author: Free-text query passed to the client's ``search``.
        client: Object exposing ``search(query, results=...)`` and
            ``page(title)``; defaults to the notebook-level ``wikipedia``
            client when omitted.
        retries: Maximum number of attempts when a lookup raises.

    Returns:
        The ``summary`` attribute of the first matching page, or ``None`` when
        the search has no hits or every attempt raised.
    """
    wiki = wikipedia if client is None else client

    for _ in range(retries):
        try:
            hits = wiki.search(author, results=1)
            if not hits:
                # A clean "no results" answer is final; repeating the same
                # query only costs more network round-trips.
                return None
            page = wiki.page(hits[0])
            return page.summary if page else None
        except Exception:
            # Best-effort lookup: retry on library/network errors. Catching
            # Exception (not a bare except) lets KeyboardInterrupt through, so
            # the long-running caller loop can still be interrupted.
            continue
    return None

In [5]:
%%time

unique_authors = df["author"].unique()
author_summary = { a:"" for a in unique_authors }

for author in author_summary.keys():     
    summary = get_author_summary(author)  
    if summary:
        author_summary[author] = summary

CPU times: user 12.6 s, sys: 331 ms, total: 12.9 s
Wall time: 17min 25s


In [10]:
# Authors whose entry in author_summary is still empty.
author_with_no_summary = [name for name, text in author_summary.items() if not text]
author_with_no_summary

['Michael Turner (Representative from Ohio)',
 'John Tierney (Representative from Massachusetts)',
 'William Thornberry (Representative from Texas)',
 'Luis Gutierrez (Representative from Illinois)',
 'RubI(c)n Hinojosa (Representative from Texas)',
 "Gregorio 'Kilili' Sablan (Representative from NA)",
 'Nydia VelIAzquez (Representative from New York)']

In [13]:
# Author labels that came back without a summary, mapped to the alternative
# name to search for instead.
author_fixed = {
    "Michael Turner (Representative from Ohio)":
        "Michael Ray Turner",
    "John Tierney (Representative from Massachusetts)":
        "John F. Tierney",
    "William Thornberry (Representative from Texas)":
        'William McClellan "Mac" Thornberry',
    "Luis Gutierrez (Representative from Illinois)":
        "Luis Vicente Gutierrez",
    "RubI(c)n Hinojosa (Representative from Texas)":
        "Ruben Eloy Hinojosa",
    "Gregorio 'Kilili' Sablan (Representative from NA)":
        "Gregorio Kilili Camacho Sablan",
    "Nydia VelIAzquez (Representative from New York)":
        "Nydia Margarita Velazquez Serrano",
}

# Retry each lookup under the alternative name, but store the result under the
# original label so it still lines up with df["author"].
for label, search_name in author_fixed.items():
    fetched = get_author_summary(search_name)
    if fetched:
        author_summary[label] = fetched

In [14]:
# Re-check which authors still have an empty summary.
author_with_no_summary = [name for name, text in author_summary.items() if not text]
author_with_no_summary

[]

In [15]:
# Persist the summaries, skipping the write when the dict is empty.
if author_summary:
    with open("author_summary.pkl", "wb") as out_file:
        pickle.dump(author_summary, out_file)

In [16]:
%%time

VECTOR_LEN = 1024
rows = len(df)
cols = VECTOR_LEN
text_embeddings = np.zeros((rows, cols))

with LASER(url) as laser:
    for i,row in enumerate(df.itertuples()):
        if not row.text:
            print(f"row {i} has a blank text")
            continue  
            
        text = row.text
        embedding,_ = laser.vectorize(text)
        text_embeddings[i] = embedding
        

CPU times: user 7.97 s, sys: 687 ms, total: 8.66 s
Wall time: 49min 41s


In [18]:
# Save the embedding matrix to disk.
with open("text_embeddings.pkl", "wb") as out_file:
    pickle.dump(text_embeddings, out_file)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between `vec1` and `vec`, each reshaped to a single
# 1024-column row, displayed next to `lang1` and `lang2`.
# NOTE(review): `vec`, `lang1` and `lang2` are never assigned in the cells
# visible here, and `vec1` is only assigned in a later cell — this depends on
# leftover kernel state.
sim = cosine_similarity(vec1.reshape(1,1024), vec.reshape(1,1024))
lang1,lang2, sim.squeeze().item()

In [None]:
# Display `text1` and `text2` as a tuple.
# NOTE(review): neither name is assigned in the cells visible here.
text1,text2

In [None]:
# Reuse the MediaWiki client already bound to `wikipedia` in the setup cell.
# A mid-notebook `import wikipedia` would rebind that global to a different
# library and silently change what get_author_summary() talks to on a re-run.
sum1 = wikipedia.summary("Candice Miller (Representative from Michigan)")

In [None]:
# Display the summary text fetched into `sum1`.
sum1

In [None]:
# Embed the first summary; the second element returned by vectorize() is discarded.
# NOTE(review): `laser` is only bound by the `with LASER(url) as laser:` block
# in the embedding cell, whose context has already exited — confirm the client
# is still usable here.
sum1vec,_ = laser.vectorize(sum1)

In [None]:
# Fetch the summary for a second author label.
# NOTE(review): `wikipedia` resolves to whatever the name is bound to in the
# kernel at this point (the MediaWiki client from the setup cell, or the
# `wikipedia` module if that has been imported since).
sum2 = wikipedia.summary("Mitch McConnell (Senator from Kentucky)")

In [None]:
# Display the summary text fetched into `sum2`.
sum2

In [None]:
# Embed the second summary; the second element returned by vectorize() is discarded.
# NOTE(review): relies on `laser` still being usable after its `with` block
# has exited — confirm.
sum2vec,_ = laser.vectorize(sum2)

In [None]:
# Show the shapes of the two summary embeddings side by side.
sum1vec.shape, sum2vec.shape

In [None]:
# Join each vector with the corresponding summary embedding.
# NOTE(review): `vec1` and `vec2` are only assigned in a later cell, so this
# cell fails on a fresh top-to-bottom run.
vec1c = np.concatenate((vec1, sum1vec))
vec2c = np.concatenate((vec2, sum2vec))

In [None]:
# Cosine similarity between the two summary embeddings, each reshaped to a
# single 1024-column row; squeeze() drops the length-1 axes for display.
sim = cosine_similarity(sum1vec.reshape(1,1024), sum2vec.reshape(1,1024))
sim.squeeze()

In [None]:
# Display `lang`.
# NOTE(review): `lang` is not assigned in the cells visible here.
lang

In [None]:
# Display `vec` converted with tolist().
# NOTE(review): `vec` is not assigned in the cells visible here.
vec.tolist()

In [None]:
# Pull the value stored under the "_embedding" key of `facts`.
# NOTE(review): `facts` is not assigned in the cells visible here.
emb = facts["_embedding"]

In [None]:
# Display the value read from facts["_embedding"].
emb

In [None]:
# Bind the two vectors to compare: `vec` as-is, and `emb` converted to an ndarray.
# NOTE(review): earlier cells already read `vec1`/`vec2`, so the notebook only
# works when cells are run out of order.
vec1 = vec
vec2 = np.array(emb)

In [None]:
# Cosine similarity between `vec1` and `vec2`, each reshaped to a single
# 1024-column row.
sim = cosine_similarity(vec1.reshape(1,1024), vec2.reshape(1,1024))

In [None]:
# Display the similarity as a plain Python scalar.
sim.squeeze().item()