In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import time
from pathlib import Path
from glob import glob
from datetime import datetime
import spacy

# Load the spaCy "en_core_web_sm" pipeline once; the preprocessing cell below
# reuses it through `nlp.pipe`.
nlp = spacy.load("en_core_web_sm")

In [2]:

# Load the quotes table. A pickled copy is written the first time the
# bz2-compressed JSON-lines dump is parsed and reused on later runs
# (presumably because re-parsing the dump is slow).
# NOTE(review): unpickling can execute arbitrary code; only reuse a cache file
# that this notebook produced itself.
quotes_cache = Path("./data/df_quotes.pickle")
if quotes_cache.is_file():
    df_quotes = pd.read_pickle(quotes_cache)
else:
    df_quotes = pd.read_json(
        "data/natural_disaster_quotes.json.bz2", lines=True, compression="bz2"
    )
    df_quotes.to_pickle(quotes_cache)

# A single f-string instead of an f-string without placeholders combined with
# %-formatting; the printed text is unchanged.
print(f"Number of quotes: {len(df_quotes)}")


Number of quotes: 1480711


In [3]:

# Turn the parquets into exploded pickles.
# Each parquet part is exploded to one row per alias, restricted to the aliases
# that appear as a speaker in `df_quotes`, and stored as its own pickle.
# Parts whose pickle already exists are skipped, so the cell can be re-run.
chunk_dir = Path("data/speaker_chunks")
# `to_pickle` does not create missing directories, so make sure the target
# exists before the first write (e.g. on a fresh checkout).
chunk_dir.mkdir(parents=True, exist_ok=True)

files = sorted(glob("data/speaker_attributes.parquet/part*"))
for i, f in enumerate(files):
    target = chunk_dir / f"chunk_{i}.pickle"
    if target.is_file():
        continue
    chunk = pd.read_parquet(f, engine="pyarrow").explode("aliases")
    chunk = chunk[chunk.aliases.isin(df_quotes.speaker)]
    chunk.to_pickle(target)

In [4]:

# Regroup the chunks into one.
# All pickled parts are read first and concatenated in a single call:
# `DataFrame.append` copied the growing frame on every iteration and is no
# longer available in pandas >= 2.0. Row indices of the parts are kept as-is.
files = sorted(glob("data/speaker_chunks/chunk*"))
frames = [pd.read_pickle(f) for f in files]
# `pd.concat` raises on an empty list, so fall back to an empty frame, which is
# what the previous append loop produced when no chunk file was found.
chunks = pd.concat(frames) if frames else pd.DataFrame()

chunks.head(2)

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
3,President George W. Bush,[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
3,George Bush,[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"


In [5]:

# Load the Wikidata nodes: a bz2-compressed CSV read with its "QID" column
# as the index.
labels_file = "data/wikidata_labels_descriptions_quotebank.csv.bz2"
df_labels = pd.read_csv(labels_file, index_col="QID", compression="bz2")
df_labels.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


In [6]:

# Turn the wikinodes into human readable labels
cols = [
    "nationality",
    "occupation",
    "party",
    "academic_degree",
    "candidacy",
    "religion",
]

# One plain dict lookup per QID instead of a `df_labels.loc[q]["Label"]` call.
# If a QID appeared more than once in `df_labels`, the last label would be used.
qid_to_label = df_labels["Label"].to_dict()

for col in cols:
    print(col)
    # Missing attribute lists (None) are left untouched. A value with no entry
    # in `df_labels` is kept as-is instead of raising KeyError, which also makes
    # re-running this cell on already translated columns harmless.
    chunks[col] = chunks[col].apply(
        lambda qids: [qid_to_label.get(q, q) for q in qids]
        if qids is not None
        else qids
    )

# `speakers` is the same object as `chunks` (no copy), as before.
speakers = chunks
# Last expression of the cell so that the preview is actually displayed.
speakers.head(2)

nationality
occupation
party
academic_degree
candidacy
religion


In [7]:
# Preview the first rows of the quotes table.
df_quotes.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2020-01-13-009373,But he said `you're responsible. You make peop...,Tom Elliott,"[Q16196918, Q21461424, Q7815715, Q8952291]",2020-01-13 05:00:53,1,"[[Tom Elliott, 0.5795], [None, 0.4205]]",[https://www.3aw.com.au/tom-elliott-calls-for-...,E
1,2020-03-11-014404,Everything that went wrong was always my fault...,Kate James,[Q56379605],2020-03-11 13:04:58,2,"[[Kate James, 0.8144], [None, 0.1397], [Amber ...",[https://calgarysun.com/entertainment/celebrit...,E
2,2020-04-10-011917,"Great guy, messiest desk I ever saw, like a to...",,[],2020-04-10 04:00:00,3,"[[None, 0.7283], [Brad Harris, 0.2717]]",[http://newstimes.com/news/coronavirus/article...,E
3,2020-01-18-012600,How will they attend to the problems of the pe...,Krishna Byre Gowda,[Q6437387],2020-01-18 18:13:36,1,"[[Krishna Byre Gowda, 0.89], [None, 0.11]]",[https://www.thehindu.com/news/national/karnat...,E
4,2020-02-21-031866,I'm sure you hear it every day but you are smo...,,[],2020-02-21 21:12:31,1,"[[None, 0.9031], [Donatella Versace, 0.0969]]",[http://feeds.inquisitr.com/~r/google/yDYq/~3/...,E


In [8]:
# Get the quotes from the people whose occupation is researcher.
# `occupation` holds a list of labels (or None); a speaker qualifies when the
# exact label "researcher" is in that list.
is_researcher = speakers.occupation.apply(
    lambda l: "researcher" in l if l is not None else False
)
researchers = speakers[is_researcher].aliases
res_quotes = df_quotes[df_quotes.speaker.isin(researchers)]
# One quotation per line.
res_text = res_quotes.quotation.str.cat(sep="\n")

# Write with an explicit encoding: the platform default may not be able to
# represent every character in the quotes, and the file is read back later by
# NLTK's PlaintextCorpusReader, whose default encoding is UTF-8.
with open("data/researcher_book.txt", "w", encoding="utf-8") as f:
    f.write(res_text)


## Topic detection pipeline

In [9]:
from nltk.corpus import PlaintextCorpusReader
# Corpus reader over the "data" directory, restricted to the file
# "researcher_book.txt"; later cells use `books.fileids()` and `books.sents()`.
books = PlaintextCorpusReader("data", "researcher_book.txt")

In [10]:
def get_chunks(l, n):
    """Lazily split ``l`` into consecutive slices of length ``n``.

    Slices are produced in order; the last one is shorter than ``n`` when
    ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start : start + n] for start in starts)

# Get the chunks again (into smaller chunks)
# NOTE: `chunks` is rebound here from the speaker table to a list of strings.
book_id = {name: idx for idx, name in enumerate(books.fileids())}  # file id -> integer label
chunks = []
chunk_class = []  # for every kept chunk, the label of the book it came from (for evaluation)

limit = 60  # how many chunks total
size = 50  # how many sentences per chunk/page

for f in books.fileids():
    sentences = books.sents(f)
    print(f)
    print("Number of sentences:", len(sentences))

    # pages of `size` sentences; every sentence is itself a list of tokens
    chunks_of_sents = list(get_chunks(sentences, size))

    # flatten each page into one space-separated string
    chs = [
        " ".join(token for sentence in page for token in sentence)
        for page in chunks_of_sents
    ]
    print("Number of chunks:", len(chs), "\n")

    # keep at most `limit` pages per book, and record their origin
    kept = chs[:limit]
    chunks.extend(kept)
    chunk_class.extend([book_id[f]] * len(kept))



researcher_book.txt
Number of sentences: 137120
Number of chunks: 2743 



In [11]:

STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

# Turn every text chunk into a list of tokens: filtered lemmas followed by the
# text of the entities that span more than one token.
docs = []
for parsed in nlp.pipe(chunks, batch_size=10):
    # lemmas of alphabetic tokens that spaCy does not flag as stop words
    lemmas = [tok.lemma_ for tok in parsed if tok.is_alpha and not tok.is_stop]
    # second filter on the lemma itself: longer than 2 characters, not a stop word
    tokens = [
        lemma for lemma in lemmas if len(lemma) > 2 and lemma not in STOP_WORDS
    ]
    # `len(span)` counts tokens, so single-token entities are skipped
    tokens += [str(span) for span in parsed.ents if len(span) > 1]

    docs.append(tokens)

In [12]:
from gensim.models.phrases import Phrases

# Fit a gensim Phrases model on all tokenised documents.
bigrams = Phrases(docs)

# Append to each document every token of its phrase-transformed version that
# contains "_" (presumably the joined bigrams). Existing tokens are kept.
for doc in docs:
    joined = [token for token in bigrams[doc] if "_" in token]
    doc.extend(joined)


In [13]:

from gensim.corpora import Dictionary
# Vocabulary over all documents.
dictionary = Dictionary(docs)

max_freq = 0.6  # passed to filter_extremes as `no_above`
min_wordcount = 3  # passed to filter_extremes as `no_below`

# Prune the vocabulary in place using the two thresholds above.
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)

# Bag-of-words representation of every document, in the order of `docs`.
corpus = list(map(dictionary.doc2bow, docs))

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 2326
Number of chunks: 60


In [14]:

# models
from gensim.models import LdaMulticore

# Training settings; the keys match LdaMulticore keyword names so the dict can
# be unpacked directly into the constructor call.
params = {"passes": 10, "random_state": 0}
base_models = {}  # not used by this cell
model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    workers=6,
    **params,
)


In [15]:
# Show the 5 highest-weighted words of each topic of the fitted model.
model.show_topics(num_words=5)

[(0,
  '0.004*"home" + 0.003*"little" + 0.003*"family" + 0.003*"week" + 0.003*"Australia"'),
 (1,
  '0.003*"road" + 0.003*"hard" + 0.003*"home" + 0.003*"try" + 0.003*"build"'),
 (2,
  '0.004*"rain" + 0.003*"burn" + 0.003*"try" + 0.003*"home" + 0.003*"damage"')]

In [16]:

import pyLDAvis.gensim_models
# plot topics
# Prepare the visualisation data from the fitted model, the bag-of-words corpus
# and the dictionary, then display it in the notebook output.
data =  pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
