## Mount the Google Drive


In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import libraries and packages

In [None]:
#!pip install pandas==1.0.5

In [None]:
import ast
import bz2
import calendar
import json
import re
from datetime import datetime, date, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.parser import parse

## Quotebank dataset pre-processing

#### Filtering quotes

In [None]:
# Select the quotations that contain the keywords we want to use, then write the selected quotations to CSV files.
def select_quotations(chunk, keywords=("police",)):
  '''Return the rows of `chunk` whose quotation contains at least one keyword.

  Matching is case-insensitive: the `quotation` column of the returned frame
  is lower-cased, and the keywords are lower-cased before matching. The input
  frame itself is left untouched.

  Parameters
  ----------
  chunk : pandas.DataFrame
      Frame with a `quotation` column of strings (missing values allowed).
  keywords : str or iterable of str, optional
      Literal substrings to look for. Defaults to ("police",).

  Returns
  -------
  pandas.DataFrame
      The matching rows, with `quotation` lower-cased. Empty when no keyword
      is given or nothing matches.
  '''
  if isinstance(keywords, str):
    keywords = [keywords]
  keywords = list(keywords)

  # Without keywords there is nothing to select (an empty pattern would match every row).
  if not keywords:
    return chunk.iloc[0:0]

  # Keywords are literal text, so escape them before building the alternation.
  pattern = '|'.join(re.escape(word.lower()) for word in keywords)

  lowered = chunk.assign(quotation=chunk.quotation.str.lower())
  # na=False: rows without a quotation simply do not match.
  matches = lowered.quotation.str.contains(pattern, na=False)
  return lowered[matches]

def good_quotes_to_csv(path_to_file, path_to_out):
  '''Filter a bz2-compressed JSON-lines quote dump and write the kept quotes to a bz2 CSV.

  The input is read in chunks of 10000 rows; every chunk is filtered with
  `select_quotations` and written to `path_to_out`.

  The first chunk overwrites any existing output file and carries the CSV
  header; the following chunks are appended without a header. This keeps a
  single header row in the output and makes the call safe to re-run.

  Parameters
  ----------
  path_to_file : str
      Path to the bz2-compressed JSON-lines input.
  path_to_out : str
      Path of the bz2-compressed CSV to write.
  '''
  df_reader = pd.read_json(path_to_file, lines=True, compression='bz2', chunksize=10000)
  is_first_chunk = True
  for chunk in df_reader:
    select_quotations(chunk).to_csv(
      path_or_buf=path_to_out,
      compression='bz2',
      mode='w' if is_first_chunk else 'a',
      header=is_first_chunk,
    )
    is_first_chunk = False

In [None]:
# Filter the 2020 quote dump and write the selected quotes to data_2020.bz2.
good_quotes_to_csv("/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2",'/content/drive/MyDrive/data_2020.bz2')

In [None]:
# Filter the 2019 quote dump and write the selected quotes to data_2019.bz2.
good_quotes_to_csv("/content/drive/MyDrive/Quotebank/quotes-2019.json.bz2",'/content/drive/MyDrive/data_2019.bz2')

In [None]:
# Filter the 2018 quote dump and write the selected quotes to data_2018.bz2.
good_quotes_to_csv("/content/drive/MyDrive/Quotebank/quotes-2018.json.bz2",'/content/drive/MyDrive/data_2018.bz2')

In [None]:
# Filter the 2017 quote dump and write the selected quotes to data_2017.bz2.
good_quotes_to_csv("/content/drive/MyDrive/Quotebank/quotes-2017.json.bz2",'/content/drive/MyDrive/data_2017.bz2')

In [None]:
# Filter the 2016 quote dump and write the selected quotes to data_2016.bz2.
good_quotes_to_csv("/content/drive/MyDrive/Quotebank/quotes-2016.json.bz2",'/content/drive/MyDrive/data_2016.bz2')

In [None]:
# Filter the 2015 quote dump and write the selected quotes to data_2015.bz2.
good_quotes_to_csv("/content/drive/MyDrive/Quotebank/quotes-2015.json.bz2",'/content/drive/MyDrive/data_2015.bz2')

#### Cleaning quotes

In [None]:
# Creating a DataFrame for each year and cleaning it.

def clean_df(file):
  '''Load one filtered quote file and return a cleaned DataFrame.

  Cleaning steps:
    1. the literal string 'None' is treated as a missing value;
    2. every row with at least one missing value is dropped;
    3. only the first row of each `quoteID` is kept;
    4. the `qids` column, which is stored in the CSV as the text of a Python
       list (e.g. "['Q1', 'Q2']"), is parsed back into real lists so that it
       can be exploded downstream.

  Parameters
  ----------
  file : str
      Path to the CSV file (compression is inferred by pandas from the extension).

  Returns
  -------
  pandas.DataFrame
      The cleaned quotes; must contain a `quoteID` column.
  '''
  dataframe = pd.read_csv(file)
  dataframe = dataframe.replace(to_replace='None', value=np.nan).dropna()
  # drop_duplicates returns a new frame; the result has to be kept.
  dataframe = dataframe.drop_duplicates(subset='quoteID', keep='first')

  if 'qids' in dataframe.columns:
    # literal_eval only evaluates Python literals, so it is safe on file content.
    parsed_qids = dataframe['qids'].apply(
      lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )
    dataframe = dataframe.assign(qids=parsed_qids)
  return dataframe

In [None]:
# Load and clean the filtered 2020 quotes.
clean_data_2020 = clean_df('/content/drive/MyDrive/data_2020.bz2')

In [None]:
# Load and clean the filtered 2019 quotes.
clean_data_2019 = clean_df('/content/drive/MyDrive/data_2019.bz2')

In [None]:
# Load and clean the filtered 2018 quotes.
clean_data_2018 = clean_df('/content/drive/MyDrive/data_2018.bz2')

In [None]:
# Load and clean the filtered 2017 quotes.
clean_data_2017 = clean_df('/content/drive/MyDrive/data_2017.bz2')

In [None]:
# Load and clean the filtered 2016 quotes.
clean_data_2016 = clean_df('/content/drive/MyDrive/data_2016.bz2')

In [None]:
# Load and clean the filtered 2015 quotes.
clean_data_2015 = clean_df('/content/drive/MyDrive/data_2015.bz2')

In [None]:
# Concatenate the yearly dataframes; each year must appear exactly once,
# otherwise all of its quotes are counted twice in every later statistic.
frames = [clean_data_2020, clean_data_2019, clean_data_2018, clean_data_2017, clean_data_2016, clean_data_2015]
quotes_df = pd.concat(frames, sort = False)

# Change quote dates from string to Timestamp type (vectorized conversion).
quotes_df['date'] = pd.to_datetime(quotes_df['date'], format='%Y-%m-%d %H:%M:%S')
quotes_df.head(2)

Unnamed: 0.1,Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,23.0,2020-01-05-000384,"a 25-year-old sikh man, was killed by unidenti...",Harmeet Singh,"['Q16225529', 'Q16228362', 'Q210955', 'Q421580...",2020-01-05 18:29:36,1,"[['Harmeet Singh', '0.5227'], ['None', '0.4229...",['http://www.nagalandpost.com/after-nankana-at...,E
1,93.0,2020-02-16-002091,an expectation of the contact aboriginal and t...,David Elliott,"['Q2051773', 'Q21280660', 'Q24299940', 'Q37031...",2020-02-16 13:00:00,9,"[['David Elliott', '0.6047'], ['None', '0.3953']]",['http://msn.com/en-au/news/australia/systemic...,E


## Load the speaker data

In [None]:
# load speaker attributes
path_to_file = "/content/drive/MyDrive/Project datasets/speaker_attributes.parquet"
every_speaker_attributes = pd.read_parquet(path_to_file, engine = 'pyarrow')

# load wikidata label descriptions
# index_col='QID' makes the QID the index of label_descriptions, so it is not available as a regular column.
path_to_file = "/content/drive/MyDrive/Project datasets/wikidata_labels_descriptions_quotebank.csv.bz2"
label_descriptions = pd.read_csv(path_to_file, compression='bz2', index_col='QID')

## Speaker dataset pre-processing

#### Selecting speakers' attributes only for those having uttered specific quotations

In [None]:
# gets attributes for every quoted speaker
def get_attributes(attributes_df, quotes_df, id_colname):
  '''Return the attribute rows of the speakers that appear in the quotes.

  Parameters
  ----------
  attributes_df : pandas.DataFrame
      Speaker attributes, with the speaker identifier in an `id` column.
  quotes_df : pandas.DataFrame
      Quotes; `id_colname` holds, for each quote, a list-like of speaker ids.
  id_colname : str
      Name of the column of `quotes_df` containing the speaker ids.

  Returns
  -------
  pandas.DataFrame
      An independent copy of the rows of `attributes_df` whose `id` is quoted
      at least once. `attributes_df` itself is not modified.
  '''
  # One entry per (quote, speaker id); empty lists explode to NaN and are dropped.
  quoted_ids = quotes_df[id_colname].explode().dropna().unique()

  # Filter first, then copy: only the (small) selection is duplicated,
  # not the whole attributes table.
  is_quoted = attributes_df['id'].isin(quoted_ids)
  return attributes_df[is_quoted].copy()

In [None]:
# gets attributes for every quoted speaker
speaker_attributes = get_attributes(every_speaker_attributes, quotes_df, 'qids')

# adds the number of times a speaker is quoted in our subset of quotes;
# the count has to come from the quotes (one entry per quote and speaker id),
# grouping the attributes table by id only counts attribute rows
quote_counts = quotes_df.explode('qids')['qids'].value_counts()
speaker_attributes['numOccurrences'] = speaker_attributes['id'].map(quote_counts).fillna(0).astype(int)
# keep a single attribute row per speaker id
speaker_attributes = speaker_attributes.drop_duplicates(subset=['id'])
speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion,numOccurrences
18,"[Namo, Modi, Narendra Bhai, Narendra Damodarda...",[+1950-09-17T00:00:00Z],[Q668],[Q6581097],1395415052,"[Q1282294, Q6889284]",,"[Q82955, Q36180, Q7019111, Q10429346]",[Q10230],,Q1058,Narendra Modi,"[Q63988227, Q63988919, Q16841231, Q16251961, Q...",item,[Q9089],1
399,[Ricky Dene Gervais],[+1961-06-25T00:00:00Z],[Q145],[Q6581097],1395460804,,,"[Q33999, Q36180, Q639669, Q245068, Q28389, Q27...",,,Q23517,Ricky Gervais,,item,,1
27807,[Dave Loebsack],[+1952-12-23T00:00:00Z],[Q30],[Q6581097],1392893478,,L000565,"[Q82955, Q1622272, Q1238570]",[Q29552],,Q771586,David Loebsack,,item,,1
65651,,[+1931-05-07T00:00:00Z],[Q145],[Q6581097],1392376764,,,"[Q2059704, Q7042855]",,,Q2051773,David Elliott,,item,,1
82463,,[+1931-09-24T00:00:00Z],[Q30],[Q6581097],1376504203,,,[Q2526255],,,Q2926392,Bruce Baillie,,item,,1


#### Pre-processing dates and adding age

In [None]:
def year_type(d):
  '''Classify a birth date by the sign prefix of its first entry.

  Returns None when `d` is None, 'AC' when the first date string starts
  with "+", and 'BC' otherwise.
  '''
  if d is None:
    return None
  # Only the sign character of the first date string is inspected.
  return 'AC' if d[0][0] == "+" else 'BC'


def extract_birthday(d):
  '''Return a speaker's birthday as a 'DD-MM-YYYY' string.

  Only the first entry of `d` is used. It is read positionally: characters
  1-4 are the year, 6-7 the month and 9-10 the day (the leading sign
  character is ignored).

  Out-of-range components are repaired so that the result is always a
  valid calendar date:
    * a month outside 1..12 or a day outside 1..31 becomes 1;
    * year 0 becomes year 1;
    * a day past the end of the month is clamped to the last day of that
      month (29 February is kept in leap years).

  Parameters
  ----------
  d : sequence of str or None
      Birth date strings; None means the birthday is unknown.

  Returns
  -------
  str
      The birthday, or the placeholder '01-01-0001' when `d` is None.
  '''
  if d is None:
    return '01-01-0001'

  first = d[0]
  day, month, year = int(first[9:11]), int(first[6:8]), int(first[1:5])

  if month not in range(1, 13):
    month = 1
  if day not in range(1, 32):
    day = 1
  if year == 0:
    year = 1

  # monthrange knows the real month length, leap years included.
  last_day_of_month = calendar.monthrange(year, month)[1]
  day = min(day, last_day_of_month)

  return '{:02d}-{:02d}-{:04d}'.format(day, month, year)

def get_age(date_of_birth, date_of_quotation):
  '''Return the speaker's age in full years on the day of the quotation.

  Returns -1 when the birth date lies after the quotation date, or when
  either date equals the placeholder datetime(1, 1, 1).
  '''
  placeholder = datetime(1, 1, 1)

  # born after the quotation
  if date_of_birth > date_of_quotation:
    return -1
  # one of the two dates is the placeholder
  if date_of_birth == placeholder or date_of_quotation == placeholder:
    return -1

  # one year less if the birthday has not been reached yet in the quotation year
  birthday_pending = (date_of_quotation.month, date_of_quotation.day) < (date_of_birth.month, date_of_birth.day)
  return date_of_quotation.year - date_of_birth.year - birthday_pending

In [None]:
# Derive the era ('AC'/'BC') from the raw birth date, then convert date_of_birth
# to a 'DD-MM-YYYY' string and finally to a datetime object.
# NOTE(review): date_of_birth is overwritten in place, so this cell looks unsafe to run twice
# without re-creating speaker_attributes first (extract_birthday expects the raw value) - confirm.
speaker_attributes['year_type'] = speaker_attributes.date_of_birth.apply(lambda d: year_type(d))
speaker_attributes.date_of_birth = speaker_attributes.date_of_birth.apply(lambda d: extract_birthday(d))
speaker_attributes.date_of_birth = speaker_attributes.date_of_birth.apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))

# for each quotation: get the date of the quote, the speaker id and the speaker's birth date to calculate the speaker's age
# NOTE(review): the variable `date` rebinds the name `date` imported from `datetime` in the imports cell.
date = speaker_attributes[['id', 'date_of_birth']].copy() # get date of birth
quote_date = quotes_df.copy().explode('qids')[['qids', 'date']] # get date of quotation
date = date.merge(quote_date, left_on='id', right_on='qids') # merge the two dates into the "date" dataframe
date = date.rename(columns={"date": "date_of_quotation"}).drop(columns=['qids'])
date['age'] = date.apply(lambda x: get_age(x.date_of_birth, x.date_of_quotation), axis=1) # calculate the age

# NOTE(review): `date` has one row per matching (speaker, quotation) pair, so a speaker quoted
# several times may end up with several rows in speaker_attributes after this merge - confirm this is intended.
speaker_attributes = speaker_attributes.merge(date[['id', 'age']], on='id') # add the age to the speakers' attributes
speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion,numOccurrences,year_type,age
0,"[Namo, Modi, Narendra Bhai, Narendra Damodarda...",1950-09-17 00:00:00,[Q668],[Q6581097],1395415052,"[Q1282294, Q6889284]",,"[Q82955, Q36180, Q7019111, Q10429346]",[Q10230],,Q1058,Narendra Modi,"[Q63988227, Q63988919, Q16841231, Q16251961, Q...",item,[Q9089],1,AC,69
1,[Ricky Dene Gervais],1961-06-25 00:00:00,[Q145],[Q6581097],1395460804,,,"[Q33999, Q36180, Q639669, Q245068, Q28389, Q27...",,,Q23517,Ricky Gervais,,item,,1,AC,58
2,[Dave Loebsack],1952-12-23 00:00:00,[Q30],[Q6581097],1392893478,,L000565,"[Q82955, Q1622272, Q1238570]",[Q29552],,Q771586,David Loebsack,,item,,1,AC,67
3,,1931-05-07 00:00:00,[Q145],[Q6581097],1392376764,,,"[Q2059704, Q7042855]",,,Q2051773,David Elliott,,item,,1,AC,88
4,,1931-09-24 00:00:00,[Q30],[Q6581097],1376504203,,,[Q2526255],,,Q2926392,Bruce Baillie,,item,,1,AC,88


#### Easy querying?

In [None]:
def top_qids(data, column_name, max_labels=5):
  '''Find the most frequent qids of a list-valued column.

  Returns a pair `(qids, column_data)`: `qids` holds at most `max_labels`
  qids ordered from least to most frequent, and `column_data` is the
  single-column frame restricted to the rows where the column is not null.
  '''
  # keep only the rows that have a value, then give every qid its own row
  has_value = data[column_name].notnull()
  column_data = data[[column_name]][has_value]
  exploded = column_data.explode(column_name)

  # distinct qids with their frequencies, sorted by ascending frequency
  distinct_qids, frequencies = np.unique(exploded, return_counts=True)
  by_frequency = distinct_qids[np.argsort(frequencies)]

  # the tail of the ascending order is the top; slicing copes with fewer than max_labels qids
  qids = by_frequency[-max_labels:]

  return qids, column_data


def get_labels(label_data, qids):
  '''Return the human-readable label of every qid, in the order given.

  Parameters
  ----------
  label_data : pandas.DataFrame
      Label table with a `Label` column. The qids may be stored either in
      the index or in a regular `QID` column.
  qids : iterable of str
      The qids to translate.

  Returns
  -------
  list
      One label per qid. A qid that is unknown or has no label is returned
      unchanged, so the result always has as many entries as `qids`.
  '''
  # Accept both layouts: QID as a regular column or QID as the index.
  if "QID" in label_data.columns:
    lookup = label_data.set_index("QID")["Label"]
  else:
    lookup = label_data["Label"]
  # A repeated qid would make the lookup return several rows; keep the first one.
  lookup = lookup[~lookup.index.duplicated(keep="first")]

  labels = []
  for qid in qids:
    label = lookup.get(qid)
    labels.append(qid if label is None or pd.isna(label) else label)
  return labels


def count_qids(column_data, qids, label_data=None):
  '''Count in how many rows each qid occurs and draw the result as a bar chart.

  Parameters
  ----------
  column_data : pandas.DataFrame or pandas.Series
      The values to scan; each cell is a collection of qids. For a
      DataFrame the first column is used.
  qids : sequence of str
      The qids to count.
  label_data : pandas.DataFrame, optional
      Label table used for the axis ticks. Defaults to the module-level
      `label_descriptions`, looked up when the function is called.

  Returns
  -------
  numpy.ndarray
      `counts[j]` is the number of rows containing `qids[j]`.
  '''
  # Resolve the default at call time so a reloaded label table is picked up.
  if label_data is None:
    label_data = label_descriptions

  if isinstance(column_data, pd.DataFrame):
    cells = column_data.iloc[:, 0]
  else:
    cells = column_data

  counts = np.zeros((len(qids),))

  # getting the frequency of each qid: a single pass over the rows
  for cell in cells:
    for j, qid in enumerate(qids):
      if qid in cell:
        counts[j] += 1

  # getting their labels
  LABELS = get_labels(label_data, qids)
  x = range(len(qids))

  # drawing the bar chart
  plt.bar(x, counts, align='center')
  plt.xticks(x, LABELS, rotation = 'vertical')
  plt.show()

  return counts

In [None]:
# GENDER
# Most frequent gender qids over all speakers; the bar chart counts only the
# first 50000 rows that have a gender value.
qids, column_data = top_qids(data = every_speaker_attributes, column_name = 'gender')
count_qids(column_data=column_data[0:50000], qids=qids)

AttributeError: ignored

In [None]:
# PARTY
# Most frequent party qids over all speakers; the bar chart counts only the
# first 50000 rows that have a party value.
qids, column_data = top_qids(data = every_speaker_attributes, column_name = 'party')
count_qids(column_data=column_data[0:50000], qids=qids)

AttributeError: ignored