In [1]:
import bz2
import sys
import json
import numpy as np
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import pickle

In [2]:
# Look up the Wikidata identifier of the United States.

endpoint_url = "https://query.wikidata.org/sparql"
# SPARQL query sent to the endpoint above. It selects every distinct ?country
# having wdt:P31 wd:Q3624078, together with its English label, ordered by label.
# NOTE(review): the comment embedded in the query text mentions capital(s), but
# only ?country and ?countryLabel are selected.
query = """#List of present-day countries and capital(s)
SELECT DISTINCT ?country ?countryLabel
WHERE
{
  ?country wdt:P31 wd:Q3624078 .

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ?countryLabel"""

def get_results(endpoint_url: str, query: str):
    """Run a SPARQL query against an endpoint and return the converted result.

    Args:
        endpoint_url (str): URL of the SPARQL endpoint to query.
        query (str): SPARQL query text to execute.

    Returns:
        Whatever ``SPARQLWrapper.query().convert()`` yields once the return
        format has been set to JSON.
    """
    # Identify the client to the endpoint, including the running Python version.
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

results = get_results(endpoint_url, query)

# Build a 2-column string array with one [identifier, label] row per binding.
# The identifier is the last '/'-separated segment of the "country" value.
country_array = np.array(
    [
        [binding["country"]["value"].split("/")[-1], binding["countryLabel"]["value"]]
        for binding in results["results"]["bindings"]
    ]
)
if country_array.size == 0:
    # Without this guard the column slice below would fail with an opaque IndexError.
    raise RuntimeError("The country query returned no bindings; cannot look up the US code")

# Match against the label column only (column 1), never the identifier column,
# and fail with an explicit message when the label is absent.
us_rows = np.flatnonzero(country_array[:, 1] == 'United States of America')
if us_rows.size == 0:
    raise LookupError("'United States of America' was not found among the country labels")

US_index = us_rows[0]
US_code = country_array[US_index][0]

In [3]:
# We extract the US speakers
speaker_df = pd.read_parquet("speaker_attributes.parquet")

nationality_list = list(speaker_df['nationality'])
# Read the first column directly; going through `speaker_df.values` would copy
# the whole frame into a 2-D object array just to pick column 0 of every row.
# NOTE(review): the first column is assumed to hold a collection of names per
# speaker (it is iterated and its items are measured with len below) - confirm.
names_list = list(speaker_df.iloc[:, 0])
my_df = pd.DataFrame({"nationality_list": nationality_list, "names_list": names_list})

# Row positions that have both a nationality entry and a names entry, and whose
# nationality entry contains the US code.
indexes = [
    position
    for position, (nationalities, names) in enumerate(zip(nationality_list, names_list))
    if nationalities is not None and names is not None and US_code in nationalities
]

# Names shorter than this are dropped from the flattened list.
MIN_NAME_LENGTH = 4

# Flatten the names of the selected rows into one list, keeping row order.
speaker_flat_list = [
    name
    for position in indexes
    for name in names_list[position]
    if len(name) >= MIN_NAME_LENGTH
]

In [4]:
# Count, for each name in speaker_flat_list, how many 2019 NYTimes quotations
# have it as their first matching name.
path_to_file = 'quotes-2019-nytimes.json.bz2'
path_to_out = 'quotes-2019-us.json.bz2'

# Maps a name to the number of quotations in which it was the first name
# (in speaker_flat_list order) found as a substring of the quotation text.
Dict = {}

with bz2.open(path_to_file, 'rb') as s_file:
    # NOTE(review): the output archive is created/truncated here but nothing is
    # ever written to it. It is kept so the on-disk side effect is unchanged;
    # confirm whether the matching records were meant to be written out.
    with bz2.open(path_to_out, 'wb') as d_file:
        for line in s_file:
            instance = json.loads(line)  # one JSON record per line
            quote = instance['quotation']

            # Single pass that stops at the first hit, instead of scanning the
            # whole list for any() and again to build a boolean list.
            matched_name = next(
                (name for name in speaker_flat_list if name in quote),
                None,
            )
            if matched_name is not None:
                Dict[matched_name] = Dict.get(matched_name, 0) + 1

Empty Dictionary: 


In [None]:
# Pickle the Dict of per-name counts to disk. The with-block closes the file
# even if pickling raises.
with open("Dict_quotes-2019-nytimes.pkl", "wb") as a_file:
    pickle.dump(Dict, a_file)

