In [2]:
import bz2
import json
import re
import random
import sys
import os
import bz2
import time
import pandas as pd
import numpy as np
from functools import partial
import seaborn as sns
from helpers import *

In [2]:
def write_json_to_file(name, obj):
    # Use current timestamp to make the name of the file unique
    millis = round(time.time() * 1000)
    name = f'{name}_{millis}.json'
    with open(name, 'wb') as f:
        output = json.dumps(obj)
        f.write(output.encode('utf-8'))
    return name

In [12]:
with open("data/local_only/signi-quote-count-combined_1636658426963.json", "r") as f:
    sorted_combined_signi_dict = json.load(f)

In [13]:
signi_list = list(sorted_combined_signi_dict.keys())

In [31]:
test_list = list(sorted_combined_signi_dict.items())

found_anthony = False
RANGE = 2000

for i in range(RANGE):
#     print(test_list[i])
    qid = test_list[i][0]
    
    if qid == 'Q426582':
        print(f'FOUND: #{i} {test_list[i][1]}')
        found_anthony = True

if found_anthony == False:
    print(f'not found in the top {RANGE}')
    ind = RANGE - 1
    print(f'#{RANGE} list[{test_list[ind][0]}] = {test_list[ind][1]}')

FOUND: #1026 2689


In [20]:
x = 0

for i in range(4):
    print(f'list[{x:5}] = {signi_list[x][1]:5}')
    x += 500

list[    0] = 201293
list[  500] =  4147
list[ 1000] =  2720
list[ 1500] =  2114


In [28]:
top1000 = signi_list[0:1000]

In [35]:
len(top1000)

1000

In [36]:
top1000[0:5]

['Q22686', 'Q1058', 'Q76', 'Q450675', 'Q83106']

In [39]:
PATTERN_INPUT = "../quotebank/quotes-{}.json.bz2"

In [40]:
from typing import Callable

CHUNK_SIZE = 1_048_576

def process_compressed_json_file(input_file_name: str, output_name: str, year: int, process_json_object: Callable) -> str:
    """
    Read from a compressed file chunk by chunk. Decompress every chunk and try to decode it and parse it into an array of JSON objects.
    For each JSON object extracted this way, run the process_json_object function.
    In the end, a JSON object representing the result of this process is written into a file.

    Args:
        input_file_name (str): Name of the compressed json file which is the subject of processing.
        output_name (str): First part of the output file name. Used in creation of the full output file name: the year parameter and a timestamp are appended, as well as the .json extension.
        year (int): Represents the year for which the data in the input file is gathered, is appended to the output_name to generate the full output file name.
        process_json_object (Callable): Function that processes the individual JSON objects extracted from the compressed file. The signature should be as follows:
            Args:
                json_obj: JSON object which is to be processed.
                out_json: The output object in which the result of the processing is stored

    Returns:
        (str) Full name of the output JSON file.
    """
    # Decompression variables
    decompressor = bz2.BZ2Decompressor()
    
    # Decoding variables
    decoding_buffer = bytearray([])
    decoding_error_counter = 0
    
    # Parsing variables
    parsing_buffer = ''
    parsing_error_counter = 0
    
    # Progress variables - used to provide feedback to the dev
    input_size = os.path.getsize(input_file_name)
    start_time = time.time()
    total_in = 0
    total_out = 0
    previous_value = -1
    
    # Result of processing
    out_json = dict()
    
    # Iterate through the file
    with open(input_file_name, 'rb') as input_file:
        for chunk in iter(lambda: input_file.read(CHUNK_SIZE), b''):
            # Feed chunk to decompressor
            decompressed_chunk = decompressor.decompress(chunk)
            dec_chunk_length = len(decompressed_chunk)
            
            # Check the length of the decompressed data - 0 is common -- waiting for a bzip2 block
            if (dec_chunk_length == 0):
                continue
            
            # Try to decode byte array
            decoding_buffer += decompressed_chunk
            try:
                chunk_string = decoding_buffer.decode('utf-8')
                
                # Clear buffer
                decoding_buffer = bytearray([])
                
                decoding_successful = True
            except UnicodeDecodeError:
                # Error occurs when input stream is split in the middle of a character which is encoded with multiple bytes
                decoding_error_counter += 1
                decoding_successful = False
            
            # Try to parse the decoded string
            if decoding_successful:
                # Elements of the JSON array are split by '\n'
                array_elements = chunk_string.split('\n')
                
                # Iterate through the JSON array in the current chunk
                for json_candidate in array_elements:
                    # Try to parse the JSON object, might fail if the object was divided in parts because of the chunk separation
                    parsing_buffer += json_candidate
                    try:
                        json_obj = json.loads(parsing_buffer)
                        
                        # Clear buffer
                        parsing_buffer = ''
                        
                        parsing_successful = True
                    except ValueError:
                        """
                        Error occurs when the line does not contain the whole JSON object, which happens for the last array element in almost every chunk of input stream.
                        We solve this by remembering the prevous partial objects in parsing_buffer, and then merging it with the rest of the object when we load the next chunk.
                        """
                        parsing_error_counter += 1
                        parsing_successful = False
                    
                    # Perform JSON object processing
                    if parsing_successful:
                        process_json_object(json_obj, out_json)
            
            # Show progress
            total_in += len(chunk)
            total_out += dec_chunk_length
            if dec_chunk_length != 0:    # only if a bzip2 block emitted
                processed_fraction = round(1000 * total_in / input_size)
                if processed_fraction != previous_value:
                    left = (input_size / total_in - 1) * (time.time() - start_time)
                    print(f'\r{processed_fraction / 10:.1f}% (~{left:.1f}s left)\tyear: {year}\tnumber of entries: {len(out_json)}\tdecoding errors: {decoding_error_counter}\tparsing errors: {parsing_error_counter}', end='      ')
                    previous_value = processed_fraction
    
    # Save result to file
    output_full_name = write_json_to_file(f'{output_name}-{year}', out_json)
    
    # Report ending
    print()
    total_time = time.time() - start_time
    print(f'File {input_file_name} processed in {total_time:.1f}s', end='\n\n')
    
    return output_full_name

In [41]:
def check_if_party_member_quote(row: dict, party_member_quotes: dict, party_list: list) -> None:
    """CHeck if party member quote is useful for analysis

    Args:
        row (dict): Row of data
        party_member_quotes (dict): Dict to keep track of party member quotes
        party_list (list): Party list
    """
    probabilities = row['probas']
    qids = row['qids']
    
    # Check if the probas and qids values exist
    if (len(probabilities) == 0 or len(qids) == 0):
        return
    
    # Check if the speaker is not 'Unknown'
    if (probabilities[0][0] == 'None'):
        return
    
    # Check if the probability is over 80%
    p = float(probabilities[0][1])
    if (p < 0.8):
        return
    
    # Check if the speaker is on the party list
    qid = qids[0]
    if qid not in party_list:
        return
    
    # Remember only the quote and the probability
    data = {}
    data['quotation'] = row['quotation']
    data['proba'] = row['probas'][0][1]
    
    # Append the quote
    arr = party_member_quotes.get(qid, [])
    arr.append(data)
    party_member_quotes[qid] = arr

In [42]:
# Define partial function check_if_dem_or_rep_quote using function check_if_party_member_quote
check_if_top_1000_speaker_quote = partial(check_if_party_member_quote, party_list=top1000)

In [43]:
years = [2015, 2016, 2017, 2018, 2019, 2020]
# years = [2018, 2019, 2020]
# years = [2020]

output_list = []

for year in years:
    path_to_input = PATTERN_INPUT.format(year)
    
    # Process quote file
    output_name = process_compressed_json_file(path_to_input, 'data/local_only/top-1000-quotes', year, check_if_top_1000_speaker_quote)
    
    output_list.append(output_name)

print('\n\nOutput file names:')
for file_name in output_list:
    print(file_name)

100.0% (~0.4s left)	year: 2015	number of entries: 995	decoding errors: 1	parsing errors: 3187       
File ../quotebank/quotes-2015.json.bz2 processed in 1861.2s

100.0% (~0.5s left)	year: 2016	number of entries: 998	decoding errors: 0	parsing errors: 2216       
File ../quotebank/quotes-2016.json.bz2 processed in 1557.5s

100.0% (~1.3s left)	year: 2017	number of entries: 1000	decoding errors: 0	parsing errors: 4959        
File ../quotebank/quotes-2017.json.bz2 processed in 3299.7s

100.0% (~0.7s left)	year: 2018	number of entries: 999	decoding errors: 1	parsing errors: 4585        
File ../quotebank/quotes-2018.json.bz2 processed in 1483.0s

100.0% (~0.3s left)	year: 2019	number of entries: 999	decoding errors: 0	parsing errors: 3396       
File ../quotebank/quotes-2019.json.bz2 processed in 1090.9s

100.0% (~0.1s left)	year: 2020	number of entries: 993	decoding errors: 0	parsing errors: 792       
File ../quotebank/quotes-2020.json.bz2 processed in 263.5s



Output file names:
data/l

In [None]:
SELECT DISTINCT ?item ?itemLabel ?genderLabel ?citizenshipLabel ?languageLabel ?religionLabel ?ethnicLabel ?degreeLabel ?dateOfBirth ?placeOfBirthLabel
WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  {
    VALUES ?item { wd:Q22686 wd:Q359442 wd:Q426582 }
    
    OPTIONAL { ?item wdt:P21 ?gender. }
    OPTIONAL { ?item wdt:P27 ?citizenship. }
    OPTIONAL { ?item wdt:P569 ?dateOfBirth. }
    OPTIONAL { ?item wdt:P103 ?language. }
    OPTIONAL { ?item wdt:P140 ?religion. }
    OPTIONAL { ?item wdt:P172 ?ethnic. }
    OPTIONAL { ?item wdt:P512 ?degree. }
  }
}

In [47]:
with open("data/local_only/top-1000-wikidata-raw.json", "r", encoding='utf-8') as f:
    top1000_wiki = json.load(f)

In [48]:
top1000_wiki_merged = dict()

index = 0
for row in top1000_wiki:
    # Extract the QID from the link (ex. http://www.wikidata.org/entity/Q203286 -> Q203286)
    qid_start = row['item'].rindex('/') + 1
    key = row['item'][qid_start:]
    # Replace the link with the QID
    row['item'] = key
    
    if key in top1000_wiki_merged:
        merged_entry = top1000_wiki_merged[key]
        columns = ['itemLabel', 'genderLabel', 'citizenshipLabel', 'religionLabel', 'ethnicLabel', 'degreeLabel', 'dateOfBirth', 'languageLabel']
        """
        Merge the values for every column:
            - if the values are the same - do nothing
            - if the values are different - create a list and add them both
        """
        for col in columns:
            if row.get(col, None) is None:
                continue
                
            updated_entry = merged_entry.get(col, None)
            
            if updated_entry is None:
                updated_entry = row[col]
            elif isinstance(updated_entry, list):
                if row[col] not in updated_entry:
                    updated_entry.append(row[col])
            elif row[col] != updated_entry:
                updated_entry = [updated_entry, row[col]]
                
            merged_entry[col] = updated_entry
    else:
        top1000_wiki_merged[key] = row

In [50]:
print(len(top1000_wiki))
print(len(top1000_wiki_merged))

1223
1000


In [51]:
write_json_to_file('data/local_only/top-1000-wikidata-merged', top1000_wiki_merged)

'data/local_only/top-1000-wikidata-merged_1638962423221.json'

In [58]:
top1000_list = list(top1000_wiki_merged.values())

In [59]:
quotes_files = [
    "data/local_only/top-1000-quotes-2015_1638929459545.json",
    "data/local_only/top-1000-quotes-2016_1638931024701.json",
    "data/local_only/top-1000-quotes-2017_1638934319108.json",
    "data/local_only/top-1000-quotes-2018_1638935809502.json",
    "data/local_only/top-1000-quotes-2019_1638936901748.json",
    "data/local_only/top-1000-quotes-2020_1638937168661.json"
]

quotes_combined = {}

for v in top1000_list:
    copy = dict(v)
    copy['quotations'] = []
    
    quotes_combined[v['item']] = copy

for file_name in quotes_files:
    with open(file_name, 'r', encoding='utf-8') as f:
        quotes = json.load(f)
        
        for k in quotes.keys():
            quotes_combined[k]['quotations'] += quotes[k]

write_json_to_file('data/local_only/top-1000-quotes-combined', quotes_combined)

'data/local_only/top-1000-quotes-combined_1638962737072.json'

In [62]:
quotes_filtered = quotes_combined.copy()

In [63]:
filtered_quotes = []

weird_pattern = '[_@#+&;:\(\)\{\}\[\]\\/`]'
json_pattern = '\{.*[a-zA-Z]+:\s[\'"`][a-zA-Z0-9]+[\'"`].*\}'
url_pattern = 'https?'

for k in quotes_filtered.keys():
    elem = quotes_filtered[k]
    
    new_arr = []
    for entry in elem['quotations']:
        text = entry['quotation']
        
        longest = max(entry['quotation'].split(), key=len)
        if (len(longest) > 50):
            filtered_quotes.append(entry)
            continue
        
        if re.search(url_pattern, text) is not None:
            filtered_quotes.append(entry)
            continue
        
        if re.search(json_pattern, text) is not None:
            filtered_quotes.append(entry)
            continue
            
        weird_num = len(re.findall(weird_pattern, text))
        total = len(text)
        weird_percent = weird_num / total
        if (weird_percent > 0.1):
            filtered_quotes.append(entry)
            continue
            
        new_arr.append(entry)
    elem['quotations'] = new_arr

In [64]:
write_json_to_file('data/local_only/top-1000-quotes-combined-and-filtered', quotes_filtered)

'data/local_only/top-1000-quotes-combined-and-filtered_1638963965624.json'

Show some filtered quotes:

In [65]:
for entry in filtered_quotes[0:5]:
    print(entry['quotation'], end='\n\n')

absolutely love (s)

Finally @aamir_khan ki Masti Ki Paathshala is open & #KaategiKya is our super hit school anthem! पसंद आया तो लाइक करो पसंद नहीं आया तो टेस्ट चेंज करो!?? #AamirsMastiKiPaathshala #TeachersDay #शिक्षकदिवस Wah! Kaun hai bhai iss group ka principal? Whoever made this video, lots of love to you.. https://t.co/9fq8pSpH3z -- Aamir Khan (@aamir_khan) September 5, 2018 Aamir quote tweeted the video and said,

Sen. John Mc & shy; Cain took on two key 2016 Re & shy; pub & shy; lic & shy; an con & shy; tenders Wed & shy; nes & shy; day, ex & shy; press & shy; ing baffle & shy; ment at Don & shy; ald Trump's con & shy; tin & shy; ued pop & shy; ular & shy; ity and cri & shy; ti & shy; ciz & shy; ing Ted Cruz for his tac & shy; tics in the Sen & shy; ate. Mc & shy; Cain, the 2008 GOP pres & shy; id & shy; en & shy; tial nom & shy; in & shy; ee, gave his view of the 2016 field be & shy; fore a break & shy; fast meet & shy; ing with re & shy; port & shy; ers on Wed & shy; nes & sh

In [4]:
with open('data/local_only/top-1000-quotes-combined-and-filtered_1638963965624.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [11]:
keys = list(data.keys())

for i in range(11):
    key = keys[i * 100]
    print(f'{data[key]["item"]:10} {len(data[key]["quotations"]):10}')

Q9557            2871
Q207431          8317
Q244338          3242
Q483309          2737
Q1382365         4098
Q2834185         2840
Q3595385         5687
Q6968942         2770
Q16196017        5309
Q11310708        5687


IndexError: list index out of range

In [67]:
quotes_concat = quotes_filtered.copy()

In [68]:
QUOTE_LENGTH = 5000

index = 1
for k in quotes_concat.keys():
    elem = quotes_concat[k]
    
    # Sort the quotes by length
    elem['quotations'].sort(key = lambda x: len(x['quotation']), reverse = True)
    
    concat = ''
    printed = False
    for quote in elem['quotations']:
        # Concatenate the quotes
        concat += ' ' + quote['quotation']
        
        # Trim if we are over QUOTE_LENGTH
        if (len(concat) >= QUOTE_LENGTH):
            concat = concat[0:QUOTE_LENGTH]
            break
    
    elem['quotations'] = concat

In [69]:
write_json_to_file('data/local_only/top-1000-quotes-concatenated', quotes_concat)

'data/local_only/top-1000-quotes-concatenated_1638964313725.json'

### Analysis

In [70]:
import bz2
import json
import pandas as pd
import csv

In [72]:
# Load the concatenated quotes for top 100 politician
with open('data/local_only/top-1000-quotes-concatenated_1638964313725.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

After getting the data, we extract the quote ID and the concatenated quote of each politician, and write them to `input_data1.csv` for the LIWC personality analysis.

In [73]:
with open('data/local_only/top_1000_input_data_1.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["qid", "quote"])
    for qid, all_value in data.items():
        quote = all_value["quotations"]
        writer.writerow([qid, quote])

In [74]:
liwc = pd.read_csv('data/local_only/top_1000_output_1.csv')

In [75]:
# Visualise a random sample
liwc.sample()

Unnamed: 0,Source (A),Source (B),WC,WPS,Sixltr,Dic,Pronoun,I,We,Self,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
255,Q717959,At the moment we're all feeling disappointed ...,927,23.77,15.64,76.27,10.57,1.4,3.56,4.96,...,3.88,0.11,0.11,0.0,0.0,1.4,0.0,3.34,0.65,0.0
