## Initial pre-processing

In this notebook we will:
- Proceed with basic cleaning of each year from 2015-2020, which includes removing NaN speakers, NaN quotes and further inconsistencies 
- Exploit the useful information from the wikidata dumps provided to us 
- Add columns of interest for further analysis 

The output will be pickle files for each year with additional columns such as tags, gender (male/female), domain name, citizenship of spokesperson...


#### Useful libraries

In [None]:
import pandas as pd
import pickle
from tqdm.auto import trange, tqdm
import time
from journal_API_wikidata import extract_info_wiki
from Data_clean_functions import *
from tld import get_tld

from collections import Counter
import warnings
warnings.filterwarnings("ignore")

 

In [None]:
# Master switch for the expensive cleaning stage.
# A full cleaning pass takes approximately 4 hours per year of data.
# NOTE(review): no cell visible in this notebook reads this flag yet -- confirm
# which stage it is meant to gate.
RUN_CLEANING = False


### Data paths

In [None]:
"""
    Please note the data is not provided but the `FILE` represents the 
    data given by QuoteBank 
"""
# Directory holding the raw input and every intermediate file of this notebook.
DATA_PATH = './data/'

# Year being processed. Change this single value to run the pipeline on another
# year instead of editing each path below.
YEAR = 2020

# Raw QuoteBank input (bz2-compressed JSON lines).
FILE = f'{DATA_PATH}quotes-{YEAR}.json.bz2'
# Intermediate outputs. Despite the `.json.bz2` suffix, the cells below write
# these files as a sequence of pickled DataFrame chunks.
PATH_OUT = f'{DATA_PATH}rapid_clean-quotes-{YEAR}.json.bz2'
PATH_OUT_filter = f'{DATA_PATH}filter_clean-quotes-{YEAR}.json.bz2'

#### Read file, clean and save in pickle

In [None]:
# Number of cleaned chunks written to PATH_OUT; the filtering cell uses it to
# know how many pickled frames to read back.
n_chunks = 0
# Running tally of how many exploded rows cite each media site.
Total_count = Counter()

# Open the output once in 'wb' mode so that re-running this cell rebuilds the
# file from scratch. Appending ('ab') on every chunk left the chunks of any
# earlier or interrupted run at the head of the file, out of sync with n_chunks.
with pd.read_json(FILE, lines=True, compression='bz2', chunksize=100000) as df_reader, \
        open(PATH_OUT, 'wb') as d_file:
    for chunk in tqdm(df_reader):

        # Basic cleaning (refer to function doc)
        df_base_clean = rapid_clean(chunk)

        # Extract site name from dataframe
        # (presumably adds the `sitenames` column in place -- the return value
        # is not used; confirm against the helper's doc)
        extract_name(df_base_clean)

        # Expand the df on sitenames and urls
        df_base_clean_exp = df_base_clean.explode(["sitenames", "urls"])

        # Save chunk by chunk: each chunk is its own pickle record in the file
        pickle.dump(df_base_clean_exp, d_file)
        n_chunks += 1

        # Count the occurrences of each media site in this chunk
        Total_count.update(df_base_clean_exp['sitenames'].tolist())
        print("Chunk done")

# The 100 most frequently occurring media, most frequent first
top_sites = [site for site, _count in Total_count.most_common(100)]

print("finished top sites")


#### Pickle save the top_sites for future use

In [None]:
# Persist the ranked site list so that later sessions can reuse it
top_sites_file = DATA_PATH + 'top_sites.pkl'
with open(top_sites_file, 'wb') as output:
    pickle.dump(top_sites, output)


# Snippet to reload the list (kept as a string, so it is not executed)
"""
with open(DATA_PATH + 'top_sites.pkl', 'rb') as file:
    top_sites = pickle.load(file)
    
"""

In [None]:
# For Milestone 2 keep only the most-citing media.
#
# Note: We intend to increase this number from 10 to 50, but that would take
# approximately 5 hours to run per year; we thus plan on running it after the
# deadline and focused on making a main pipeline first.
N_TOP_SITES = 10

# `top_sites` is ordered most frequent first, so a prefix slice gives the top N.
# The name `top_10_sites` is kept because later cells refer to it.
top_10_sites = top_sites[:N_TOP_SITES]
top_10_sites

### Filter the rows belonging to top 10 sites

In [None]:
# Build one DataFrame holding every row that belongs to a top-10 site.
# NOTE: this cell relies on `n_chunks` having been set by the cleaning cell in
# the same kernel session.

# Start from an empty frame with the expected columns: being first in the final
# concat, it fixes the column order and keeps the result defined for zero chunks.
filtered_frames = [pd.DataFrame(columns=['quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences',
       'probas', 'urls', 'phase', 'sitenames'])]

chunk_nbr = 0
n_chunks_filtered = 0

# The filtered output is opened once in 'wb' mode so that re-running the cell
# rebuilds the file instead of appending duplicate chunks to an earlier run.
with open(PATH_OUT, 'rb') as d_file, open(PATH_OUT_filter, 'wb') as d_file_out:
    while chunk_nbr < n_chunks:

        # Progress meter
        print(f"{chunk_nbr}/{n_chunks}")

        # Each load returns the next pickled chunk of the file
        chunk = pickle.load(d_file)

        # Filter chunks with sitenames belonging to top 10
        chunk_filtered = chunk[chunk.sitenames.isin(top_10_sites)]

        # Save filtered chunks
        pickle.dump(chunk_filtered, d_file_out)
        n_chunks_filtered += 1

        filtered_frames.append(chunk_filtered)

        chunk_nbr += 1

# Concatenate once at the end: DataFrame.append copied the whole accumulated
# frame on every iteration and no longer exists in pandas >= 2.0.
chunks_all_filtered = pd.concat(filtered_frames)

In [None]:
# Cache the combined filtered rows on disk for future use

filtered_cache_file = DATA_PATH + 'chunks_all_filtered.pkl'
with open(filtered_cache_file, 'wb') as output:
    pickle.dump(chunks_all_filtered, output)

"""
    # Open pickled dataframe
with open(DATA_PATH +'chunks_all_filtered.pkl', 'rb') as output:
    chunks_all_filtered = pickle.load(output)

"""

In [None]:
# Regroup the exploded rows: every distinct (speaker, qids, quoteID, quotation)
# combination becomes one group.
kept_columns = ["speaker", "qids", "urls", "quoteID", "quotation", "date"]
group_keys = ["speaker", "qids", "quoteID", "quotation"]
gb_all_filtered = chunks_all_filtered[kept_columns].groupby(group_keys)

In [None]:
# One quote (one group) may be cited by several media, so all of its urls are
# collected into a single list per group.
df_filtered = gb_all_filtered["urls"].apply(list)

# Turn the group keys back into ordinary columns next to the `urls` list column
df_filtered_final = df_filtered.reset_index()

In [None]:
# Save pickled dataframe
with open(DATA_PATH + 'df_filtered_final.pkl', 'wb') as output:
    pickle.dump(df_filtered_final, output)

# Reload snippet (kept as a string, so it is not executed). It restores
# `df_filtered_final`, the name the url-extraction cell reads; loading into
# `df_filtered` instead would leave that name undefined on a fresh kernel.
'''# Open pickled dataframe
with open(DATA_PATH + 'df_filtered_final.pkl', 'rb') as output:
    df_filtered_final = pickle.load(output)
'''

### Create a dictionary of categories and associated synonyms

This will enable us to tag each quote with its category

*Note*: This is a restrictive list and some additional content will be added

In [14]:
# Category name -> seed keywords associated with that category.
matchers = {
    "art": ["art", "paint", "draw", "museum"],
    "business": ["business", "finance", "economy", "commerce", "bank", "money", "trade"],
    "entertainment": ["entertainment"],
    "fashion": ["fashion", "couture", "designer"],
    "medicine": ["medicine", "health", "pharmacy", "wellbeing", "body"],
    "music": ["music", "song", "album", "concert"],
    "politics": ["politics", "government"],
    "science": ["science", "research"],
    "sport": ["sport", "football", "athletics", "swimming", "rugby", "tennis", "volleyball", "ski"],
}

# Find general form for categories and words
# (presumably rewrites `matchers` in place, since the result is not assigned --
# confirm against the helper's doc)
generalizeDictionary(matchers)

### Extract information from URL

In [15]:
# Run the url-extraction helper on the regrouped quotes, passing the category
# keywords in `matchers`.
# NOTE(review): the columns it adds are defined inside Chunk_url_extract
# (imported helper) -- see its doc.
df_extract = Chunk_url_extract(df_filtered_final, matchers)

Total length:  285911


0it [00:00, ?it/s]

In [16]:
# Cache the url-extraction result on disk
df_extract_file = DATA_PATH + 'df_extract.pkl'
with open(df_extract_file, 'wb') as output:
    pickle.dump(df_extract, output)
    


In [2]:
# Restore the cached url-extraction result.
# pickle.load must only be used on trusted files; this one is written by the
# saving cell of this notebook.
with open(DATA_PATH + 'df_extract.pkl', 'rb') as input_file:
    df_extract = pickle.load(input_file)


### Formatting wikidata data of interest
Using the Wikidata speakers and label description files provided by the TAs, we extract the data we need for our project.

This includes gender, citizenship, date of birth...

In [None]:
# Locations of the two provided Wikidata files
speakers_parquet = DATA_PATH + 'speaker_attributes.parquet'
labels_csv = DATA_PATH + 'wikidata_labels_descriptions_quotebank.csv.bz2'

Wikidata_speakers = pd.read_parquet(speakers_parquet)
Wikidata_countries = pd.read_csv(labels_csv, compression='bz2')

# Combine both tables into the lookup that is later merged with the quotes
Wikidata_utils = formating_wikidata(Wikidata_speakers, Wikidata_countries)

In [None]:
# Cache the formatted Wikidata lookup on disk
wikidata_utils_file = DATA_PATH + 'Wikidata_utils.pkl'
with open(wikidata_utils_file, 'wb') as output:
    pickle.dump(Wikidata_utils, output)

In [None]:
# Restore the cached Wikidata lookup.
# pickle.load must only be used on trusted files; this one is written by the
# saving cell of this notebook.
wikidata_utils_file = DATA_PATH + 'Wikidata_utils.pkl'
with open(wikidata_utils_file, 'rb') as input_file:
    Wikidata_utils = pickle.load(input_file)

### Add information from wikidata

In [19]:
# Merge the speaker-level Wikidata information into the url-extracted quotes.
# NOTE(review): the join key and join type are defined inside
# merge_quotes_wikidata (imported helper) -- confirm there.
df_merged = merge_quotes_wikidata(Wikidata_utils, df_extract)

In [36]:
# Save the merged dataframe as a pickle file
df_merged_file = DATA_PATH + 'df_year_no_media.pkl'
with open(df_merged_file, 'wb') as output:
    pickle.dump(df_merged, output)