In [None]:
import pandas as pd
import pickle
from tqdm.auto import trange, tqdm
import time
from journal_API_wikidata import extract_info_wiki
# NOTE(review): star import -- the helpers called below without a visible definition
# (rapid_clean, extract_name, generalizeDictionary, Chunk_url_extract,
# merge_quotes_wikidata) presumably come from this module; importing them by name
# would make that checkable.
from Data_clean_functions import *
from tld import get_tld
# Every input and output path used in this notebook is built from DATA_PATH.
DATA_PATH = './Data/'
# Raw input: read below with pd.read_json(lines=True, compression='bz2').
FILE = DATA_PATH + 'quotes-2019.json.bz2'
# Intermediate outputs. Despite the .json.bz2 suffix, the cells below write a
# sequence of pickled dataframes to these two paths (pickle.dump), not JSON.
PATH_OUT = DATA_PATH + 'rapid_clean-quotes-2019.json.bz2'
PATH_OUT_filter = DATA_PATH + 'filter_clean-quotes-2019.json.bz2'
# Wikidata helper pickle: download it from the drive and place it in DATA_PATH,
# or adapt this path.
PATH_WIKIDATA_UTILS = DATA_PATH + 'Wikidata_utils.pkl' 
from collections import Counter
import warnings
# Silences every warning for the rest of the session (including pandas
# deprecation warnings), not just a specific category.
warnings.filterwarnings("ignore")

### Do basic data cleaning and count occurrences of sites

In [None]:
# Stream the raw quote file in chunks of 100000 lines. Each chunk is cleaned,
# exploded to one row per (sitename, url) pair, pickled to PATH_OUT for the
# filtering step, and its site names are added to a running tally.
#
# Produces:
#   n_chunks    -- number of dataframes pickled to PATH_OUT (read back later)
#   Total_count -- Counter of site name -> number of exploded rows
#   top_sites   -- the (up to) 100 most frequent site names, most frequent first
n_chunks = 0
Total_count = Counter()

# The output file is opened once, in 'wb' mode. Reopening it in append mode for
# every chunk meant that re-running this cell stacked a second copy of the data
# onto the existing file while n_chunks only counted the latest run.
# NOTE: PATH_OUT holds consecutive pickles, whatever its file extension says.
with pd.read_json(FILE, lines=True, compression='bz2', chunksize=100000) as df_reader, \
        open(PATH_OUT, 'wb') as d_file:
    # tqdm already reports per-chunk progress, so no extra print per chunk.
    for chunk in tqdm(df_reader, desc="cleaning chunks"):
        df_base_clean = rapid_clean(chunk)
        # Extract site name from dataframe. Called for its side effect only;
        # presumably this is what adds the 'sitenames' column -- TODO confirm.
        extract_name(df_base_clean)

        df_base_clean_exp = df_base_clean.explode(["sitenames", "urls"])

        pickle.dump(df_base_clean_exp, d_file)
        n_chunks += 1

        Total_count.update(df_base_clean_exp['sitenames'].tolist())

top_sites = [site for site, _ in Total_count.most_common(100)]

print("finished top sites")

### Filter and keep only quotes from top k sites

In [None]:
# Show the site names collected from Total_count.most_common(100),
# i.e. ordered from most to least frequent.
print(top_sites)

In [None]:
# Pickle save the top_sites for future use, so the ranking can be reloaded
# without repeating the chunked pass over the raw file.

with open(DATA_PATH + 'top_sites.pkl', 'wb') as output:
    pickle.dump(top_sites, output)

In [None]:
# Pickle open the top_sites.
# The list is loaded under a separate name (top_sites_unpkl), leaving the
# in-memory top_sites untouched; the cells below use the reloaded copy.

with open(DATA_PATH + 'top_sites.pkl', 'rb') as file:
    top_sites_unpkl = pickle.load(file)

In [None]:
# Sanity check: the reloaded list should match the top_sites printed above.
print(top_sites_unpkl)

If the next cell is too slow, we can always reduce the number of top sites :) 

In [None]:
# Keep only the first 10 entries of the reloaded list. These are the 10 most
# frequent sites provided the pickle was written from top_sites as built above.
# Changing the slice changes how many sites survive the filter in the next cell.
top_10_sites = top_sites_unpkl[:10]
top_10_sites

In [None]:
# Re-read the cleaned chunks pickled to PATH_OUT, keep only the rows whose
# sitenames value is in top_10_sites, write each filtered chunk to
# PATH_OUT_filter and assemble all of them into chunks_all_filtered.
#
# Relies on n_chunks (number of pickles in PATH_OUT) set by the cleaning cell in
# the same kernel session.
#
# Changes from the previous version:
#   * PATH_OUT_filter is opened once in 'wb' mode, so a re-run overwrites the
#     file instead of appending duplicates (no manual deletion needed).
#   * The filtered chunks are collected in a list and concatenated once;
#     DataFrame.append in a loop copies the whole frame on every iteration and
#     no longer exists in pandas >= 2.0.
#   * trange replaces the hand-rolled counter and progress print.

# Seed with an empty frame carrying the expected columns so the result keeps
# this column order (and is well-formed even when n_chunks is 0).
filtered_parts = [
    pd.DataFrame(columns=['quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences',
                          'probas', 'urls', 'phase', 'sitenames'])
]
n_chunks_filtered = 0

with open(PATH_OUT, 'rb') as d_file, open(PATH_OUT_filter, 'wb') as d_file_out:
    for chunk_nbr in trange(n_chunks, desc="filtering chunks"):
        # Each pickle.load consumes exactly one dataframe from the stream.
        chunk = pickle.load(d_file)
        chunk_filtered = chunk[chunk.sitenames.isin(top_10_sites)]

        pickle.dump(chunk_filtered, d_file_out)
        n_chunks_filtered += 1
        filtered_parts.append(chunk_filtered)

# Row indices are kept as they were in each chunk, as before.
chunks_all_filtered = pd.concat(filtered_parts)

In [None]:
# Save as pickle for future use
with open(DATA_PATH + 'chunks_all_filtered.pkl', 'wb') as output:
    pickle.dump(chunks_all_filtered, output)

# The reload snippet below used to be a triple-quoted string, which Jupyter
# evaluates and echoes as the cell's output; it is now a real comment.
# To reload the pickled dataframe in a later session instead of recomputing it:
# with open(DATA_PATH + 'chunks_all_filtered.pkl', 'rb') as pkl_file:
#     chunks_all_filtered_unpkl = pickle.load(pkl_file)

In [None]:
# Narrow the filtered quotes to the columns needed downstream, then group them
# by (speaker, qids, quoteID). The result is a lazy GroupBy object; nothing is
# aggregated in this cell.
gb_all_filtered = (
    chunks_all_filtered[["speaker", "qids", "urls", "quoteID", "quotation", "date"]]
    .groupby(["speaker", "qids", "quoteID"])
)

In [None]:
# For each (speaker, qids, quoteID) group, collect its urls values into one
# Python list, giving a Series indexed by the group keys.
df_filtered = gb_all_filtered["urls"].apply(list)
# reset_index turns the group keys back into ordinary columns next to 'urls'.
df_filtered_final = df_filtered.reset_index()

In [None]:
# Inspect the result: one row per (speaker, qids, quoteID) with its list of urls.
df_filtered_final

In [None]:
# Save the grouped quotes as a pickle for future use
with open(DATA_PATH + 'df_filtered_final.pkl', 'wb') as output:
    pickle.dump(df_filtered_final, output)

# The reload snippet below used to be a triple-quoted string, which Jupyter
# evaluates and echoes as the cell's output; it is now a real comment.
# To reload the pickled dataframe in a later session instead of recomputing it:
# with open(DATA_PATH + 'df_filtered_final.pkl', 'rb') as pkl_file:
#     df_filtered_unpkl = pickle.load(pkl_file)

### Create a dictionary of categories and associated synonyms

In [None]:
# Category name -> seed keywords for that category. This dictionary is handed
# to Chunk_url_extract further down together with the quotes' urls.
# (No backslash continuations needed: the braces already span the lines.)
matchers = {
    "art": ["art", "paint", "draw", "museum"],
    "business": ["business", "finance", "economy", "commerce", "bank", "money", "trade"],
    "entertainment": ["entertainment"],
    "fashion": ["fashion", "couture", "designer"],
    "medicine": ["medicine", "health", "pharmacy", "wellbeing", "body"],
    "music": ["music", "song", "album", "concert"],
    "politics": ["politics", "government"],
    "science": ["science", "research"],
    "sport": ["sport", "football", "athletics", "swimming", "rugby", "tennis", "volleyball", "ski"],
}

# Find general form for categories and words.
# The return value is not captured, so generalizeDictionary presumably rewrites
# matchers in place -- TODO confirm against Data_clean_functions.
generalizeDictionary(matchers)

### Extract information from URL

In [None]:
# Chunk_url_extract is not defined in this notebook; it receives the grouped
# quotes (with their url lists) and the category matchers.
# Presumably it tags each quote with categories found in its urls -- TODO confirm.
df_extract = Chunk_url_extract(df_filtered_final, matchers)

In [None]:
# Save the URL-extraction result as a pickle for future use
with open(DATA_PATH + 'df_extract.pkl', 'wb') as output:
    pickle.dump(df_extract, output)

# The reload snippet below used to be a triple-quoted string, which Jupyter
# evaluates and echoes as the cell's output; it is now a real comment.
# To reload the pickled dataframe in a later session instead of recomputing it:
# with open(DATA_PATH + 'df_extract.pkl', 'rb') as pkl_file:
#     df_extract_unpkl = pickle.load(pkl_file)

### Add information from wikidata

In [None]:
# Load the Wikidata helper object from PATH_WIKIDATA_UTILS.
# SECURITY NOTE: this pickle is downloaded rather than produced by this notebook;
# pickle.load can execute arbitrary code, so only load a copy from a trusted source.

with open(PATH_WIKIDATA_UTILS, 'rb') as input_file:
    Wikidata_utils = pickle.load(input_file)

In [None]:
# merge_quotes_wikidata is not defined in this notebook; it receives the loaded
# Wikidata helper object and the extracted quotes.
# Presumably it joins speaker attributes onto the quotes via qids -- TODO confirm.
df_2019 = merge_quotes_wikidata(Wikidata_utils, df_extract)

In [None]:
# Persist the merged dataframe, the final output of this notebook, as a pickle.
with open(DATA_PATH + 'df_2019_no_media.pkl', 'wb') as output:
    pickle.dump(df_2019, output)