In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import bz2
import json
from tld import get_tld
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
import math
from collections import Counter
from operator import itemgetter
import re
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lisalaurent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lisalaurent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### CREATE PATHS

In [2]:
DATA_PATH = './Data/'
FILE2016 = DATA_PATH + 'quotes-2016.json.bz2'
PATH_OUT = DATA_PATH + 'clean-quotes-2016.json.bz2'
#FILE2020 = DATA_PATH + 'quotes-2020.json.bz2'
#df_base = pd.read_json(FILE2016, lines=True, compression='bz2', nrows=10000)

In [None]:
df_base = pd.read_json(FILE2016, lines=True, compression='bz2', nrows=100)
df_base

## Create functions 

In [3]:
def get_sitename(url):
    res = get_tld(url, as_object=True)
    return res.domain

In [4]:
def extract_name(df): 
    df['sitenames'] = np.nan
    for row in range(df.shape[0]):
        df['sitenames'][row] = [get_sitename(site) for site in df['urls'][row]] 

In [5]:
def get_domain(url):
    res = get_tld(url, as_object=True)
    return res.tld

In [6]:
def get_sitename(url):
    res = get_tld(url, as_object=True)
    return res.domain

In [7]:
stemmer = PorterStemmer()
def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation]) # get the text without punctuation
    tokens = nltk.word_tokenize(text) #Tokenizers divide strings into lists of substrings
    return " ".join([stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words('english')]) #stem all words

In [8]:
def generalizeDictionary(matchers):
    for category in matchers:
        for i in range(len(matchers[category])):
            matchers[category][i] = tokenize(matchers[category][i])

In [9]:
def getWordsFromURL(url):
    return re.compile(r'\W+',re.UNICODE).split(url)

In [10]:
def classify(matchers, url): #Give it an already generalized dictionary
    tag_found = []
    general_url = [tokenize(x) for x in getWordsFromURL(url)]
    for category in matchers:
        for i in range(len(matchers[category])):
            match = matchers[category][i]
            if match in general_url:
                tag_found.append(category)
    return tag_found # or you can "return None"

### PROCESSING DATA

In [11]:
matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"], "sport": ["sport", "football"]}
generalizeDictionary(matchers)

In [20]:
def process_chunk_complete(chunk, threshold_probas):
    print(f'Processing chunk with {len(chunk)} rows')
    #DATA CLEANING
    #Remove None speakers
    chunk = chunk.drop(chunk[chunk['speaker']=='None'].index)
    chunk = chunk.reset_index(drop=True)
    
    #Remove none unique ids and keep the first one
    if not(chunk.quoteID.is_unique):
        chunk = chunk.drop_duplicates(subset=['quoteID'], keep='first')
        
    #Remove nan or empty quotes 
    chunk = chunk.drop(chunk[chunk["quotation"].isna() | chunk["quotation"].isnull()].index)
    chunk = chunk.reset_index(drop=True)

    #Remove speakers for which probability is lower than a threshold
        #Gather the first probability for each row
    string_probas, nber_probas = [], []
    string_probas = [chunk['probas'][i][0][1] for i in chunk.index]
    nber_probas = list(map(float, string_probas))
    series_probas = pd.Series(nber_probas, dtype='float64', index=chunk.index)
        #Check if the probability is larger than the threshold, if not remove corresponding index
    chunk = chunk.drop(series_probas[series_probas < threshold_probas].index)
    chunk = chunk.reset_index(drop=True)
    
    #URLS DATA EXTRACTION
    tags_column = [] # List of lists. Each list inside corresponds to a row. Will become the 'tag' column
    site_column = []
    domain_column = []
    for index, row in chunk.iterrows():
            tags = [] #tags for all the urls in that row
            domains = []
            sitenames = []
            for url in row['urls']:
                #Extract data
                tld = get_domain(url)
                name = get_sitename(url)
                categories = classify(matchers, url)
                #Append data
                domains.append(tld)
                sitenames.append(name)
                tags.append(categories)
            tags_column.append(tags)
            site_column.append(sitenames)
            domain_column.append(domains)
    # Create new columns with new data
    chunk['sitenames'] = site_column
    chunk['domain'] = domain_column
    chunk['tags'] = tags_column
    
    tot_length = len(chunk)
    return chunk, tot_length

with pd.read_json(FILE2016, lines=True, compression='bz2', chunksize=1000) as df_reader:
    with bz2.open(PATH_OUT, 'wb') as d_file:
        for chunk in df_reader:
            chunk_cleaned, chunk_length = process_chunk_complete(chunk, 0.5)
            chunk_json = chunk_cleaned.to_json(orient='columns')
            #d_file.write((json.dumps(chunk_cleaned)+'\n').encode('utf-8'))
            d_file.write((chunk_json+'\n').encode('utf-8'))

Processing chunk with 1000 rows
Processing chunk with 1000 rows
Processing chunk with 1000 rows


KeyboardInterrupt: 

In [22]:
df_base = pd.read_json(DATA_PATH + 'clean-quotes-2016.json.bz2', lines=True, compression='bz2', nrows=3000)
df_base

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,sitenames,domain,tags
0,"{'0': '2016-12-26-000040', '1': '2016-07-31-00...",{'0': '[ ] and Chris [ Jones ] were in there a...,"{'0': 'Andy Reid', '1': 'Mike Howe', '2': 'Hil...","{'0': ['Q2622812', 'Q27830815', 'Q470738', 'Q4...","{'0': 1482782700000, '1': 1469953332000, '2': ...","{'0': 1, '1': 2, '2': 1, '3': 1, '4': 1, '5': ...","{'0': [['Andy Reid', '0.9432'], ['None', '0.05...",{'0': ['http://www.kcchiefs.com/news/article-2...,"{'0': 'E', '1': 'E', '2': 'E', '3': 'E', '4': ...","{'0': ['kcchiefs'], '1': ['peninsuladailynews'...","{'0': ['com'], '1': ['com', 'com'], '2': ['com...","{'0': [[]], '1': [[], []], '2': [[]], '3': [[]..."
1,"{'0': '2016-09-16-022093', '1': '2016-08-05-01...",{'0': 'everyone knows the professional hatred ...,"{'0': 'DeAngelo Williams', '1': 'Karen Olson',...","{'0': ['Q3020040'], '1': ['Q6369926'], '2': ['...","{'0': 1474044154000, '1': 1470355200000, '2': ...","{'0': 1, '1': 1, '2': 29, '3': 1, '4': 1, '5':...","{'0': [['DeAngelo Williams', '0.9444'], ['None...",{'0': ['http://wcpo.com/sports/football/bengal...,"{'0': 'E', '1': 'E', '2': 'E', '3': 'E', '4': ...","{'0': ['wcpo'], '1': ['kuow'], '2': ['csmonito...","{'0': ['com'], '1': ['org'], '2': ['com', 'com...","{'0': [['sport', 'sport']], '1': [[]], '2': [[..."
