In [221]:
import os
import sys
import glob
import json
import requests
import pandas as pd
import pandas_gbq as gbq
from pathlib import Path
from IPython.core.debugger import set_trace
from tqdm.auto import tqdm
tqdm.pandas()

# Bigquery config

In [222]:
proj_id = 'nlp-project-253707'
table_id = 'news_dataset.news'

# Tagged news from Bigquery

In [223]:
sql = '''
SELECT url
FROM `nlp-project-253707.news_dataset.news`
'''

tagged_news = gbq.read_gbq(sql, project_id=proj_id)
tagged_news_url = list(tagged_news.url)

# check uniqueness of urls
assert len(tagged_news_url) == len(set(tagged_news_url)), 'url duplicated'

AssertionError: url duplicated

In [224]:
len(tagged_news)

7489

# Listing texts & urls to tag

In [116]:
def extract_to_tag(urls_asis, n_tag=500, where='newsdata/downloaded/*.json'):
    '''
    n_tag=500: uclassify가 허용하는 최대 text수가 500개임
    '''
    
    urls = []
    texts = []
    
    n = 0
    for fname in glob.glob(where):
        js = json.loads(Path(fname).read_text())

        if js['url'] not in urls_asis:        
            texts.append(js['text'])
            urls.append(js['url'])
            n += 1
            print('\r{}'.format(n), end='')
            if n == n_tag: break
                
    return texts, urls

In [117]:
texts, urls = extract_to_tag(tagged_news_url)

500

# Classification using uClassify
https://www.uclassify.com

In [118]:
apikeys = {
            'gem763': 'YbEhDr9hAYmd', 
    'projectester01': 'PUcCZkyO5wPs', # google 로그인해야함
    'projectester02': 'f2ISRvxphKf2',
    'projectester03': 'ebAUBLTIcdna',
    'projectester04': 'UTuQWyexBdOP',
    'projectester05': 'nfsPpO8ivVaR',
    'projectester06': '4659gHVFX8CQ',
    'projectester07': 'UljdQT1tzeVs',
    'projectester08': 'RWh9JGLYRho5',
    'projectester09': 'EHjhm6nONJv4',
    'projectester10': 'QZlAZF62SaEc',
    'projectester11': 'zxJQoBwzi1Bw',
    'projectester12': 'uZRjYK6Vx5aC',
    'projectester13': 'Vjk2T02S0Mvz',
    'projectester14': 'Z46sXa4fYKyO',
    'projectester15': 'EwLxqluy7WcY',
    'projectester16': 'NCVDU2DMxEXB',
    'projectester17': 'xUMOMxOIoWUY',
    'projectester18': 'B93XpEXzZsfa',
    'projectester19': 'vLuLVnQ4sCmM',
    'projectester20': 'ACIHqZxBXEBO',
}

In [119]:
def uclassify(_id, texts):
    headers = {'Authorization':'Token {}'.format(apikeys[_id])}
    endpoint = 'https://api.uclassify.com/v1/uclassify/IAB Taxonomy v2/classify'
    data = json.dumps({'texts': texts})

    return requests.post(endpoint, headers=headers, data=data)


def uclassify_response_to_dataframe(u_resp, urls, texts):
    records = []
    updated_at = str(pd.Timestamp.utcnow().date())

    for i, resp in enumerate(tqdm(u_resp.json())):
        cl = max(resp['classification'], key=lambda cand:cand['p'])    
        records.append({
            'url': urls[i], 
            'text': texts[i], 
            'uclass': cl['className'].split('_')[0], 
            'prob': cl['p'], 
            'updated_at': updated_at, 
        })

    return pd.DataFrame(records)

In [120]:
response = uclassify('projectester13', texts)

In [156]:
df = uclassify_response_to_dataframe(response, urls, texts); df

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




Unnamed: 0,prob,text,uclass,updated_at,url
0,0.941261,But what if the way we think about paying our ...,business and finance,2019-09-26,https://politico.com/interactives/2019/how-to-...
1,1.000000,“Baby Fed” has finally broken through.\n\nForm...,sports,2019-09-26,https://wsj.com/articles/roger-federer-stunned...
2,0.998915,"Air France-KLM shares AF, -8.72% plunged in ea...",business and finance,2019-09-26,https://marketwatch.com/story/air-france-klm-s...
3,1.000000,"MANCHESTER, England (Reuters) - Australia reta...",sports,2019-09-26,https://in.reuters.com/article/cricket-ashes/c...
4,0.372202,Regular Business Insider readers know that eve...,automotive,2019-09-26,https://businessinsider.com/elon-musk-rivian-f...
5,0.998945,Brexit minister Barclay says UK making progres...,travel,2019-09-26,https://investing.com/news/forex-news/brexit-m...
6,0.722439,"The cyclist on the birth of his daughter, his ...",hobbies and interests,2019-09-26,https://theguardian.com/lifeandstyle/2019/sep/...
7,0.771707,"The latest curious, dress code-related story t...",sports,2019-09-26,https://news.yahoo.com/alaska-swimsuit-scandal...
8,0.907942,Clearing up murky goals can lead to concrete r...,religion and spirituality,2019-09-26,https://yahoo.com/lifestyle/horoscope/Leo/dail...
9,1.000000,"MANCHESTER, England (Reuters) - Australia reta...",sports,2019-09-26,https://uk.reuters.com/article/uk-cricket-ashe...


# Bigquery update

In [122]:
gbq.to_gbq(df, table_id, project_id=proj_id, if_exists='append')

1it [00:09,  9.37s/it]


In [137]:
sql = '''
SELECT *
FROM `nlp-project-253707.news_dataset.news`
'''

test = gbq.read_gbq(sql, project_id=proj_id)

In [218]:
test

Unnamed: 0,prob,text,uclass,updated_at,url
0,0.292673,\n\nChris Sharp of the Fairfax County Police D...,pets,2019-09-26,https://washingtonpost.com/local/public-safety...
1,0.472381,Dir: Roger Hinze and Michael William Miles. Fe...,movies,2019-09-26,https://independent.co.uk/arts-entertainment/f...
2,0.621441,"Horror film ""IT Chapter Two"" stayed atop the N...",movies,2019-09-26,https://businesstimes.com.sg/life-culture/it-s...
3,0.993574,The 2019 Emmys are officially here.\n\nThe 71s...,movies,2019-09-26,https://yahoo.com/entertainment/emmys-2019-win...
4,0.293006,Eddie Murphy in Netflix’s Dolemite Is My Name ...,movies,2019-09-26,https://vulture.com/2019/09/tiff-screening-ven...
5,0.961838,This article originally appeared on VICE US.\n...,movies,2019-09-26,https://vice.com/en_asia/article/kz4bvw/mike-f...
6,0.434689,Image copyright Alamy Image caption Movie rema...,movies,2019-09-26,https://bbc.co.uk/news/newsbeat-49755700
7,0.670388,Angelina Jolie is a busy Hollywood actress and...,movies,2019-09-26,https://yahoo.com/entertainment/angelina-jolie...
8,0.511823,MEXICO CITY (Reuters) - Mexico's president on ...,sports,2019-09-26,https://businessinsider.com/mexico-president-s...
9,0.966641,MEXICO CITY (Reuters) - Mexican President Andr...,sports,2019-09-26,https://reuters.com/article/us-mexico-constell...


In [220]:
test[test.duplicated('url')]

Unnamed: 0,prob,text,uclass,updated_at,url
807,0.941261,But what if the way we think about paying our ...,business and finance,2019-09-26,https://politico.com/interactives/2019/how-to-...
