# uClassify로 뉴스를 분류하고 BigQuery에 저장

# Objective
* 처음에는 이미 Category가 tagging되어 있는 BBC 뉴스 데이터셋을 이용하여 classifier를 만들려고 했다.
* 그런데 1) 해당 데이터셋의 크기가 작다는 점과 2) BBC 뉴스만 포함한다는 점 때문에 다른 방법을 찾아보았다.
* uClassify의 IAB Taxonomy가 꽤 괜찮다는 생각에 아래와 같이 진행했다.
    * uClassify의 성능이 매우 좋다는 가정하에
    * 실제로 다운로드한 뉴스를 uClassify로 tagging (2019.09.28 기준 8만 개 이상)
    * tagging된 해당 데이터셋을 학습시켜 classifier 생성

# 이 노트북의 방식은 drop한다: 2019.09.30
* uClassify의 성능이 예상보다 낮은 것 같고,
* 완벽하지 않은 classifier로 tagging한 결과를 다시 학습시키는 것은 의미가 없어 보임
* 차라리 기존의 BBC 뉴스 classifier를 활용하고, 뉴스 분류 결과는 참고용으로만 쓰도록 유도하는 방향으로 하자

In [2]:
import os
import sys
import glob
import json
import requests
import pandas as pd
import pandas_gbq as gbq
from pathlib import Path
from IPython.core.debugger import set_trace
from tqdm.auto import tqdm
tqdm.pandas()

# Bigquery config

In [3]:
# Project ID passed to pandas-gbq for every BigQuery call in this notebook.
proj_id = "nlp-project-253707"
# Table that tagged news rows are read from and appended to.
table_id = "news_dataset.news"

# Tagged news from Bigquery

In [181]:
# Fetch the URL of every row already stored in the table so those articles
# can be skipped when choosing what to tag next. The table name is built from
# `proj_id` / `table_id` so the query cannot drift away from the table that
# results are appended to later.
sql = '''
SELECT url
FROM `{}.{}`
'''.format(proj_id, table_id)

tagged_news = gbq.read_gbq(sql, project_id=proj_id)
tagged_news_url = list(tagged_news['url'])

# check uniqueness of urls
assert len(tagged_news_url) == len(set(tagged_news_url)), 'url duplicated'

In [182]:
# Number of rows returned by the query above (one per stored URL).
len(tagged_news)

18000

# Listing texts & urls to tag

In [168]:
def extract_to_tag(urls_asis, n_tag=500, where='newsdata/downloaded/*.json'):
    '''Collect downloaded articles whose URLs are not in `urls_asis`.

    Each file matched by `where` is parsed as JSON and must hold `url` and
    `text` keys. An article is selected when its URL is neither in
    `urls_asis` nor already selected from an earlier file, so the returned
    URLs are unique. A running count is printed as articles are selected.

    Args:
        urls_asis: Iterable of URLs to skip.
        n_tag: Stop once exactly this many articles have been selected.
            The default of 500 is, per the original author's note, the
            maximum number of texts uClassify accepts.
        where: Glob pattern of the JSON files to scan.

    Returns:
        A `(texts, urls)` tuple of equally long lists; `texts[i]` is the
        `text` of the article whose `url` is `urls[i]`.
    '''
    # A set gives O(1) membership tests instead of scanning a list for every
    # file, and also records URLs picked during this call so that the same
    # article stored in two files is not selected twice.
    seen = set(urls_asis)

    urls = []
    texts = []

    for fname in glob.glob(where):
        # Explicit encoding: the platform default is locale-dependent.
        js = json.loads(Path(fname).read_text(encoding='utf-8'))

        url = js['url']
        if url in seen:
            continue

        seen.add(url)
        texts.append(js['text'])
        urls.append(url)
        print('\r{}'.format(len(urls)), end='')
        if len(urls) == n_tag:
            break

    return texts, urls

In [169]:
# Select downloaded articles whose URLs are not yet in BigQuery, using the
# default `n_tag` limit and the default download directory.
texts, urls = extract_to_tag(tagged_news_url)

500

# Classification using uClassify
https://www.uclassify.com

In [170]:
# uClassify API keys, indexed by account name; `uclassify()` looks up the key
# to send by this name.
# NOTE(review): these are live credentials committed in plain text. Consider
# loading them from an untracked file or environment variables and rotating
# the keys below.
apikeys = {
            'gem763': 'YbEhDr9hAYmd', 
    'projectester01': 'PUcCZkyO5wPs', # requires Google login
    'projectester02': 'f2ISRvxphKf2',
    'projectester03': 'ebAUBLTIcdna',
    'projectester04': 'UTuQWyexBdOP',
    'projectester05': 'nfsPpO8ivVaR',
    'projectester06': '4659gHVFX8CQ',
    'projectester07': 'UljdQT1tzeVs',
    'projectester08': 'RWh9JGLYRho5',
    'projectester09': 'EHjhm6nONJv4',
    'projectester10': 'QZlAZF62SaEc',
    'projectester11': 'zxJQoBwzi1Bw',
    'projectester12': 'uZRjYK6Vx5aC',
    'projectester13': 'Vjk2T02S0Mvz',
    'projectester14': 'Z46sXa4fYKyO',
    'projectester15': 'EwLxqluy7WcY',
    'projectester16': 'NCVDU2DMxEXB',
    'projectester17': 'xUMOMxOIoWUY',
    'projectester18': 'B93XpEXzZsfa',
    'projectester19': 'vLuLVnQ4sCmM',
    'projectester20': 'ACIHqZxBXEBO',
}

In [171]:
def uclassify(_id, texts, timeout=120):
    '''POST texts to uClassify's "IAB Taxonomy v2" classify endpoint.

    Args:
        _id: Key of the module-level `apikeys` dict selecting which API key
            is sent in the `Authorization` header.
        texts: Texts to classify; sent together as the JSON body
            `{"texts": texts}` of a single request.
        timeout: Seconds `requests` waits for the server before raising a
            timeout error. Pass `None` to wait indefinitely, which was the
            behaviour before this parameter existed.

    Returns:
        The `requests` response object of the POST. The HTTP status is not
        checked here.

    Raises:
        KeyError: If `_id` is not a key of `apikeys`.
    '''
    headers = {'Authorization': 'Token {}'.format(apikeys[_id])}
    endpoint = 'https://api.uclassify.com/v1/uclassify/IAB Taxonomy v2/classify'
    data = json.dumps({'texts': texts})

    # Without a timeout a stalled connection would block the notebook forever.
    return requests.post(endpoint, headers=headers, data=data, timeout=timeout)


def uclassify_response_to_dataframe(u_resp, urls, texts):
    '''Build a DataFrame with the top uClassify class for each article.

    The decoded response is expected to be a list with one item per input
    text, in input order, where each item has a `classification` list of
    `{'className': ..., 'p': ...}` candidates. For each item the candidate
    with the highest `p` is kept.

    Args:
        u_resp: Response object returned by `uclassify()`; only its
            `.json()` method is used.
        urls: URLs of the classified articles, in request order.
        texts: Texts that were sent for classification, in request order.

    Returns:
        A DataFrame with one row per article holding `url`, `text`,
        `uclass` (the part of the winning class name before the first
        `_`), `prob` (its `p` value) and `updated_at` (the UTC time of this
        call as a string, identical for all rows).

    Raises:
        ValueError: If the decoded response is not a list, or its length
            differs from `urls` / `texts`. Pairing results with articles by
            position would then be impossible or silently wrong.
    '''
    results = u_resp.json()

    # An API error is not a list of results; fail with the payload visible
    # instead of an obscure TypeError further down.
    if not isinstance(results, list):
        raise ValueError('unexpected uClassify response: {!r}'.format(results))
    if not len(results) == len(urls) == len(texts):
        raise ValueError(
            'length mismatch: {} results, {} urls, {} texts'.format(
                len(results), len(urls), len(texts)))

    records = []
    updated_at = str(pd.Timestamp.utcnow())

    for url, text, resp in tqdm(zip(urls, texts, results), total=len(results)):
        cl = max(resp['classification'], key=lambda cand: cand['p'])
        records.append({
            'url': url,
            'text': text,
            'uclass': cl['className'].split('_')[0],
            'prob': cl['p'],
            'updated_at': updated_at,
        })

    return pd.DataFrame(records)

In [172]:
# Classify all selected texts in one request, authenticating with the
# 'projectester18' entry of `apikeys`.
response = uclassify('projectester18', texts)

In [173]:
# Convert the API response to a DataFrame; the bare `df` expression on the
# last line makes the notebook display it.
df = uclassify_response_to_dataframe(response, urls, texts)
df

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




Unnamed: 0,prob,text,uclass,updated_at,url
0,0.999896,"When it comes to 5G stocks, I prefer a simple,...",automotive,2019-09-29 08:33:36.074966+00:00,https://finance.yahoo.com/news/5g-stocks-bigge...
1,0.619169,"Sheldon Whitehouse, a Democrat, represents Rho...",news and politics,2019-09-29 08:33:36.074966+00:00,https://washingtonpost.com/opinions/the-suprem...
2,0.576536,Despite beating expectations in its first quar...,personal finance,2019-09-29 08:33:36.074966+00:00,https://news.yahoo.com/why-slacks-become-a-vic...
3,0.987763,Image copyright EPA Image caption Germany's Ur...,news and politics,2019-09-29 08:33:36.074966+00:00,https://bbc.com/news/world-europe-49646809
4,0.721777,UK car production declined for the 14th consec...,news and politics,2019-09-29 08:33:36.074966+00:00,https://independent.co.uk/news/business/news/u...
5,0.768826,"After backing Bernie Sanders in 2016, the Work...",news and politics,2019-09-29 08:33:36.074966+00:00,https://thedailybeast.com/working-families-par...
6,0.708656,Charli XCX has opened up about the stigma surr...,pop culture,2019-09-29 08:33:36.074966+00:00,https://independent.co.uk/life-style/health-an...
7,0.985412,New wave singer behind hits including Good Tim...,music and audio,2019-09-29 08:33:36.074966+00:00,https://theguardian.com/music/2019/sep/16/ric-...
8,0.980354,"On Wednesday, the U.S. Federal Reserve conclud...",personal finance,2019-09-29 08:33:36.074966+00:00,https://realmoney.thestreet.com/investing/fixe...
9,0.248107,“We don’t usually look to Washington to solve ...,business and finance,2019-09-29 08:33:36.074966+00:00,https://politico.com/news/2019/09/28/ransomwar...


# Bigquery update

In [174]:
# Append the newly tagged rows to the table (`if_exists='append'` adds to
# the existing table rather than failing or replacing it).
gbq.to_gbq(df, table_id, project_id=proj_id, if_exists='append')

1it [00:00,  1.78it/s]
