In [71]:
import os
import sys
import glob
import json
import requests
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.abspath(os.path.join(os.getcwd(), 'nlp-project-253707-ed480ed908be.json'))

In [19]:
files = glob.glob('newsdata/downloaded/*.json')

In [32]:
import six
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

text = json.loads(Path(files[100]).read_text())['text']
client = language.LanguageServiceClient()

if isinstance(text, six.binary_type):
    text = text.decode('utf-8')

document = types.Document(
    content=text.encode('utf-8'),
    type=enums.Document.Type.PLAIN_TEXT)

categories = client.classify_text(document).categories

for category in categories:
    print(u'=' * 20)
    print(u'{:<16}: {}'.format('name', category.name))
    print(u'{:<16}: {}'.format('confidence', category.confidence))

name            : /Science/Engineering & Technology
confidence      : 0.9900000095367432
name            : /Business & Industrial/Aerospace & Defense/Space Technology
confidence      : 0.9700000286102295


In [32]:
from google.cloud import language, bigquery
from google.cloud.language import enums, types

In [14]:
nl_client = language.LanguageServiceClient()
bq_client = bigquery.Client()

In [15]:
table = bq_client.get_table('nlp-project-253707.news_dataset.news')

In [47]:
def classify(text):
    doc = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT)
    return nl_client.classify_text(doc).categories

In [48]:
records = []
updated_at = str(pd.Timestamp.utcnow().date())

for f in tqdm(files[:10]):
    article = json.loads(Path(f).read_text())
    url = article['url']
    text = article['text']
    resp = classify(text)
    
    if len(resp) > 0:
        cat = resp[0]
        records.append({
            'url': url, 
            'text': text, 
            'updated_at': updated_at, 
            'googlecat': cat.name,
            'confidence': cat.confidence
        })
    
    else:
        print(url)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [49]:
records

[{'url': 'https://us.cnn.com/2019/09/11/politics/donald-trump-vape-e-cigarette-flavors/index.html',
  'text': '(CNN) President Donald Trump said Wednesday that the US Food and Drug Administration would be putting out "some very strong recommendations" regarding the use of flavored e-cigarettes in "a couple of weeks."\n\nHealth and Human Services Secretary Alex Azar, seated in the Oval Office with the President, first lady Melania Trump and the acting commissioner of the FDA, announced that newly proposed enforcement policy would require flavored e-cigarette companies to take their products off the market.\n\n"It\'ll take several weeks for us to put out the final guidance that will announce all the parameters around the enforcement policy, and then there will likely be about a 30-day delay to effective date, as is customary," Azar said. But "at that point all flavored e-cigarettes other than tobacco flavor would have to be removed from the market."\n\nBy May 20, he said, e-cigarette com

In [46]:
classify('NEW DELHI (Reuters) - India banned the sale of electronic cigarettes on Wednesday and warned of an “epidemic” among young people, in the latest and potentially biggest move globally against vaping over growing health concerns.\n\nThe ban cuts off a huge future market from e-cigarette makers at a time when the number of people smoking worldwide is declining. It could dash the expansion plans of companies such as Juul Labs and Philip Morris International in the country.\n\n“These novel products come with attractive appearances and multiple flavours and their use has increased exponentially and acquired epidemic proportions in developed countries, especially among youth and children,” India’s health ministry said.\n\nThe ban also covers the production, import and advertising of e-cigarettes - but not the use of them. It comes at a time when vaping is facing increased scrutiny in other countries.\n\nThe United States last week announced plans to remove flavoured e-cigarettes from stores, warning that sweet flavours had drawn millions of children into nicotine addiction.\n\nThe Indian prohibition will be imposed through an executive order and will include jail terms of up to three years for offenders.\n\nIndia has 106 million adult smokers, second only to China in the world, making it a lucrative market for companies making vaping products such as U.S.-based Juul and Philip Morris, which manufactures a heat-not-burn tobacco device.\n\nThe ban was announced by Finance Minister Nirmala Sitharaman at a news conference, where she showed various types of products to the media, including a Juul vaping device, which resembles a USB flash drive.\n\nJuul had plans to launch its e-cigarette in India and has hired several senior executives in recent months. Philip Morris also has plans to launch its heat-not-burn smoking device in India, Reuters has reported.\n\nA spokeswoman for Juul in India declined to comment. Philip Morris did not respond to a request for comment.\n\nJuul, in which tobacco giant Altria group owns a 35% stake, is already facing government scrutiny in its home market and elsewhere. In China, Juul said on Tuesday its products were not currently available on e-commerce websites, days after it entered the market.\n\nHEALTH DISPUTE\n\nIndia’s vapour-products market was valued at $57 million in 2018, according to data from Euromonitor International. Before the ban, the research group estimated the market in India would grow by nearly 60 percent a year up to 2022.\n\nA Juul e-cigarette and pods are seen in this picture illustration taken September 16, 2018. REUTERS/Ronen Zvulun/Files\n\nShane MacGuill, head of tobacco research at Euromonitor in London, said the India ban could push other countries to follow suit, hurting the global vapour industry.\n\n“India is obviously a market of huge potential for vapour products,” he told Reuters. “This ban would decisively cut off access to that potential growth cohort for companies like Juul Labs and PMI (Philip Morris International).”\n\nThe global market for e-cigarettes is still small compared to cigarettes, but is growing rapidly. Last year global cigarette sales totalled more than $713 billion, compared to $15.7 billion for vapour products, according to Euromonitor. By 2023, the vapour category is projected to more than double to $40 billion, while cigarettes are expected to decline slightly.\n\nAdvocates for e-cigarettes say vaping, which usually involves inhaling a vapour formed from heating up a liquid containing nicotine, is far less harmful than smoking tobacco.\n\nBut many tobacco-control activists are opposed to the devices, saying they could lead to nicotine addiction and push people towards consuming tobacco.\n\nMore than 900,000 people die each year due to tobacco-related illnesses in India, home to about 1.3 billion people.\n\nThe Association of Vapers India, an organisation that represents e-cigarette users across the country, attacked the government’s decision, saying it would deprive millions of smokers of a safer solution to cut back on smoking.\n\nCOURT CHALLENGE?\n\nThe ban order will impose a jail term of up to one year and a fine of 100,000 rupees ($1,404) for first-time offenders. A repeat violation would attract a jail term of up to three years and a penalty of up to 500,000 rupees, the government said.\n\nThe ban would cover the manufacture, import, sale, advertisement and distribution of e-cigarettes. But it would not apply to the end users of such devices, Vikas Sheel, a senior official at India’s health ministry, told Reuters.\n\n“Over a period of time, people will not get their (vape) refills, so they will become responsible,” he said.\n\nThe government expects the ban order to be challenged in court, but was prepared to defend its decision, Sheel added.\n\nShares in ITC, India’s top cigarette maker, ended up nearly 1%, while its rival Godfrey Phillips surged 5.2%. E-cigarettes form just a tiny part of their product range.\n\nA man holds an electronic cigarette as he vapes at a Vape Shop in Monterrey, Mexico February 1, 2019. REUTERS/Daniel Becerril/Files\n\nThe ban order needs to be approved by the president before it takes effect, but this is typically a formality.\n\nSuch executive orders are typically issued as an emergency measure when Indian parliament is not in session. The ban order could lapse if it is not approved when lawmakers convene against in the next session of parliament, which will most likely be held in November.')

categories {
  name: "/Sensitive Subjects"
  confidence: 0.9599999785423279
}
categories {
  name: "/Health/Substance Abuse/Smoking & Smoking Cessation"
  confidence: 0.949999988079071
}
categories {
  name: "/Shopping/Tobacco Products"
  confidence: 0.8100000023841858
}

In [62]:
records[9]['googlecat']

'/Sensitive Subjects'

In [63]:
classify(records[9]['text'])

[name: "/Sensitive Subjects"
confidence: 0.9599999785423279
, name: "/Health/Substance Abuse/Smoking & Smoking Cessation"
confidence: 0.949999988079071
, name: "/Shopping/Tobacco Products"
confidence: 0.8100000023841858
]

In [100]:
records[8]['text']

'Xero has bolstered its machine learning capabilities for document processing and extraction using technology from Hubdoc, a company that the cloud accounting firm acquired last year for $70 million.\n\nAccording to Xero, the new machine learning capabilities means advisors will have access to more accurate information, gain deeper understanding of financial health, and the flow of data will become more automated.\n\nXero added it is working with advisors to pilot a short-term cash flow tool that uses data and advanced statistical methods to provide small businesses with a 30-day view of artificial intelligence (AI)-powered cash flow predictions, including impact of existing bills and invoices.\n\nIn addition, the company has introduced single sign-on, touting it as making it easier for developers to build on the Xero platform, onboard new users, and integrate Xero with certified third-party apps.\n\nSee also: How small businesses can deal with getting regulated (TechRepublic)\n\nUpdat

In [101]:
headers = {"Authorization":"Token YbEhDr9hAYmd"}
endpoint = 'https://api.uclassify.com/v1/uclassify/IAB Taxonomy/classify'
data = json.dumps({'texts': [records[8]['text']]})

response = requests.post(endpoint, headers=headers, data=data)
out = response.json();out

[{'textCoverage': 0.863481,
  'classification': [{'className': 'arts and entertainment_books and literature_1_1',
    'p': 5.15551e-09},
   {'className': 'arts and entertainment_celebrity fan and gossip_1_2',
    'p': 9.37527e-09},
   {'className': 'arts and entertainment_fine art_1_3', 'p': 2.4244e-11},
   {'className': 'arts and entertainment_humor_1_4', 'p': 1.37162e-10},
   {'className': 'arts and entertainment_movies_1_5', 'p': 1.27323e-05},
   {'className': 'arts and entertainment_music_1_6', 'p': 7.7933e-08},
   {'className': 'arts and entertainment_television_1_7', 'p': 1.96076e-06},
   {'className': 'automotive_auto parts_2_1', 'p': 9.6587e-13},
   {'className': 'automotive_auto repair_2_2', 'p': 1.77519e-08},
   {'className': 'automotive_buying and selling cars_2_3', 'p': 3.29982e-09},
   {'className': 'automotive_car culture_2_4', 'p': 8.73806e-08},
   {'className': 'automotive_certified pre owned_2_5', 'p': 1.22604e-20},
   {'className': 'automotive_convertible_2_6', 'p': 1

In [103]:
pd.DataFrame(out[0]['classification'])#.sort_values('p', ascending=False)

Unnamed: 0,className,p
0,arts and entertainment_books and literature_1_1,5.155510e-09
1,arts and entertainment_celebrity fan and gossi...,9.375270e-09
2,arts and entertainment_fine art_1_3,2.424400e-11
3,arts and entertainment_humor_1_4,1.371620e-10
4,arts and entertainment_movies_1_5,1.273230e-05
5,arts and entertainment_music_1_6,7.793300e-08
6,arts and entertainment_television_1_7,1.960760e-06
7,automotive_auto parts_2_1,9.658700e-13
8,automotive_auto repair_2_2,1.775190e-08
9,automotive_buying and selling cars_2_3,3.299820e-09
