In [487]:
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from time import sleep

In [488]:
df = pd.read_csv("./dataset/meta.csv")

In [489]:
df['text'] = df['title']
df['text'] = df['text'].apply(lambda x: x.split(" "))

In [490]:
def clean_list(list_to_clean):
    """
    Function to clean a list
    Removes any non-alphanumeric characters
    Stems words
    Gets rid of any empty elements in the list

    Param_1: List, containing strings
    Output_1: List, containing cleaned strings
    """
    stemmer = PorterStemmer()
    items_to_clean = set(list(stopwords.words('english')) + ['\n','\n\n','\n\n\n','\n\n\n\n','ocroutput','',' '])
    # Items to clean
    regex_non_alphanumeric = re.compile('[^0-9a-zA-Z]')  # REGEX for non alphanumeric chars
    for index,item in enumerate(list_to_clean):
        item = regex_non_alphanumeric.sub('', item)  # Filter text, remove non alphanumeric chars
        item = item.lower()  # Lowercase the text
#         item = stemmer.stem(item)  # Stem the text
        if len(item) < 3:  # If the length of item is lower than 3, remove item
            item = ''
        list_to_clean[index] = item  # Put item back to the list
    cleaned_list = [elem for elem in list_to_clean if elem not in items_to_clean]
    # Remove empty items from the list
    return cleaned_list

In [491]:
def remove_frequent_items(cleaned_list):
    """
    Remove frequently occured words

    Param_1: List of list containing strings
    Param_2: Above x percentage of occurance will be removed
    Output_1: Cleaned list
    """
    percentage=90
    treshold = int(len(cleaned_list) * percentage / 100)
    DF = defaultdict(int)
    for word in set(cleaned_list):
            DF[word] += 1
    words_to_remove = {k:v for k,v in DF.items() if v > treshold }
    # A new dictionary of items that only has count above treshold
    words_to_remove_as_list = set(words_to_remove.keys())
    freq_items_removed_book_word_list = []
    freq_items_removed_list = [word for word in cleaned_list if word not in words_to_remove_as_list]
    return freq_items_removed_list


In [492]:
df['text'] = df['text'].apply(clean_list)
df['text'] = df['text'].apply(remove_frequent_items)
df['text']

0     [video, sparks, fears, hong, kong, protesters,...
1                                      [woppa, twitter]
2     [intolerably, offensive, boys, nazi, costume, ...
3     [utah, elementary, student, wears, nazi, costu...
4     [principal, teacher, suspended, student, dress...
5     [man, arrested, killing, peeping, tom, fort, l...
6     [delray, man, arrested, 2018, peeping, tom, ki...
7     [florida, man, charged, beating, peeping, tom,...
8     [naked, florida, man, allegedly, beats, peepin...
9     [florida, peeping, tom, beaten, death, naked, ...
10    [two, arkansas, chemistry, professors, arreste...
11    [two, chemistry, professors, arrested, alleged...
12    [two, arkansas, chemistry, professors, arreste...
13    [chemistry, professors, charged, making, meth,...
14    [chemistry, professors, accused, making, meth,...
15    [bellevue, police, arrest, men, prostitution, ...
16    [seattle, police, captain, arrested, undercove...
17    [seattle, police, captain, arrested, under

In [493]:
df['text'] = df['text'].apply(lambda x: "+".join(x))
df['text']

0     video+sparks+fears+hong+kong+protesters+taiwan...
1                                         woppa+twitter
2     intolerably+offensive+boys+nazi+costume+elemen...
3     utah+elementary+student+wears+nazi+costume+hal...
4     principal+teacher+suspended+student+dresses+hi...
5      man+arrested+killing+peeping+tom+fort+lauderdale
6          delray+man+arrested+2018+peeping+tom+killing
7         florida+man+charged+beating+peeping+tom+death
8     naked+florida+man+allegedly+beats+peeping+tom+...
9     florida+peeping+tom+beaten+death+naked+man+saw...
10    two+arkansas+chemistry+professors+arrested+all...
11    two+chemistry+professors+arrested+allegedly+ma...
12    two+arkansas+chemistry+professors+arrested+all...
13    chemistry+professors+charged+making+meth+schoo...
14    chemistry+professors+accused+making+meth+unive...
15        bellevue+police+arrest+men+prostitution+sting
16    seattle+police+captain+arrested+undercover+vic...
17    seattle+police+captain+arrested+undercover

In [494]:
from pprint import pprint as pp
import requests
import json
tokens=['6e8b08ab29db6f672eecf1447815ea40']
url_base = "https://gnews.io/api/v3/search?q={}&token={}"
token_idx=0
d = list()
for i in df.index:
#     pp(df['title'][i])
    sleep(2)
    string = df['text'][i]
    print(string)
    url=url_base.format(string,tokens[token_idx])
    print(url)
    response = requests.get(url)
    print(response.status_code)
    if response.status_code == 409:
        token_idx +=1
        print("getting next token ",token_idx)
        if token_idx >= len(tokens):
            break
        url = url_base.format(string,tokens[token_idx])
        print("new url",url)
        response = requests.get(url)
    
    if response.status_code !=200:
        print("continuing for ",url," status code",response.status_code)
        continue
    j = json.loads(response.text)
    if len(j['articles']) is not 0:
        print(j['articles'])
        d += (j['articles'])

video+sparks+fears+hong+kong+protesters+taiwan+news
https://gnews.io/api/v3/search?q=video+sparks+fears+hong+kong+protesters+taiwan+news&token=6e8b08ab29db6f672eecf1447815ea40
200
woppa+twitter
https://gnews.io/api/v3/search?q=woppa+twitter&token=6e8b08ab29db6f672eecf1447815ea40
200
[{'title': 'Hong Kong’s fast-learning, dexterous protesters are stumped by Twitter', 'description': 'One of the major stumbling blocks for Hong Kongers taking to Twitter is language. One user, @woppa, a retired 40-year-old called Trevor who resides in Canada, started a hashtag that translates as “Get ...', 'url': 'https://qz.com/1698002/hong-kong-protesters-flock-to-twitter-to-shape-global-message/', 'image': None, 'publishedAt': '2019-09-01 19:54:00 UTC', 'source': {'name': 'Quartz', 'url': 'https://qz.com'}}]
intolerably+offensive+boys+nazi+costume+elementary+school+halloween+parade+sparks+outrage
https://gnews.io/api/v3/search?q=intolerably+offensive+boys+nazi+costume+elementary+school+halloween+parade+s

naked+florida+man+allegedly+beats+peeping+tom+death
https://gnews.io/api/v3/search?q=naked+florida+man+allegedly+beats+peeping+tom+death&token=6e8b08ab29db6f672eecf1447815ea40
200
florida+peeping+tom+beaten+death+naked+man+saw+sex+police
https://gnews.io/api/v3/search?q=florida+peeping+tom+beaten+death+naked+man+saw+sex+police&token=6e8b08ab29db6f672eecf1447815ea40
200
[{'title': 'Florida peeping tom beaten to death by naked man he saw having sex: police', 'description': 'A Florida man was arrested after cops accused him of beating to death a peeping tom who saw him having sex with his girlfriend through a window. Victor Vickery’s arrest Thursday for manslaughter ...', 'url': 'https://www.foxnews.com/us/florida-peeping-tom-death-naked-man-sex-police', 'image': None, 'publishedAt': '2019-10-20 07:10:00 UTC', 'source': {'name': 'Fox News', 'url': 'https://www.foxnews.com'}}]
two+arkansas+chemistry+professors+arrested+allegedly+making+meth+college+lab
https://gnews.io/api/v3/search?q=two+

chemistry+professors+charged+making+meth+school+lab+report+strange+odor
https://gnews.io/api/v3/search?q=chemistry+professors+charged+making+meth+school+lab+report+strange+odor&token=6e8b08ab29db6f672eecf1447815ea40
200
[{'title': 'Chemistry professors charged with making meth in school lab after report of strange odor', 'description': 'Two chemistry professors in Arkansas were charged with manufacturing methamphetamine and using drug paraphernalia ... Three days earlier, she said, police investigated a report of a chemical odor in ...', 'url': 'https://www.usatoday.com/story/news/nation/2019/11/18/arkansas-chemistry-professors-charged-making-meth-school-lab/4226884002/', 'image': 'https://images.gnews.io/5e4ea3f023914fe3a8b074250d317351', 'publishedAt': '2019-11-19 17:32:00 UTC', 'source': {'name': 'USA Today', 'url': 'https://www.usatoday.com'}}]
chemistry+professors+accused+making+meth+university+lab
https://gnews.io/api/v3/search?q=chemistry+professors+accused+making+meth+universit

seattle+police+captain+arrested+undercover+prostitution+sting
https://gnews.io/api/v3/search?q=seattle+police+captain+arrested+undercover+prostitution+sting&token=6e8b08ab29db6f672eecf1447815ea40
200
[{'title': 'Police captain arrested in own department’s prostitution sting', 'description': 'SEATTLE (KCPQ) – A Seattle police captain is accused of sexual exploitation after he was arrested in an undercover operation by his own ... Sources told KIRO that Woolery offered the undercover ...', 'url': 'https://www.nbc4i.com/news/u-s-world/police-captain-arrested-in-own-departments-prostitution-sting/', 'image': 'https://images.gnews.io/9985e4fe273e0a396399e6e6ade79ae1', 'publishedAt': '2019-11-17 09:10:00 UTC', 'source': {'name': 'NBC4i', 'url': 'https://www.nbc4i.com'}}, {'title': 'Seattle police captain busted in prostitution sting after offering $40 to undercover cop: reports', 'description': 'A police captain in Seattle was among the five men arrested this week in a prostitution sting, re

200
[{'title': 'Couple blew through $120K accidentally deposited in their bank account', 'description': 'A Pennsylvania couple is accused of theft after going on a shopping spree with $120,000 that was mistakenly deposited into their bank account, according to a report. Robert Williams and his wife, ...', 'url': 'https://nypost.com/2019/09/08/couple-blew-through-120k-accidentally-deposited-in-their-bank-account/', 'image': None, 'publishedAt': '2019-09-07 23:08:00 UTC', 'source': {'name': 'New York Post', 'url': 'https://nypost.com'}}]
pennsylvania+couple+allegedly+went+massive+spending+spree+120g+mistakenly+added+bank+account
https://gnews.io/api/v3/search?q=pennsylvania+couple+allegedly+went+massive+spending+spree+120g+mistakenly+added+bank+account&token=6e8b08ab29db6f672eecf1447815ea40
200
[{'title': 'Couple allegedly went on massive spending spree after $120G mistakenly added to bank account', 'description': '(Fox News) - A Pennsylvania couple is accused of theft after going on a s

children+teachers+hospitalized+china+kindergarten+attack
https://gnews.io/api/v3/search?q=children+teachers+hospitalized+china+kindergarten+attack&token=6e8b08ab29db6f672eecf1447815ea40
200
[{'title': 'More than 50 children and teachers hospitalized in China after kindergarten chemical attack', 'description': 'were hospitalized in southwest China Tuesday after a man broke into a kindergarten and sprayed them with a corrosive chemical as "revenge on society." Fifty-one children and three teachers were ...', 'url': 'https://www.pressdemocrat.com/news/10309618-181/more-than-50-children-and', 'image': 'https://images.gnews.io/7addfde17431a40961fce41c593049f2', 'publishedAt': '2019-11-13 06:11:00 UTC', 'source': {'name': 'Santa Rosa Press Democrat', 'url': 'https://www.pressdemocrat.com'}}, {'title': 'More than 50 children and teachers hospitalized in China after kindergarten attack', 'description': 'were hospitalized in southwest China on Tuesday after a man broke into a kindergarten and s

In [495]:
from pandas.io.json import json_normalize
d_norm = json_normalize(d)

In [496]:
tf = pd.DataFrame(d_norm)
d

[{'title': 'Hong Kong’s fast-learning, dexterous protesters are stumped by Twitter',
  'description': 'One of the major stumbling blocks for Hong Kongers taking to Twitter is language. One user, @woppa, a retired 40-year-old called Trevor who resides in Canada, started a hashtag that translates as “Get ...',
  'url': 'https://qz.com/1698002/hong-kong-protesters-flock-to-twitter-to-shape-global-message/',
  'image': None,
  'publishedAt': '2019-09-01 19:54:00 UTC',
  'source': {'name': 'Quartz', 'url': 'https://qz.com'}},
 {'title': 'Utah elementary student wears Nazi costume in Halloween school parade, principal and teacher suspended',
  'description': 'DAVIS COUNTY, Utah (KSTU) — A principal and a teacher at a Davis County school are on leave for allowing a boy dressed as Adolf Hitler to march in a Halloween parade. A photo obtained ... telling a ...',
  'url': 'https://www.wfla.com/news/national/utah-principal-teacher-on-leave-after-student-permitted-to-wear-hitler-costume-in-school-

In [497]:
tf

Unnamed: 0,description,image,publishedAt,source.name,source.url,title,url
0,One of the major stumbling blocks for Hong Kon...,,2019-09-01 19:54:00 UTC,Quartz,https://qz.com,"Hong Kong’s fast-learning, dexterous protester...",https://qz.com/1698002/hong-kong-protesters-fl...
1,"DAVIS COUNTY, Utah (KSTU) — A principal and a ...",,2019-11-01 17:54:00 UTC,WFLA News Channel 8,https://www.wfla.com,Utah elementary student wears Nazi costume in ...,https://www.wfla.com/news/national/utah-princi...
2,The principal and a teacher at a Kaysville ele...,,2019-11-01 17:00:00 UTC,The Salt Lake Tribune,https://www.sltrib.com,Utah elementary student wears Nazi costume in ...,https://www.sltrib.com/news/2019/11/01/utah-el...
3,A Utah elementary school principal and teacher...,https://images.gnews.io/d01738e949d6a6c9332e4a...,2019-11-05 07:50:00 UTC,YAHOO!,https://news.yahoo.com,"Principal, teacher suspended after student dre...",https://news.yahoo.com/principal-teacher-suspe...
4,A Utah elementary school principal and teacher...,,2019-11-05 02:25:00 UTC,huffingtonpost.co.uk,https://www.huffingtonpost.co.uk,"Principal, Teacher Suspended After Student Dre...",https://www.huffingtonpost.co.uk/entry/princip...
5,FORT LAUDERDALE (CBSMiami/AP) – A man accused ...,,2019-10-21 06:55:00 UTC,CBS Miami,https://miami.cbslocal.com,Naked Florida Man Accused Of Killing Suspected...,https://miami.cbslocal.com/2019/10/21/naked-fl...
6,(WSVN) - A man accused of killing a Peeping To...,,2019-10-18 09:14:00 UTC,WSVN,https://wsvn.com,Man arrested for killing Peeping Tom in Fort L...,https://wsvn.com/news/local/broward/man-arrest...
7,BOCA RATON — A Delray Beach man charged in the...,,2019-10-19 05:45:00 UTC,The Palm Beach Post,https://www.palmbeachpost.com,Delray man arrested in 2018 ’peeping Tom’ killing,https://www.palmbeachpost.com/news/20191019/de...
8,"FORT LAUDERDALE, Fla. - A South Florida man is...",,2019-10-21 14:52:00 UTC,WPLG,https://www.local10.com,Man charged with manslaughter after beating pe...,https://www.local10.com/news/local/fort-lauder...
9,"Victor Van Vickery, 30, of Delray Beach, was a...",,2019-10-20 11:27:00 UTC,WSB Radio,https://www.wsbradio.com,Florida man accused of beating peeping Tom to ...,https://www.wsbradio.com/news/national/florida...


In [501]:
df2= tf.drop_duplicates("url")
df2.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 26, 27, 29, 31, 32, 33, 34, 35, 36, 37,
            38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56,
            57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
            74, 75, 76, 77],
           dtype='int64')

In [502]:
df2

Unnamed: 0,description,image,publishedAt,source.name,source.url,title,url
0,One of the major stumbling blocks for Hong Kon...,,2019-09-01 19:54:00 UTC,Quartz,https://qz.com,"Hong Kong’s fast-learning, dexterous protester...",https://qz.com/1698002/hong-kong-protesters-fl...
1,"DAVIS COUNTY, Utah (KSTU) — A principal and a ...",,2019-11-01 17:54:00 UTC,WFLA News Channel 8,https://www.wfla.com,Utah elementary student wears Nazi costume in ...,https://www.wfla.com/news/national/utah-princi...
2,The principal and a teacher at a Kaysville ele...,,2019-11-01 17:00:00 UTC,The Salt Lake Tribune,https://www.sltrib.com,Utah elementary student wears Nazi costume in ...,https://www.sltrib.com/news/2019/11/01/utah-el...
3,A Utah elementary school principal and teacher...,https://images.gnews.io/d01738e949d6a6c9332e4a...,2019-11-05 07:50:00 UTC,YAHOO!,https://news.yahoo.com,"Principal, teacher suspended after student dre...",https://news.yahoo.com/principal-teacher-suspe...
4,A Utah elementary school principal and teacher...,,2019-11-05 02:25:00 UTC,huffingtonpost.co.uk,https://www.huffingtonpost.co.uk,"Principal, Teacher Suspended After Student Dre...",https://www.huffingtonpost.co.uk/entry/princip...
5,FORT LAUDERDALE (CBSMiami/AP) – A man accused ...,,2019-10-21 06:55:00 UTC,CBS Miami,https://miami.cbslocal.com,Naked Florida Man Accused Of Killing Suspected...,https://miami.cbslocal.com/2019/10/21/naked-fl...
6,(WSVN) - A man accused of killing a Peeping To...,,2019-10-18 09:14:00 UTC,WSVN,https://wsvn.com,Man arrested for killing Peeping Tom in Fort L...,https://wsvn.com/news/local/broward/man-arrest...
7,BOCA RATON — A Delray Beach man charged in the...,,2019-10-19 05:45:00 UTC,The Palm Beach Post,https://www.palmbeachpost.com,Delray man arrested in 2018 ’peeping Tom’ killing,https://www.palmbeachpost.com/news/20191019/de...
8,"FORT LAUDERDALE, Fla. - A South Florida man is...",,2019-10-21 14:52:00 UTC,WPLG,https://www.local10.com,Man charged with manslaughter after beating pe...,https://www.local10.com/news/local/fort-lauder...
9,"Victor Van Vickery, 30, of Delray Beach, was a...",,2019-10-20 11:27:00 UTC,WSB Radio,https://www.wsbradio.com,Florida man accused of beating peeping Tom to ...,https://www.wsbradio.com/news/national/florida...


In [500]:
df2.to_csv("./dataset/test.csv")

In [462]:
tokens[1]

'3665e6466a1a60d9c5c7ee75a6be9e74'