In [64]:
import numpy as np
import pandas as pd
from tld import get_tld
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from collections import Counter
from operator import itemgetter
import re
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/selene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/selene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load data 

In [2]:
DATA_PATH = 'data/'
FILE_PATH = DATA_PATH + 'quotes-2019-domains.json.bz2'
OLD_PATH = DATA_PATH + 'quotes-2019.json.bz2'

In [None]:
def process_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
        print(chunk.columns)

with pd.read_json(FILE_PATH, lines=True, compression='bz2', chunksize=1000000) as df_reader:
    for chunk in df_reader:
        process_chunk(chunk)

### Partially reading data (with already extracted domains)

In [3]:
df_test = pd.read_json(FILE_PATH, lines=True, compression='bz2', nrows=100)
df_test.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,domains
0,2019-10-14-000009,% 9: D: D @? 6 H2J H6 E9@F89E H6 4@F=5 8: G6 3...,,[],2019-10-14 07:30:00,1,"[[None, 0.5595], [Julio Morales, 0.4405]]",[http://www.ivpressonline.com/news/local/ivc-f...,E,[com]
1,2019-04-08-048753,It is immoral. It is harmful. It is hurtful.,President Donald Trump,[Q22686],2019-04-08 16:22:00,44,"[[President Donald Trump, 0.5802], [None, 0.36...",[https://www.mercedsunstar.com/news/business/a...,E,"[com, com, com, com, com, com, com, com, com, ..."
2,2019-07-17-000030,"[ Amber ] loves her son more than anything,",,[],2019-07-17 22:54:35,1,"[[None, 0.8276], [Amber Portwood, 0.1724]]",[https://www.inquisitr.com/5535969/did-amber-p...,E,[com]
3,2019-05-15-053302,It is important for our equine science student...,Sally Johnson,[Q42336656],2019-05-15 18:03:22,1,"[[Sally Johnson, 0.5721], [None, 0.4279]]",[https://www.lanereport.com/113381/2019/05/qua...,E,[com]
4,2019-04-20-000011,... an Afrikaans family living in South Africa...,,[],2019-04-20 22:30:57,1,"[[None, 0.8331], [you long, 0.1669]]",[http://filmthreat.com/reviews/the-harvester/],E,[com]


### Extract site name

In [4]:
def get_sitename(url):
    res = get_tld(url, as_object=True)
    return res.domain

In [5]:
def extract_name(df): 
    df['sitenames'] = np.nan
    for row in range(df.shape[0]):
        df['sitenames'][row] = [get_sitename(site) for site in df['urls'][row]] 

In [6]:
extract_name(df_test)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sitenames'][row] = [get_sitename(site) for site in df['urls'][row]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,domains,sitenames
0,2019-10-14-000009,% 9: D: D @? 6 H2J H6 E9@F89E H6 4@F=5 8: G6 3...,,[],2019-10-14 07:30:00,1,"[[None, 0.5595], [Julio Morales, 0.4405]]",[http://www.ivpressonline.com/news/local/ivc-f...,E,[com],[ivpressonline]
1,2019-04-08-048753,It is immoral. It is harmful. It is hurtful.,President Donald Trump,[Q22686],2019-04-08 16:22:00,44,"[[President Donald Trump, 0.5802], [None, 0.36...",[https://www.mercedsunstar.com/news/business/a...,E,"[com, com, com, com, com, com, com, com, com, ...","[mercedsunstar, sacbee, mynorthwest, lasvegass..."
2,2019-07-17-000030,"[ Amber ] loves her son more than anything,",,[],2019-07-17 22:54:35,1,"[[None, 0.8276], [Amber Portwood, 0.1724]]",[https://www.inquisitr.com/5535969/did-amber-p...,E,[com],[inquisitr]
3,2019-05-15-053302,It is important for our equine science student...,Sally Johnson,[Q42336656],2019-05-15 18:03:22,1,"[[Sally Johnson, 0.5721], [None, 0.4279]]",[https://www.lanereport.com/113381/2019/05/qua...,E,[com],[lanereport]
4,2019-04-20-000011,... an Afrikaans family living in South Africa...,,[],2019-04-20 22:30:57,1,"[[None, 0.8331], [you long, 0.1669]]",[http://filmthreat.com/reviews/the-harvester/],E,[com],[filmthreat]


### Extract domain and sitename for whole .json file 
#### Careful : can take >1hr per file
- Will read the raw file
- Will write into new .json file
- Change the paths to adapt to local setup

In [None]:
def get_domain(url):
    res = get_tld(url, as_object=True)
    return res.tld

def get_sitename(url):
    res = get_tld(url, as_object=True)
    return res.domain

In [None]:
import bz2
import json

path_to_file = 'data/quotes-2019.json.bz2' 
path_to_out = 'data/quotes-2019-domains-names.json.bz2'

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            urls = instance['urls'] # extracting list of links
            domains = []
            sitenames = []
            for url in urls:
                tld = get_domain(url)
                name = get_sitename(url)
                domains.append(tld)
                sitenames.append(name)
            instance['domains'] = domains # updating the sample with domain name
            instance['sitenames'] = sitenames
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

## Task 3: Manually select websites to keep (that have always have a same specific topic)

In [59]:
df_test2 = pd.read_json(FILE_PATH, lines=True, compression='bz2', nrows=10000)

In [60]:
extract_name(df_test2)
df_test2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sitenames'][row] = [get_sitename(site) for site in df['urls'][row]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,domains,sitenames
0,2019-10-14-000009,% 9: D: D @? 6 H2J H6 E9@F89E H6 4@F=5 8: G6 3...,,[],2019-10-14 07:30:00,1,"[[None, 0.5595], [Julio Morales, 0.4405]]",[http://www.ivpressonline.com/news/local/ivc-f...,E,[com],[ivpressonline]
1,2019-04-08-048753,It is immoral. It is harmful. It is hurtful.,President Donald Trump,[Q22686],2019-04-08 16:22:00,44,"[[President Donald Trump, 0.5802], [None, 0.36...",[https://www.mercedsunstar.com/news/business/a...,E,"[com, com, com, com, com, com, com, com, com, ...","[mercedsunstar, sacbee, mynorthwest, lasvegass..."
2,2019-07-17-000030,"[ Amber ] loves her son more than anything,",,[],2019-07-17 22:54:35,1,"[[None, 0.8276], [Amber Portwood, 0.1724]]",[https://www.inquisitr.com/5535969/did-amber-p...,E,[com],[inquisitr]
3,2019-05-15-053302,It is important for our equine science student...,Sally Johnson,[Q42336656],2019-05-15 18:03:22,1,"[[Sally Johnson, 0.5721], [None, 0.4279]]",[https://www.lanereport.com/113381/2019/05/qua...,E,[com],[lanereport]
4,2019-04-20-000011,... an Afrikaans family living in South Africa...,,[],2019-04-20 22:30:57,1,"[[None, 0.8331], [you long, 0.1669]]",[http://filmthreat.com/reviews/the-harvester/],E,[com],[filmthreat]


In [20]:
all_sites = pd.Series([x for row in df_test2['sitenames'] for x in row])
sites = all_sites.drop_duplicates(keep='first')

In [29]:
for item in sites:
    print(item)

ivpressonline
mercedsunstar
sacbee
mynorthwest
lasvegassun
wtop
ctpost
kdhnews
newstimes
registercitizen
stamfordadvocate
wftv
expressnews
fairfieldcitizenonline
nhregister
thehour
wpxi
wsbtv
wsoctv
thestarphoenix
infotel
citynews
570news
news965
startribune
indiatimes
wibx950
bradenton
theolympian
charlotteobserver
heraldonline
businessinsider
smdailyjournal
wthr
chron
cbsnews
msn
nbc-2
bostonglobe
business-standard
inquisitr
lanereport
filmthreat
kstp
comicbook
9news
marketwatch
inews
eastidahonews
thesun
theepochtimes
geek
mamamia
wordpress
vanityfair
ipolitics
wdtv
maxim
fox29
breakingisraelnews
etcanada
westernjournal
bizpacreview
salon
nypost
atimes
prisonplanet
independent
express
sudbury
newuniversity
timescolonist
teenvogue
nashvillescene
bordermail
ottawasun
channel3000
cnbc
zawya
canada
hellenicshippingnews
tubefilter
crn
kstatecollegian
fastcoexist
billboard
complex
newshub
atlantablackstar
jezebel
thestar
pulse
breitbart
upstreamonline
perezhilton
miragenews
mitchellrepubl

accountingtoday
thetablet
bangordailynews
tvnz
capitolhillblue
theyeshivaworld
reason
dailyeasternnews
thescore
gamasutra
football365
virtual-strategy
thenationonlineng
wetinhappen
gamingintelligence
edmontonexaminer
businesswire
lexblog
ua
tvguide
historynewsnetwork
rushthekop
canadianarchitect
koimoi
labiotech
dealerscope
macrumors
sleepreviewmag
research-live
theonlinecitizen
frontpagemag
mobilenewscwp
theprideoflondon
theindianalawyer
lifesitenews
runnersworld
lareviewofbooks
thirdsector
bhatkallys
uinterview
artvoice
defensesystems
fcw
peoplesworld
statnews
globalsecurity
buffalo
busseltonmail
keyc
nbcchicago
nbcphiladelphia
nbcwashington
talkbusiness
laist
justsecurity
acs
dealstreetasia
winteriscoming
myspace
sciencefiction
globalissues
themreport
northafricapost
newsy
mynewsdesk
boatingindustry
therealdeal
247wallst
perishablenews
ecowatch
mediaupdate
nst
smartcompany
themandarin
autoremarketing
wheels24
newsgram
pymnts
newschannel9
plos
closerweekly
nebraska
hollywood
newstoda

blogto
sci-news
counterpunch
macdailynews
federaltimes
connectionnewspapers
suffolkfreepress
dmagazine
idsnews
nwfdailynews
royalcentral
indiansportsnews
fitsnews
mic
idyllwildtowncrier
byu
thefamuanonline
russianmachineneverbreaks
econotimes
democracynow
theinterrobang
forbesindia
flickeringmyth
mtsusidelines
miamitodaynews
epilepsysociety
onmedica
rochdaleonline
nwasianweekly
lynnnews
bgr
dailynorseman
pix11
hauteliving
clarecountyreview
ttnews
wttw
leesvilledailyleader
dataweek
intouchweekly
carolinapublicpress
eater
statetechmagazine
dairyherd
investors
goldenskate
burlingtoncountytimes
galwayindependent
techvibes
northdevongazette
aptnnews
aldianews
chosun
brafton
wkzo
deadlinenews
worldipreview
bnnbloomberg
slamonline
findlaw
qgazette
westmeathexaminer
dawgsbynature
onlinecasinoarchives
progressiverailroading
foodsafetynews
golfchannel
southeastagnet
labmanager
thinkgeoenergy
kwtx
benarnews
gamerevolution
usgamer
leftfootforward
therebel
mountvernonnews
becclesandbungayjournal
di

Tentative selection of sites (selected because of foucs on one topic): --> bad idea

- Business: businessinsider, marketwatch
- Art/Cutlure: filmthreat, comicbook, billboard
- Lifestyle/fashion: mamamia, vanityfair, teenvogue, vogue, healthcaredive
- Politics: ipolitics
- Tech: geek, cultofmac
- Sport: espncricinfo, motorsport

Seems a bit hard to hand select from list.\
Instead, I took the wikipedia subcategories of "journalism" and tried to find a couple specialised news sources in each field.
Some sources used: 
https://en.wikipedia.org/wiki/List_of_business_newspapers \
https://en.wikipedia.org/wiki/List_of_magazines_by_circulation#United_States \
https://www.allyoucanread.com/magazines/ \
https://en.wikipedia.org/wiki/List_of_scientific_journals \
https://en.wikipedia.org/wiki/List_of_science_magazines \

Or just search "list of ... magazines"

- **Arts**: 
Alta Journal, Billboard Magazine, Architectural Digest, Juxtapoz, Art in America, AntiqueWeek, Antique Trader, ArtNews, Pastel Journal, Southwest Art
- **Business**: 
Nihon Keizai Shimbun (Japan, see if there is), Financial Times, The Wall Street Journal, The Economist, The Economic Times (India), Il Sole 24 Ore (Italy), Mint, Business Standard, The Australian Financial Review, Inside Business, Forbes Business Insider, Money, Fortune, Bloomberg Businessweek
- **Entertainment**: 
Entertainment Tonight, The Hollywood Reporter, Variety, Better Homes And Gardens, Game Informer Magazine, Good Housekeeping, Southern Living, Men's Health, Car and Driver, Motor Trend, People, Rolling Stone, The New Yorker, US Weekly
- **Environment**: 
National Geographic, Nature Conservancy, National Wildlife, Sierra, Mother Earth News 
- **Fashion**: 
Vogue, Elle, Harper's Bazaar, Cosmopolitan, Business of Fashion, W, Allure, Grazia, Marie Claire, Vanity Fair, InStyle, Seventeen, Glamour, W, GQ
- **Medicine**: 
WebMd, Psychology Today, Stanford Medecine Magazine, Harvard Medicine Magazine
- **Music**: 
Mojo, Stereogun, The Quietus, Pitchfork, Spin, The Fader, Consequence of Sound
- **Politics**: 
- **Science**: 
Popular science, Wired, PCMag, ComputerWorld, MIT Technology Review, Popular Mechanics, American Scientist, Psychology Today, Smithsonian, Science News, Nature, Discover, 
- **Sports**: 
ESPN, Sports Illustrated, Golf Digest, Slam, Tennis, The Red Bulletin, Baseball America

## Task 4: Find topic in URL

- Define a list of categories, and matching words that could indicate the category
- Make this list "general" so that variants of the words can be found (ex: design, designer, designing... could all be classified in the same topic)
- Check each url for possible matches (for that, the url words also need to be "generalised")
- Add tag to url

Create a dictionary with terms to search
- arts: visual arts, film, literature, music, theater, architecture
- business: economic, financial 
- entertainment: lifestyle, television, film, theater music, video game, celebrity 
- environment: nature, climate
- fashion: clothes, 
- medicine: health, wellbeing
- music
- politics: political, government, policy
- science
- sports : list of sports?
- 

### Create discitionary of topics for classifying the urls

In [None]:
# Create a dictionary: to modify 

tag_list = ["art", "business", "entertainment", "environment" ...]
matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"] ...}

^ Problem: how to define this list? 

### Check that dictionary is generlized enough

In [39]:
stemmer = PorterStemmer() #will find the "general" version of a word

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation]) # get the text without punctuation
    tokens = nltk.word_tokenize(text) #Tokenizers divide strings into lists of substrings
    return " ".join([stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words('english')]) #stem all words

In [42]:
# Example
test = 'Hello everyone my names are fancy potatoes and the sky''s colors are beautiful tonight'
tokenize(test)

'hello everyon name fanci potato ski color beauti tonight'

In [93]:
def generalizeDictionary(matchers):
    for category in matchers:
        for i in range(len(matchers[category])):
            matchers[category][i] = tokenize(matchers[category][i])
            
            
# Example
matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"]}
generalizeDictionary(matchers)
print('Generalized dictionary is:\n', matchers)

Generalized dictionary is:
 {'art': ['art', 'paint', 'draw'], 'business': ['busi', 'financ', 'economi']}


### Split URL into words

In [65]:
def getWordsFromURL(url):
    return re.compile(r'\W+',re.UNICODE).split(url)

In [66]:
test_url = df_test2['urls'][0][0]
getWordsFromURL(test_url)

['http',
 'www',
 'ivpressonline',
 'com',
 'news',
 'local',
 'ivc',
 'foundation',
 'to',
 'house',
 'homeless',
 'students',
 'article_61ff9aa4',
 'ee30',
 '11e9',
 '8ae9',
 'b37ec7cd9579',
 'html']

### Generalize words in url

In [74]:
# Example
general_url = [tokenize(x) for x in getWordsFromURL(test_url)]
general_url

['http',
 'www',
 'ivpressonlin',
 'com',
 'news',
 'local',
 'ivc',
 'foundat',
 '',
 'hous',
 'homeless',
 'student',
 'article61ff9aa4',
 'ee30',
 '11e9',
 '8ae9',
 'b37ec7cd9579',
 'html']

### Check if word from list is in *generalized* url

In [108]:
def classify(matchers, url): #Give it an already generalized dictionary
    tag_found = []
    general_url = [tokenize(x) for x in getWordsFromURL(url)]
    for category in matchers:
        for i in range(len(matchers[category])):
            match = matchers[category][i]
            if match in general_url:
                tag_found.append(category)
    return tag_found # or you can "return None"

In [114]:
# Example
matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"], "sport": ["sport", "football"]}
test_url = df_test2['urls'][1][0] #contains 'business'

#Generalize dictionary before sending it to the classify() function
generalizeDictionary(matchers)
classify(matchers, test_url)

['business']

In [115]:
# Add tag to dataframe

tags_column = [] # List of lists. Each list inside corresponds to a row. Will become the 'tag' column
for index, row in df_test2.iterrows():
    tags = [] #tags for all the urls in that row
    for url in row['urls']:
        categories = classify(matchers, url)
        tags.append(categories)
    tags_column.append(tags)

df_test2['tags'] = tags_column
df_test2

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,domains,sitenames,tags
0,2019-10-14-000009,% 9: D: D @? 6 H2J H6 E9@F89E H6 4@F=5 8: G6 3...,,[],2019-10-14 07:30:00,1,"[[None, 0.5595], [Julio Morales, 0.4405]]",[http://www.ivpressonline.com/news/local/ivc-f...,E,[com],[ivpressonline],[[]]
1,2019-04-08-048753,It is immoral. It is harmful. It is hurtful.,President Donald Trump,[Q22686],2019-04-08 16:22:00,44,"[[President Donald Trump, 0.5802], [None, 0.36...",[https://www.mercedsunstar.com/news/business/a...,E,"[com, com, com, com, com, com, com, com, com, ...","[mercedsunstar, sacbee, mynorthwest, lasvegass...","[[business], [business], [], [], [], [], [], [..."
2,2019-07-17-000030,"[ Amber ] loves her son more than anything,",,[],2019-07-17 22:54:35,1,"[[None, 0.8276], [Amber Portwood, 0.1724]]",[https://www.inquisitr.com/5535969/did-amber-p...,E,[com],[inquisitr],[[]]
3,2019-05-15-053302,It is important for our equine science student...,Sally Johnson,[Q42336656],2019-05-15 18:03:22,1,"[[Sally Johnson, 0.5721], [None, 0.4279]]",[https://www.lanereport.com/113381/2019/05/qua...,E,[com],[lanereport],[[]]
4,2019-04-20-000011,... an Afrikaans family living in South Africa...,,[],2019-04-20 22:30:57,1,"[[None, 0.8331], [you long, 0.1669]]",[http://filmthreat.com/reviews/the-harvester/],E,[com],[filmthreat],[[]]
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2019-06-13-115504,We have one match left here and I hope we can ...,Yoon Deok-yeo,[Q487477],2019-06-13 00:00:00,7,"[[Yoon Deok-yeo, 0.7274], [None, 0.2726]]",[http://asia.eurosport.com/football/koreans-cl...,E,"[com, com, co.uk, com, com, co.ke, com]","[eurosport, eurosport, eurosport, eurosport, e...","[[sport], [sport], [sport], [sport], [], [], [..."
9996,2019-04-02-062264,Jump in head-first; grab some options that cap...,,[],2019-04-02 05:54:30,2,"[[None, 0.7858], [Duchess of Sussex, 0.1581], ...",[https://www.vogue.com.au/culture/racing-carni...,E,"[com.au, com.au]","[vogue, vogue]","[[], []]"
9997,2019-03-14-121949,"We have opportunities, we certainly have some ...",,[],2019-03-14 18:35:00,1,"[[None, 0.5911], [Jim Cramer, 0.4089]]",[https://www.cnbc.com/2019/03/14/ge-ceo-culp-t...,E,[com],[cnbc],[[]]
9998,2019-02-28-064897,"Jungle cruise coming to life,",,[],2019-02-28 16:51:52,2,"[[None, 0.8612], [Emily Blunt, 0.0725], [Dwayn...",[https://comicbook.com/2019/02/28/the-rock-beh...,E,"[com, com]","[comicbook, comicbook]","[[], []]"


### Check for tags in url and save tags

#### Option 1: write new file (also extracts domain and sitename)

In [None]:
import bz2
import json

path_to_file = 'data/quotes-2019.json.bz2' 
path_to_out = 'data/quotes-2019-domains-names.json.bz2'

matchers = {...} # dictionary to define
generalizeDictionary(matchers)

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            urls = instance['urls'] # extracting list of links
            domains = []
            sitenames = []
            tags = []
            for url in urls:
                tld = get_domain(url)
                name = get_sitename(url)
                categories = classify(matchers, url)
                domains.append(tld)
                sitenames.append(name)
                tags.append(categories)
            instance['domains'] = domains # updating the sample with domain name
            instance['sitenames'] = sitenames # updating the sample with sitename
            instance['tags'] = tags # updating the sample with tags of topics
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

In [None]:
#with bz2.open(path_to_file, 'rb') as s_file:
with pd.read_json... #read the original file
    with bz2.open(path_to_out, 'wb') as d_file:
        #for instance in s_file:
        for chunk in df_chunk:
            #instance = json.loads(instance) # loading a sample
            chunk = process_chunk
            chunk.to_json
            d_file.write((json.dumps(instance)+'\n').encode('utf-8'))

#### Option 2: modify file opened in memory

In [None]:
matchers = {...} # dictionary to define
generalizeDictionary(matchers)

def process_chunk_tagging(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
        tags_column = [] # List of lists. Each list inside corresponds to a row. Will become the 'tag' column
        for index, row in chunk.iterrows():
            tags = [] #tags for all the urls in that row
            for url in row['urls']:
                categories = classify(matchers, url)
                tags.append(categories)
            tags_column.append(tags)
        chunk['tags'] = tags_column
        

with pd.read_json(FILE_PATH, lines=True, compression='bz2', chunksize=1000000) as df_reader:
    for chunk in df_reader:
        process_chunk_tagging(chunk)
        
        
def process_chunk_urls(chunk):
        tags_column = [] # List of lists. Each list inside corresponds to a row. Will become the 'tag' column
        site_column = []
        domain_column = []
        for index, row in chunk.iterrows():
            tags = [] #tags for all the urls in that row
            domains = []
            sitenames = []
            for url in row['urls']:
                #Extract data
                tld = get_domain(url)
                name = get_sitename(url)
                categories = classify(matchers, url)
                #Append data
                domains.append(tld)
                sitenames.append(name)
                tags.append(categories)
            tags_column.append(tags)
            site_column.append(sitenames)
            domain_column.append(domains)
        # Create new columns with new data
        chunk['sitenames'] = site_column
        chunk['domain'] = domaine_column
        chunk['tags'] = tags_column

# Finding synonyms of words

In [2]:
import nltk
nltk.download()
from nltk.corpus import wordnet

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [4]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("business"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))

{'job', 'business_organization', 'business_concern', 'business_organisation', 'occupation', 'patronage', 'byplay', 'commercial_enterprise', 'business_sector', 'concern', 'line', 'business_enterprise', 'line_of_work', 'stage_business', 'business', 'clientele'}


In [10]:
# Creating a list of synonyms for the dicitionary

matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"], "sport": ["sport", "football"]}

for category in matchers:
    synonyms = []
    for i in range(len(matchers[category])):
        word = matchers[category][i]
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())
        matchers[category] = synonyms


In [12]:
matchers

{'art': ['art',
  'fine_art',
  'art',
  'artistic_creation',
  'artistic_production',
  'art',
  'artistry',
  'prowess',
  'artwork',
  'art',
  'graphics',
  'nontextual_matter',
  'art',
  'fine_art',
  'art',
  'fine_art',
  'art',
  'artistic_creation',
  'artistic_production',
  'art',
  'artistry',
  'prowess',
  'artwork',
  'art',
  'graphics',
  'nontextual_matter'],
 'business': ['business',
  'concern',
  'business_concern',
  'business_organization',
  'business_organisation',
  'commercial_enterprise',
  'business_enterprise',
  'business',
  'occupation',
  'business',
  'job',
  'line_of_work',
  'line',
  'business',
  'business',
  'business',
  'business',
  'business_sector',
  'clientele',
  'patronage',
  'business',
  'business',
  'stage_business',
  'byplay',
  'concern',
  'concern',
  'care',
  'fear',
  'concern',
  'concern',
  'worry',
  'headache',
  'vexation',
  'business',
  'concern',
  'business_concern',
  'business_organization',
  'business_organ