In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlparse
import json

In [2]:
news = pd.read_csv("uci-news-aggregator.csv")

# utils

In [3]:
def remove_www(x):
    dot_split = x.split(".")
    if len(dot_split) > 2:
        return ".".join(dot_split[1:])
    else:
        return x

# cleaning list

In [4]:
news["HOSTNAME"] = news.HOSTNAME.apply(remove_www)

In [5]:
news = news.drop_duplicates(subset="HOSTNAME", keep="last")

In [6]:
news_domains = news.HOSTNAME.unique().tolist()

In [7]:
for i in news_domains:
    if ".edu" in i:
        news_domains.remove(i)
len(news_domains)

9549

In [8]:
for i in news_domains:
    if ".edu" in i:
        print(i)

hawaii.edu
uconn.edu
buffalo.edu
iwu.edu
uiowa.edu


In [9]:
for i in news_domains:
    if ".edu" in i:
        news_domains.remove(i)
len(news_domains)

9545

In [10]:
to_rem = [
    "youtube.com",
    "co.ke",
    "co.nz",
    "co.tt",
    "co.tz",
    "co.ug",
    "co.uk",
    "co.za",
    "com.au",
    "com.lb",
    "com.ng",
    "com.ph",
    "com.pk",
    "com.sg",
    "com.ua",
    "linkedin.com",
    "techcrunch.com",
]

In [11]:
news_domains = [x for x in news_domains if x not in to_rem]
len(news_domains)

9529

In [12]:
to_add = [
    "techmeme.com",
    "news.sky.com",
    "bellingcat.com",
    "apnews.com",
    "nbcnews.to",
    "buzzfeednews.com",
    "coindesk.com",
    "tvnz.co.nz",
    "faz.net"
]
for i in to_add:
    news_domains.append(i)
len(news_domains)

9538

In [13]:
# # use short url list
with open("short_url.json") as json_file:
    short_url = json.load(json_file)

short_url = pd.DataFrame.from_dict(short_url)
# select only short urls, if their long url is in news
short_url_news = short_url[short_url.is_news == True].short_url.unique().tolist()

df = short_url[short_url.short_url.isin(short_url_news)]
df["is_news"] = df.is_news.astype(int)

# calculate procentage of times short url domain leads to news site
dfgb = df.groupby(by=["short_url"]).sum()
s = df.short_url.value_counts()
df = dfgb.merge(s.rename("all"), left_index=True, right_index=True)
df.index.name = "short_url"
df.reset_index(inplace=True)
df["perc"] = df["is_news"] / df["all"] * 100

# # create list of short urls, I more then 50% times it leads to news site
short_url_list = df[df["perc"] > 50].short_url.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_news"] = df.is_news.astype(int)


In [14]:
short_url_list

['1843m.ag',
 '4NN.cx',
 '53eig.ht',
 '808ne.ws',
 'WhiteHouse.gov',
 'Whitehouse.gov',
 'abc30.tv',
 'abc7.la',
 'abc7ne.ws',
 'action.consumerreports.org',
 'afp.com',
 'apne.ws',
 'b-gat.es',
 'bbc.in',
 'bloom.bg',
 'bos.gl',
 'buildbackbetter.gov',
 'cbsn.ws',
 'cityjourn.al',
 'cnb.cx',
 'cnet.co',
 'cnn.it',
 'cntrvlr.co',
 'cos.lv',
 'dailym.ai',
 'econ.st',
 'ellemag.co',
 'engt.co',
 'google.it',
 'hill.cm',
 'ja.ma',
 'jtim.es',
 'mol.im',
 'mtr.cool',
 'natgeo.com',
 'nationalenquirer.com',
 'nie.mn',
 'nym.ag',
 'nyti.ms',
 'nytimes.com',
 'nzzl.us',
 'on.mktw.net',
 'on.theatln.tc',
 'p4k.in',
 'petitions.whitehouse.gov',
 'pewrsr.ch',
 'propub.li',
 'rdcu.be',
 'reut.rs',
 'rol.st',
 'sabahdai.ly',
 'sc.mp',
 'slate.trib.al',
 'st.news',
 'str.sg',
 'tcrn.ch',
 'tdrt.io',
 'thr.cm',
 'ti.me',
 'tmz.me',
 'to.pbs.org',
 'trib.al',
 'vntyfr.com',
 'vult.re',
 'wapo.st',
 'washex.am',
 'wdrb.news',
 'whitehouse.gov.',
 'wired.trib.al',
 'yhoo.it',
 'zd.net']

In [15]:
for i in short_url_list:
    news_domains.append(i)
len(news_domains)

9609

In [16]:
with open("news_domains.txt", "w") as f:
    f.write(json.dumps(news_domains))