# load data

In [1]:
import json
import sqlite3
import pandas as pd
import re
from urllib.parse import urlparse
from requests_html import HTMLSession

In [2]:
from datetime import date
from datetime import timedelta

## load tweets older than two weeks

In [3]:
# this part is needed to import local modules to jupyter notebook
import os
import sys

# Make the project's `tweetfeed` directory importable from this notebook.
module_path = os.path.abspath(os.path.join(".."))
tweetfeed_path = os.path.join(module_path, "tweetfeed")
# Guard on the exact entry that gets appended, so re-running this cell does
# not keep adding duplicates to sys.path.
if tweetfeed_path not in sys.path:
    sys.path.append(tweetfeed_path)

from data import load_tweets

In [None]:
# new data keeps coming in - to work on a static dataset I saved it once to a pickle file

In [None]:
# df_tweets = load_tweets("../home.db", days=0)
# df_tweets = df_tweets[df_tweets.retweeted_status == "N/A"] # remove RT
# df_tweets.to_pickle("shorturl.pkl")

In [4]:
# Load the static snapshot of tweets from the pickle file.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
df_tweets = pd.read_pickle("shorturl.pkl")
df_tweets.shape

(74869, 9)

# utils

In [5]:
# Preview the last rows of the loaded tweets.
df_tweets.tail()

Unnamed: 0,id,user,full_text,created_at,lang,retweeted_status,quoted_status,is_quote_status,in_reply_to_status_id
95305,1359964994532614144,2178012643,"It’s one thing to ban violent crimes, universa...",2021-02-11T20:38:16+00:00,en,,,0,1.359963436117938e+18
95306,1359965511396675585,15626406,(It would plausibly reduce corn subsidies - le...,2021-02-11T20:40:19+00:00,en,,,0,1.359964356541182e+18
95307,1359965679110287360,259034658,Winter dopamine famine.,2021-02-11T20:40:59+00:00,de,,,0,
95308,1359965833502494722,817386,Google adds some of its Pixel-exclusive photo ...,2021-02-11T20:41:36+00:00,en,,,0,
95309,1359965919028547587,1416500532,"And just like that, a migraine emerges,,,",2021-02-11T20:41:56+00:00,en,,,0,


In [6]:
# One way of extracting a page title from a URL:
# fetch the page and read the text of its <title> element.
url = "https://lukecarneal.substack.com/p/why-are-some-organic-farmers-turning"
print(HTMLSession().get(url).html.find("title", first=True).text)

Why Are Some Organic Farmers Turning to Reactionary Politics? - The Farm Worker Bulletin


In [7]:
def grab_title(url):
    """Fetch ``url`` and return the text of the page's ``<title>`` element.

    Args:
        url: address of the page to fetch (shortened links are fine, as the
            recorded buff.ly example in this notebook shows).

    Returns:
        The title text, or ``0`` when anything goes wrong (request failure,
        page without a ``<title>`` element, ...). The ``0`` sentinel is kept
        for backward compatibility with existing callers.
    """
    try:
        # Use the session as a context manager so it is closed even when the
        # request or the parsing raises.
        with HTMLSession() as session:
            return session.get(url).html.find("title", first=True).text
    except Exception:
        # Deliberate best-effort lookup: any failure is reported as 0.
        return 0

In [9]:
%%time
# Time a single title lookup through a shortened (buff.ly) link.
url = "https://buff.ly/2T4jips"
grab_title(url)

CPU times: user 69.9 ms, sys: 0 ns, total: 69.9 ms
Wall time: 2.31 s


'3 Tiny Mental Habits I Practice Every Morning – In Less Than 5 Minutes'

## Utils

In [12]:
def rem_short_links(tweet: str) -> str:
    """Remove shortened links (bit.ly, buff.ly, t.co) from a tweet's text.

    Both ``http://`` and ``https://`` links are removed. Each link is deleted
    up to the next whitespace character; the surrounding whitespace is kept.

    Args:
        tweet: text of the tweet.

    Returns:
        The text with every matching short link replaced by an empty string.
    """
    # Dots are escaped so that only the literal domains match
    # (an unescaped "." would also accept e.g. "bitxly").
    return re.sub(r"https?://(?:bit\.ly|buff\.ly|t\.co)/\S+", "", tweet)

In [13]:
# my find_news function drops unnecessary columns, but for this task they are needed, so I modify it a little

from data import remove_tw_urls, find_url, get_domain, remove_empty_str

def find_news(df, news_domains_list):
    """Annotate tweets with the URLs/domains they link to and a news flag.

    Works on a copy of ``df`` (the caller's frame is not modified) whose index
    is reset to 0..n-1, and adds three columns:

    * ``urls`` - result of ``find_url`` on ``full_text`` after the text has
      been passed through ``remove_tw_urls`` and ``rem_short_links``;
    * ``domains`` - ``get_domain`` of every url, filtered by
      ``remove_empty_str``;
    * ``contains_news`` - 1 if any of the row's domains is in
      ``news_domains_list``, otherwise 0.

    Args:
        df: DataFrame with a ``full_text`` column.
        news_domains_list: iterable of (hashable) domain names treated as news.

    Returns:
        The annotated copy of ``df``. An empty input yields an empty frame
        with the three extra columns.
    """
    news_domains = set(news_domains_list)
    df = df.copy().reset_index(drop=True)

    # One pass of rem_short_links is enough: a second pass finds nothing new.
    clean_text = df["full_text"].apply(remove_tw_urls).apply(rem_short_links)
    df["urls"] = clean_text.apply(find_url)
    df["domains"] = (
        df["urls"]
        .apply(lambda urls: [get_domain(u) for u in urls])
        .apply(remove_empty_str)
    )

    # Row-wise membership test; no temporary domain1..domainN columns needed,
    # which also keeps the function working when no row has any domain.
    df["contains_news"] = df["domains"].apply(
        lambda domains: int(any(d in news_domains for d in domains))
    )
    return df

In [14]:
def drop_contains(df, column_name, word_list):
    """Filter ``df`` by the patterns in ``word_list`` (case-folded text).

    The text in ``column_name`` is lower-cased and a row is returned only if
    it contains *every* pattern in ``word_list``. Patterns are passed to
    ``Series.str.contains`` and are therefore treated as regular expressions;
    they are not lower-cased themselves. Rows with missing text never match.

    NOTE(review): despite the name, matching rows are *kept*, not dropped.
    That is the existing behaviour and is preserved here - confirm the intent.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: name of the text column to search.
        word_list: iterable of patterns that must all be present.

    Returns:
        A new DataFrame with the selected rows and the same columns as ``df``.
    """
    # Lower-case once into a separate Series instead of adding a helper column
    # to the caller's frame.
    lowered = df[column_name].str.lower()
    keep = pd.Series(True, index=df.index)
    for pattern in word_list:
        keep = keep & lowered.str.contains(pattern, na=False)
    return df[keep]

# searching for not-expanded news urls

In [15]:
# Work on a copy so that `df_tweets` stays untouched.
dfz = df_tweets.copy()

In [16]:
# Load the known news domains (the .txt file holds JSON).
with open("20200223_news_domains.txt", "r") as f:
    news_domains = json.load(f)

In [17]:
# Add the `urls`, `domains` and `contains_news` columns.
dfz = find_news(dfz, news_domains)

In [22]:
# Number of domains found in each tweet.
dfz["has_domain"] = dfz["domains"].apply(len)

In [23]:
# Preview the annotated tweets.
dfz.head()

Unnamed: 0,id,user,full_text,created_at,lang,retweeted_status,quoted_status,is_quote_status,in_reply_to_status_id,urls,domains,contains_news,has_domain
7,35466982635601920,15012642,#Bitcoin reaches parity with the US Dollar! h...,2011-02-09T22:36:02+00:00,en,,,0,,[http://bitcoincharts.com/markets/],[bitcoincharts.com],0,1
25,451159870306549761,159169312,Course slides from @Stanford and @stanfordsyms...,2014-04-02T00:51:09+00:00,en,,,0,,[http://stanford.edu/~zdar/week1.pdf],[stanford.edu],0,1
34,563101166771650561,1460035021,"Watched Vincent Price in ""Confessions of an Op...",2015-02-04T22:25:54+00:00,en,,,0,,[http://imdb.com/rg/an_share/title/title/tt005...,[imdb.com],0,1
35,574518676575162369,68132773,Robert Capa's Omaha Beach by Dominique Bertail...,2015-03-08T10:35:01+00:00,en,,,0,,[http://bandedessinee.blog.lemonde.fr/2014/06/...,[bandedessinee.blog.lemonde.fr],0,1
39,627427921624481792,794010396,600cals per day for 7 days reverses T2 diabete...,2015-08-01T10:37:28+00:00,en,,,0,,[http://link.springer.com/article/10.1007/s001...,[springer.com],0,1


In [24]:
# keep only the tweets in which at least one domain was found
dfz = dfz[dfz.has_domain > 0]
dfz.shape

(15120, 13)

In [25]:
# keep only the tweets WITHOUT a known news domain
# NOTE(review): the recorded output has the same row count as the previous cell,
# i.e. this filter removed nothing in that run - verify the run order / the news flag.
dfz = dfz[dfz.contains_news == 0]
dfz.shape

(15120, 13)

In [30]:
# Silence warnings for the rest of the session; simplefilter('ignore') is not
# limited to a single warning category.
import warnings; warnings.simplefilter('ignore')
# response = HTMLSession().get(link, verify=False) will generate warnings

In [None]:
# import json

# # short_url =[]
# with open("short_url.json") as json_file:
#     short_url = json.load(json_file)
# urls_list = pd.DataFrame(short_url).url.tolist()

# x = 0
# for c, i in enumerate(dfz.urls[x:]):
#     print(c + x, i)
#     if (c + x + 1) % 100 == 0:
#         print(f"{(c+1)} / {len(dfz.urls)}")
#     for link in i:
#         print(c+x)
#         if link in urls_list:
#             pass

#         else:
#             try:
#                 char_to_rem = "',)\"!"
#                 for char in char_to_rem:
#                     link = link.replace(char, "")
#                 if link.split(".")[-1] == "pdf":
#                     short_url.append(
#                         {
#                             "url": link,
#                             "links": i,
#                             "idx": c+x,
#                             "error": "pdf",
#                         }
#                     )
#                 else:
#                     domain = get_domain(link)
#                     response = HTMLSession().get(link, verify=False)
#                     e_link = response.url
#                     response.html.find('title', first=True).text
#                     dom_expanded = get_domain(e_link)
#                     short_url.append(
#                         {
#                             "url": link,
#                             "title": response.html.find('title', first=True).text,
#                             "short_url": domain,
#                             "long_url": dom_expanded,
#                             "is_news": (dom_expanded in news_domains),
#                             "diff_url": (dom_expanded != domain),
#                         }
#                     )
#             except Exception as ex:
#                 short_url.append(
#                     {
#                         "url": link,
#                         "links": i,
#                         "idx": c+x,
#                         "error": str(ex),
#                     }
#                 )
#     with open("short_url.json", "w") as file:
#         file.write(json.dumps(short_url, indent=4))

## Analyse short links

In [26]:
# load the saved short-url records from short_url.json
with open("short_url.json") as json_file:
    short_url = json.load(json_file)

In [27]:
short_url = pd.DataFrame.from_dict(short_url)
# Collect the short-url domains whose expanded url was a news site at least once.
# The explicit `== True` comparison also rejects rows whose `is_news` is missing.
expands_to_news = short_url["is_news"] == True
short_url_news = short_url.loc[expands_to_news, "short_url"].unique().tolist()

In [28]:
# Show the short-url domains that expanded to a news site at least once.
short_url_news

['bloom.bg',
 'nyti.ms',
 'cnb.cx',
 'rdcu.be',
 'trib.al',
 'ti.me',
 'on.theatln.tc',
 'reut.rs',
 'zd.net',
 'afp.com',
 'ow.ly',
 'google.com',
 'tcrn.ch',
 'st.news',
 'wp.me',
 'thr.cm',
 'apne.ws',
 'bbc.in',
 'petitions.whitehouse.gov',
 'dlvr.it',
 'action.consumerreports.org',
 'politi.co',
 'econ.st',
 'washex.am',
 'str.sg',
 'jtim.es',
 'google.it',
 'cnn.it',
 'abc7ne.ws',
 'wapo.st',
 'go.usa.gov',
 'vntyfr.com',
 'b-gat.es',
 'nie.mn',
 'mtr.cool',
 'propub.li',
 'to.pbs.org',
 'ellemag.co',
 'mitsln.co',
 'hill.cm',
 'cos.lv',
 'ift.tt',
 '1843m.ag',
 'bos.gl',
 'nzzl.us',
 'tdrt.io',
 'wdrb.news',
 'nytimes.com',
 '53eig.ht',
 'tmz.me',
 'ja.ma',
 'dailym.ai',
 'mol.im',
 'yhoo.it',
 'nationalenquirer.com',
 'nvda.ws',
 'wired.trib.al',
 'sabahdai.ly',
 'lnkd.in',
 'nym.ag',
 'cnet.co',
 'buildbackbetter.gov',
 'WhiteHouse.gov',
 'pewrsr.ch',
 '808ne.ws',
 'sc.mp',
 'cntrvlr.co',
 'on.mktw.net',
 'zcu.io',
 'ibm.co',
 'Whitehouse.gov',
 'whitehouse.gov.',
 'cbsn.ws',


In [32]:
# keep only the rows whose short-url domain expanded to a news url at least once
# `.copy()` makes `df` an independent frame, so the column assignment below
# does not write into a slice of `short_url` (chained assignment).
df = short_url[short_url.short_url.isin(short_url_news)].copy()
df["is_news"] = df.is_news.astype(int)

In [33]:
# Preview the filtered short-url records.
df.head()

Unnamed: 0,url,title,short_url,long_url,is_news,diff_url,idx,error,links
11,http://bloom.bg/2iqJnMo,This Mile High Club Will Come With Drinks and ...,bloom.bg,bloomberg.com,1,True,,,
12,https://nyti.ms/2nMAShX,How Uber Uses Psychological Tricks to Push Its...,nyti.ms,nytimes.com,1,True,,,
17,http://ow.ly/QHDf30fzBjh,Carme Torras Genís,ow.ly,iri.upc.edu,0,True,,,
66,https://nyti.ms/2ZsakYf,Humans Are Impetuous and Shortsighted. Can We ...,nyti.ms,nytimes.com,1,True,,,
124,https://cnb.cx/2Wya3zC,NY Gov. Cuomo says he won't sacrifice human li...,cnb.cx,cnbc.com,1,True,,,


In [34]:
# calculate the percentage of times each short url domain leads to a news site
# numeric_only=True: sum only the numeric columns (`is_news`, `idx`); without it
# pandas >= 2.0 also tries to sum the text columns (url, title, ...).
dfgb = df.groupby(by=["short_url"]).sum(numeric_only=True)
# total number of links seen per short-url domain
s = df.short_url.value_counts()
df = dfgb.merge(s.rename("all"), left_index=True, right_index=True)
df.index.name = "short_url"
df.reset_index(inplace=True)
df["perc"] = df["is_news"] / df["all"] * 100

In [35]:
# Short-url domains that lead to a news site less than 50% of the time.
df[df["perc"] < 50]

Unnamed: 0,short_url,is_news,idx,all,perc
14,bddy.me,1,0.0,3,33.333333
27,dlvr.it,7,0.0,23,30.434783
28,dpmd.ai,1,0.0,5,20.0
34,go.usa.gov,1,0.0,4,25.0
35,google.com,20,0.0,206,9.708738
39,ift.tt,5,0.0,15,33.333333
42,lnkd.in,1,0.0,124,0.806452
49,nvda.ws,4,0.0,50,8.0
56,ow.ly,11,0.0,87,12.643678


In [36]:
# google.com links that expanded to a different domain which is a news site.
short_url[
    (short_url.short_url == "google.com")
    & (short_url.diff_url == True)
    & (short_url.is_news == True)
]

Unnamed: 0,url,title,short_url,long_url,is_news,diff_url,idx,error,links
724,https://www.google.com/amp/s/www.bbc.com/news/...,Hayabusa-2: Capsule with asteroid samples in '...,google.com,bbc.com,True,True,,,
1183,https://www.google.com/amp/s/www.vanityfair.co...,Serenity Sets a New Bar for Ridiculous Movie T...,google.com,vanityfair.com,True,True,,,
1348,https://www.google.com/amp/s/www.washingtonpos...,- The Washington Post,google.com,washingtonpost.com,True,True,,,
3277,https://www.google.com/amp/s/www.techrepublic....,The 50 most overused business cliches - TechRe...,google.com,techrepublic.com,True,True,,,
4193,https://www.google.com/amp/s/www.wsj.com/amp/a...,"A California Plan to Chase Away the Rich, Then...",google.com,wsj.com,True,True,,,
4209,https://www.google.com/amp/s/news.sky.com/stor...,COVID-19:Variant found in UK may be more deadl...,google.com,sky.com,True,True,,,
4618,https://www.google.com/amp/s/wcfcourier.com/ne...,"After discovering a love for computers, Cedar ...",google.com,wcfcourier.com,True,True,,,
5821,https://www.google.com/amp/s/www.wlky.com/amp/...,Site Not Available,google.com,wlky.com,True,True,,,
5832,https://www.google.com/amp/s/www.cnn.com/style...,"Italy to build 1,500 pop-up vaccine pavilions,...",google.com,cnn.com,True,True,,,
6614,https://www.google.com/amp/s/www.zdnet.com/goo...,Anthropologist 'confirms' Apple is a religion ...,google.com,zdnet.com,True,True,,,


In [38]:
# Sample tweet text containing a Google AMP link (note: `url` holds the whole tweet text here).
url = "We went into space, found an asteroid, dug up some rocks, and flew it back to Australia. #longhumanity 🚀🚀 https://www.google.com/amp/s/www.bbc.com/news/amp/science-environment-55201662"

In [39]:
from data import rem_short_links
# I added removal of the Google AMP prefix to the rem_short_links function in the data module.
# Note: this import replaces the `rem_short_links` defined earlier in this notebook.
rem_short_links(url)

'We went into space, found an asteroid, dug up some rocks, and flew it back to Australia. #longhumanity 🚀🚀 https://www.bbc.com/news/amp/science-environment-55201662'

In [40]:
# create the list of short url domains that lead to a news site more than 50% of the time
# NOTE(review): this comment used to say 80%, but the code filters on `> 50` - confirm the intended threshold.
short_url_list = df[df["perc"] > 50].short_url.tolist()

In [41]:
# Show the resulting list of short-url domains.
short_url_list

['1843m.ag',
 '4NN.cx',
 '53eig.ht',
 '808ne.ws',
 'WhiteHouse.gov',
 'Whitehouse.gov',
 'abc30.tv',
 'abc7.la',
 'abc7ne.ws',
 'action.consumerreports.org',
 'afp.com',
 'apne.ws',
 'b-gat.es',
 'bbc.in',
 'bloom.bg',
 'bos.gl',
 'buildbackbetter.gov',
 'cbsn.ws',
 'cityjourn.al',
 'cnb.cx',
 'cnet.co',
 'cnn.it',
 'cntrvlr.co',
 'cos.lv',
 'dailym.ai',
 'econ.st',
 'ellemag.co',
 'engt.co',
 'google.it',
 'hill.cm',
 'ja.ma',
 'jtim.es',
 'mol.im',
 'mtr.cool',
 'natgeo.com',
 'nationalenquirer.com',
 'nie.mn',
 'nym.ag',
 'nyti.ms',
 'nytimes.com',
 'nzzl.us',
 'on.mktw.net',
 'on.theatln.tc',
 'p4k.in',
 'petitions.whitehouse.gov',
 'pewrsr.ch',
 'propub.li',
 'rdcu.be',
 'reut.rs',
 'rol.st',
 'sabahdai.ly',
 'sc.mp',
 'slate.trib.al',
 'st.news',
 'str.sg',
 'tcrn.ch',
 'tdrt.io',
 'thr.cm',
 'ti.me',
 'tmz.me',
 'to.pbs.org',
 'trib.al',
 'vntyfr.com',
 'vult.re',
 'wapo.st',
 'washex.am',
 'wdrb.news',
 'whitehouse.gov.',
 'wired.trib.al',
 'yhoo.it',
 'zd.net']