# load data

In [1]:
import json
import re
import sqlite3
from urllib.parse import urlparse

import pandas as pd

In [2]:
# Open the local SQLite database and pull the id and text of every tweet.
cnx = sqlite3.connect("../home.db")
df_tweets = pd.read_sql_query(sql="SELECT id, full_text FROM tweets", con=cnx)
# Independent copy so a second approach can start from the same rows.
df_tweets_2 = df_tweets.copy(deep=True)

In [3]:
# The file is parsed as JSON despite its .txt extension. Decode it explicitly
# as UTF-8 so the result does not depend on the platform's default encoding.
with open('news_domains.txt', 'r', encoding='utf-8') as f:
    news_domains = json.load(f)

# utils

In [4]:
def find_url(tweet):
    """Return all substrings of ``tweet`` that look like links.

    A match starts at the text "http" and runs up to (not including) the next
    whitespace character, so both http and https links are returned.

    Args:
        tweet: The tweet text to scan.

    Returns:
        A list of the matched substrings in order of appearance; empty when
        nothing matches.
    """
    # Raw string: in a normal string literal the backslash-S sequence is an
    # invalid escape and triggers a warning on recent Python versions.
    return re.findall(r"http\S+", tweet)

def clean_links(tweet):
    """Strip link-shortener and Twitter links from ``tweet``.

    For each of the hosts bit.ly, t.co, buff.ly and twitter.com, every
    occurrence of the host followed by a slash and a run of non-whitespace
    characters is deleted. Text in front of the host (for example the
    "https://" scheme) is left in place.

    Args:
        tweet: The text (or single URL) to clean.

    Returns:
        The text with the matching fragments removed.
    """
    # re.escape makes the dots literal. Unescaped, "bit.ly" would also match
    # unrelated text such as "bitxly" and strip parts of legitimate URLs.
    for host in ("bit.ly", "t.co", "buff.ly", "twitter.com"):
        tweet = re.sub(re.escape(host) + r"/\S+", "", tweet)
    return tweet

def get_domain(url):
    """Return the host of ``url`` with its left-most label dropped when long.

    Hosts made of three or more dot-separated labels lose the first label
    (``www.cbc.ca`` becomes ``cbc.ca``); hosts with one or two labels, and the
    empty host, are returned unchanged.
    """
    host = urlparse(url).netloc
    # Fewer than two dots means at most two labels: nothing to strip.
    if host.count(".") < 2:
        return host
    _, _, remainder = host.partition(".")
    return remainder
    
def remove_empty_str(l):
    """Remove every zero-length item from ``l`` in place.

    Args:
        l: A list of sized items (strings in this notebook).

    Returns:
        The same list object, now without empty items.
    """
    # Rebuild the contents with slice assignment instead of calling
    # l.remove() while iterating: removing during iteration skips the element
    # after each removal, so consecutive empty strings were left behind.
    l[:] = [item for item in l if len(item) != 0]
    return l

# Function operating on the whole DataFrame

In [5]:
def find_news(df):
    """Add a ``contains_news`` column counting news links in each tweet.

    For every value of ``df["full_text"]`` the links are extracted with
    ``find_url``, cleaned with ``clean_links`` and reduced to a domain with
    ``get_domain``. ``contains_news`` is the number of resulting non-empty
    domains that appear in the notebook-level ``news_domains``; a tweet with
    two news links gets 2.

    NOTE: a later cell defines another function with the same name, which
    replaces this one in the kernel once that cell has run.

    Args:
        df: DataFrame with a ``full_text`` column of strings.

    Returns:
        The same DataFrame object, modified in place: ``contains_news`` is
        added and no other column is changed.
    """
    # Assumes news_domains is a collection of hashable domain strings; a set
    # makes each membership test constant-time.
    known_domains = set(news_domains)

    def count_news_links(text):
        domains = (get_domain(clean_links(url)) for url in find_url(text))
        # Links that were fully stripped by clean_links yield an empty domain
        # and are ignored.
        return sum(1 for domain in domains if domain and domain in known_domains)

    # Computing the count per row avoids building one helper column per link.
    # That approach failed on an empty frame (no maximum link count) and
    # silently produced zeros when the frame's index was not 0..n-1.
    df["contains_news"] = df["full_text"].apply(count_news_links).astype(int)
    return df


In [6]:
%%time
df_tweets = find_news(df_tweets)

CPU times: user 516 ms, sys: 8.41 ms, total: 525 ms
Wall time: 525 ms


In [7]:
df_tweets[df_tweets["contains_news"] > 0]

Unnamed: 0,id,full_text,contains_news
96,986941808348409856,My comments in The Guardian on implementation ...,1
150,1083458327143739392,Cyber insurance experts agree they're not sure...,1
188,1121030235108118528,@wmarybeard @Lord_Keynes2 @Steve_Sailer The Yo...,1
196,1129155364404858880,Peasants in medieval England ate a diet of mea...,1
222,1153145699086680069,The Finns know the score: regular sauna usage ...,1
...,...,...,...
50167,1350121300925313028,"Individual contributions dominate in Canada, a...",1
50197,1350126505339346957,A Webflow founder posts how Hackernews saved t...,1
50209,1350129672529481728,"Sources: at an all-hands staff meeting, incomi...",2
50210,1350129720311115777,"In 10 years: In order to train our new model, ...",1


In [16]:
df_tweets.iloc[50167].full_text

'Individual contributions dominate in Canada, and Canadians generally think that strict limits on corporate money in politics are good. But then... Canada should perhaps be more subject to extreme political canadidates than the USA. /2\nhttps://www.cbc.ca/news/politics/elections-canada-data-donors-increase-1.5322682'

# Make the function work on a single tweet

In [9]:
# Re-run the same id/full_text query as the initial load to get a fresh frame
# for the per-tweet variant below.
df_tweets_2 = pd.read_sql_query("SELECT id, full_text FROM tweets", cnx)

In [10]:
tweet1 = df_tweets_2.iloc[0]
print(tweet1.full_text)

I wish I had kept my 1,700 BTC @ $0.06 instead of selling them at $0.30, now that they're $8.00! #bitcoin


In [11]:
def find_news(tweet_full_text):
    """Return 1 if the tweet links to a known news domain, otherwise 0.

    Links are extracted with ``find_url``, cleaned with ``clean_links`` and
    reduced to domains with ``get_domain``. The result is 1 as soon as one of
    the domains is found in the notebook-level ``news_domains``.
    """
    cleaned_urls = [clean_links(url) for url in find_url(tweet_full_text)]
    candidates = remove_empty_str([get_domain(url) for url in cleaned_urls])
    # any() stops at the first hit; int() turns the boolean into the 0/1 flag.
    return int(any(domain in news_domains for domain in candidates))

In [12]:
# find_news returns the integer 0 or 1, never the object True, so an identity
# test against True would be False for every tweet. Convert to bool instead.
bool(find_news(tweet1.full_text))

False

In [13]:
df_tweets_2

Unnamed: 0,id,full_text
0,70261648811761665,"I wish I had kept my 1,700 BTC @ $0.06 instead..."
1,303962112307363840,If corporate America built its own low orbit i...
2,320150923336892416,These two books contain the sum total of all h...
3,334391242395906048,I really do enjoy following @Aelkus.
4,380578576283611136,I'm just going to fill my van with dildos and ...
...,...,...
50269,1350138274984165376,RT @InsallArch: Great to be working with @Glen...
50270,1350138529985257473,RT @lindsey: Was campus real? I had an office...
50271,1350138537262374925,this is what they took from you https://twitte...
50272,1350138580048506880,1/24th of the year is already gone. Don’t wait.


In [14]:
%%time
df_tweets_2["contains_news"] = df_tweets_2.full_text.apply(find_news)

CPU times: user 1.98 s, sys: 4.73 ms, total: 1.98 s
Wall time: 1.99 s


In [15]:
df_tweets_2[df_tweets_2["contains_news"] > 0]

Unnamed: 0,id,full_text,contains_news
96,986941808348409856,My comments in The Guardian on implementation ...,1
150,1083458327143739392,Cyber insurance experts agree they're not sure...,1
188,1121030235108118528,@wmarybeard @Lord_Keynes2 @Steve_Sailer The Yo...,1
196,1129155364404858880,Peasants in medieval England ate a diet of mea...,1
222,1153145699086680069,The Finns know the score: regular sauna usage ...,1
...,...,...,...
50167,1350121300925313028,"Individual contributions dominate in Canada, a...",1
50197,1350126505339346957,A Webflow founder posts how Hackernews saved t...,1
50209,1350129672529481728,"Sources: at an all-hands staff meeting, incomi...",1
50210,1350129720311115777,"In 10 years: In order to train our new model, ...",1
