# load data

In [1]:
import json
import re
import sqlite3
from urllib.parse import urlparse

import pandas as pd

In [2]:
from datetime import date
from datetime import timedelta

## load tweets older than two weeks

In [3]:
# Calendar date exactly two weeks before today (local date, no time component).
two_weeks_old = date.today() - timedelta(weeks=2)

In [4]:
# Read every tweet created before the cutoff date from the local SQLite file.
# created_at is compared as text against the ISO-formatted cutoff date.
cnx = sqlite3.connect("../home.db")
query = f"SELECT id,user, full_text, created_at, lang FROM tweets WHERE created_at < '{str(two_weeks_old)}'"
print(query)  # echo the final SQL so the cutoff in use is visible in the output
df_tweets = pd.read_sql_query(sql=query, con=cnx)

SELECT id,user, full_text, created_at, lang FROM tweets WHERE created_at < '2021-01-08'


In [5]:
df_tweets.tail()

Unnamed: 0,id,user,full_text,created_at,lang
40195,1347331166018277379,466880049,Bezos needs to hire Musk as a comp coach. http...,2021-01-07T23:55:56+00:00,en
40196,1347331537704943616,2986693873,RT @MONSTERPALOOZA1: #Artist @sazenlee human/...,2021-01-07T23:57:25+00:00,en
40197,1347331740931665923,242973461,"If you work for Facebook, you can in fact leav...",2021-01-07T23:58:13+00:00,en
40198,1347331762842599424,1051396218,Josh Hawley is the slicker demagogue we’ve bee...,2021-01-07T23:58:19+00:00,en
40199,1347331936147165185,492443928,It seems clear we’ve lost the plot when tape i...,2021-01-07T23:59:00+00:00,en


# utils

In [6]:
def find_url(tweet):
    """Return all URL-like substrings of ``tweet`` as a list of strings.

    A match starts at the literal text "http" and runs up to (not including)
    the next whitespace character. The list is empty when nothing matches.
    """
    return [hit.group(0) for hit in re.finditer(r"http\S+", tweet)]


def clean_links(tweet):
    """Remove bit.ly, t.co, buff.ly and twitter.com link targets from ``tweet``.

    For each of those hosts, the text from the host name up to the next
    whitespace is deleted. Anything in front of the host (for example a
    ``https://`` scheme or a ``www.`` sub-domain) is left in place.

    The host name is matched literally and must not be the tail of a longer
    host label, so strings such as ``microsoft.co/x``, ``rabbit.ly/x`` or
    ``bitxly/x`` are left untouched.

    Parameters:
        tweet: the text (or single URL) to clean.

    Returns:
        The cleaned string.
    """
    hosts = ("bit.ly", "t.co", "buff.ly", "twitter.com")
    for host in hosts:
        # re.escape makes the dot literal; the look-behind rejects matches that
        # start in the middle of a longer label (e.g. the "t.co" in "microsoft.co").
        pattern = r"(?<![\w-])" + re.escape(host) + r"/\S+"
        tweet = re.sub(pattern, "", tweet)
    return tweet


def get_domain(url):
    """Return the network location of ``url`` with its first label dropped.

    When the netloc contains at least two dots (three or more labels), the
    leading label is removed, e.g. ``www.example.com`` -> ``example.com``.
    Otherwise the netloc is returned unchanged. A URL without a ``//`` netloc
    part yields an empty string.

    NOTE: the rule is purely positional; ``a.b.c.d`` becomes ``b.c.d`` and
    multi-part suffixes are not treated specially.
    """
    host = urlparse(url).netloc
    if host.count(".") < 2:
        return host
    _first_label, _dot, remainder = host.partition(".")
    return remainder


def remove_empty_str(l):
    """Remove every zero-length item from the list ``l`` in place.

    The previous implementation called ``l.remove`` while iterating over
    ``l``, which skips the element following each removal and therefore left
    adjacent empty strings behind.

    Parameters:
        l: a list of sized items (strings in this notebook).

    Returns:
        The same list object, now without empty items.
    """
    # Slice assignment keeps the caller's list object while replacing its content.
    l[:] = [item for item in l if len(item) != 0]
    return l

# function for df

In [7]:
def find_news(df, news_domains_list):
    """Flag each tweet in ``df`` that links to one of the given news domains.

    Every URL found in ``full_text`` is cleaned with ``clean_links``, reduced
    to a domain with ``get_domain`` and compared against ``news_domains_list``.

    Parameters:
        df: DataFrame with a ``full_text`` column of strings. It is modified
            in place: a ``contains_news`` column is added (or overwritten).
        news_domains_list: iterable of domain strings to match against.

    Returns:
        The same ``df`` object. ``contains_news`` is 1 when at least one URL
        of the tweet resolves to a listed domain, otherwise 0.
    """
    # A set gives constant-time lookups. Working row by row also avoids the
    # earlier temporary per-domain columns, which were assigned by index label
    # (misaligned on a non-default index) and broke on an empty frame.
    news_domains = set(news_domains_list)

    def _links_to_news(text):
        """Return 1 if any URL in ``text`` points at a news domain, else 0."""
        for url in find_url(text):
            domain = get_domain(clean_links(url))
            # Cleaned shortener links collapse to an empty domain; skip them.
            if domain and domain in news_domains:
                return 1
        return 0

    # astype(int) keeps an integer column even when the frame has no rows.
    df["contains_news"] = df["full_text"].apply(_links_to_news).astype(int)
    return df

In [8]:
# news_domains.txt holds a JSON document; its content is passed to find_news
# as the collection of domains to match.
# Decode explicitly as UTF-8 so the result does not depend on the platform default.
with open("news_domains.txt", "r", encoding="utf-8") as f:
    news_domains = json.load(f)

In [9]:
%%time
df_tweets = find_news(df_tweets, news_domains)

CPU times: user 454 ms, sys: 3.48 ms, total: 457 ms
Wall time: 457 ms


In [10]:
df_tweets.head()

Unnamed: 0,id,user,full_text,created_at,lang,contains_news
0,2627602600,21454322,"Went on a USO trip to Guantanamo Bay, Cuba a f...",2009-07-14T05:15:27+00:00,en,0
1,70261648811761665,5416652,"I wish I had kept my 1,700 BTC @ $0.06 instead...",2011-05-16T22:57:37+00:00,en,0
2,177008089394970624,5110861,"In 1996 a man took a NZ radio station hostage,...",2012-03-06T12:29:51+00:00,en,1
3,234002950274560000,108471631,What is she thinking? https://twitter.com/MELA...,2012-08-10T19:07:06+00:00,en,0
4,281811460718477312,16298441,did you know that the bible doesn't actually c...,2012-12-20T17:21:02+00:00,en,0


# remove seen

In [11]:
seen_tweets = pd.read_csv("seen.csv")

In [12]:
seen_tweets.head()

Unnamed: 0,tweet_id,err_reason
0,1337875267096875015,no_errors
1,1335698027537969155,no_errors
2,1345699105385959435,not_found
3,1336874318882467842,no_errors
4,1337717825629253633,not_found


In [13]:
seen_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id    2500 non-null   int64 
 1   err_reason  2500 non-null   object
dtypes: int64(1), object(1)
memory usage: 39.2+ KB


In [14]:
# Drop rows that are exact duplicates across all columns.
seen_tweets = seen_tweets.drop_duplicates()

In [15]:
# Shape of the subset of loaded tweets whose id appears in the seen list.
df_tweets.loc[df_tweets["id"].isin(seen_tweets["tweet_id"])].shape

(2500, 6)

In [16]:
%%time
print(df_tweets.shape)
# Keep only the tweets whose id is absent from the seen list.
df_tweets = df_tweets.loc[~df_tweets["id"].isin(seen_tweets["tweet_id"])]

(40200, 6)
CPU times: user 7.94 ms, sys: 405 µs, total: 8.35 ms
Wall time: 10.9 ms


In [17]:
# filter out non-English tweets

In [18]:
df_tweets.shape

(37700, 6)

In [19]:
# Keep only rows whose lang value is exactly "en".
df_tweets = df_tweets.loc[df_tweets["lang"].eq("en")]
df_tweets.shape

(34579, 6)

# shuffle them and filter out news

In [20]:
# Shape of the subset of tweets that were not flagged as containing news.
df_tweets.loc[df_tweets["contains_news"].eq(0)].shape

(32910, 6)

In [21]:
# Draw a random batch of tweets that do not link to news.
BATCH_SIZE = 1000  # maximum number of tweets in one batch
SAMPLE_SEED = 42  # fixed so Restart & Run All reproduces the same batch

non_news_tweets = df_tweets[df_tweets["contains_news"] == 0]
# Sample the batch directly instead of shuffling the whole frame and slicing;
# min() keeps this valid when fewer than BATCH_SIZE candidates remain.
to_custom_news_feed = non_news_tweets.sample(
    n=min(BATCH_SIZE, len(non_news_tweets)),
    random_state=SAMPLE_SEED,
).reset_index(drop=True)
to_custom_news_feed.shape

(1000, 6)

In [22]:
to_custom_news_feed

Unnamed: 0,id,user,full_text,created_at,lang,contains_news
0,1337305449041838080,13502732,this is the second VHS I've watched in two day...,2020-12-11T07:57:19+00:00,en,0
1,1343662653395234816,15534471,🆕 blog post: Maximally optimizing image loadin...,2020-12-28T20:58:35+00:00,en,0
2,1336361683791478784,97114171,And of course here's its better logo. https://...,2020-12-08T17:27:08+00:00,en,0
3,1345211663725674496,13502732,DO I LOOK LIKE I KNOW WHAT IRQ THIS USES? http...,2021-01-02T03:33:48+00:00,en,0
4,1346079884083408897,778952476565368832,RT @deonteleologist: Cusset on Left dissolutio...,2021-01-04T13:03:47+00:00,en,0
...,...,...,...,...,...,...
995,1339586879218352134,15919116,"In less than one hour, join us for the annual ...",2020-12-17T15:02:54+00:00,en,0
996,1345111654544117760,47126544,Two books to start the year with :-) https://t...,2021-01-01T20:56:24+00:00,en,0
997,1341089861919358976,13502732,People are spreading this virus because they h...,2020-12-21T18:35:13+00:00,en,0
998,1336051519238664195,1528558014,Not many as thoughtful as Dennis. A great oppo...,2020-12-07T20:54:39+00:00,en,0


In [23]:
# to_custom_news_feed[["id", "user"]].to_csv("batch_to_add.csv")