# load data

In [13]:
import sqlite3
import pandas as pd
import re
from urllib.parse import urlparse

In [14]:
from datetime import date
from datetime import timedelta

## load tweets older then two weeks

In [15]:
def load_tweets(db_path, days):
    time_delta = date.today() - timedelta(days=days)
    cnx = sqlite3.connect(db_path)
    query = f"SELECT id,user, full_text, created_at, lang, quoted_status, in_reply_to_status_id FROM tweets WHERE created_at < '{str(time_delta)}'"
    #TODO add restrain, to remove tweets I liked, but for that I need to setup another cron job too.
    df = pd.read_sql_query(
        query,
        cnx,
    )
    return df


In [16]:
df_tweets = load_tweets("../home.db", days=21)

# utils

In [27]:
for col in df_tweets.columns:
    if df_tweets[col].dtype == "float64":
        df_tweets[col] = df_tweets[col].fillna(0).astype(np.int64)

In [28]:
df_tweets.tail()

Unnamed: 0,id,user,full_text,created_at,lang,quoted_status,in_reply_to_status_id,contains_news,all_news
34519,1345156352956514307,14622002,@niedakh Yes! Out of maybe 5 places we stayed ...,2021-01-01T23:54:00+00:00,en,0,1.3451555351680243e+18,0,0
34520,1345156542295904256,2343198944,"Oh weird, twitter doesn't bring up the list wh...",2021-01-01T23:54:46+00:00,en,0,1.345155313725563e+18,0,0
34521,1345157369903239168,14379530,"""in a well-run civilization, the first covid v...",2021-01-01T23:58:03+00:00,en,0,,0,0
34522,1345157575256363008,1025982573665648640,"You may remember http://whatdayisitagain.org, ...",2021-01-01T23:58:52+00:00,en,0,,0,0
34523,1345157722002444289,1416500532,The internet is magic https://twitter.com/dotS...,2021-01-01T23:59:27+00:00,en,1345157575256363008,,0,0


In [29]:
# df_qt = df_tweets[["id", "full_text"]].copy()
# df_qt.columns = ["quoted_status", "quoted_text"]
# df_tweets = df_tweets.merge(df_qt, on="quoted_status", how="left")

# df_r = df_tweets[["id", "full_text"]].copy()
# df_r.columns = ['in_reply_to_status_id', 'reply_text']
# df_tweets = df_tweets.merge(df_r, on="in_reply_to_status_id", how="left")

# df_tweets["all_text"] = (
#     df_tweets["full_text"].astype(str)
#     + df_tweets["quoted_status"].astype(str)
#     + df_tweets["in_reply_to_status_id"].astype(str)
# )

# df.drop(["in_reply_to_status_id"], axis=1, inplace=True)
# df.drop(["quoted_status"], axis=1, inplace=True)

In [30]:
def find_url(tweet):
    return re.findall(r"http\S+", tweet)


def clean_links(tweet):
    tweet = re.sub(r"bit.ly/\S+", "", tweet)
    tweet = re.sub(r"t.co/\S+", "", tweet)
    tweet = re.sub(r"buff.ly/\S+", "", tweet)
    tweet = re.sub(r"twitter.com/\S+", "", tweet)
    return tweet


def get_domain(url):
    domain = urlparse(url).netloc
    dot_split = domain.split(".")
    if len(dot_split) > 2:
        return ".".join(dot_split[1:])
    else:
        return domain


def remove_empty_str(l):
    for i in l:
        if len(i) == 0:
            l.remove(i)
    return l

# function for df

In [32]:
def find_news(df, news_domains_list):

    df["urls"] = df["full_text"].apply(find_url)
    df["urls"] = df.urls.apply(lambda x: [clean_links(d) for d in x])
    df["domains"] = df.urls.apply(lambda x: [get_domain(d) for d in x])
    df["domains"] = df.domains.apply(remove_empty_str)
    df.drop(["urls"], axis=1, inplace=True)

    new_columns_list = []
    max_nr_dom = df.domains.str.len().max()
    for i in range(max_nr_dom):
        new_columns_list.append(f"domain{i+1}")
    df[new_columns_list] = pd.DataFrame(df.domains.tolist())

    for col in new_columns_list:
        df[col] = df[col].isin(news_domains_list)

    df.drop(["domains"], axis=1, inplace=True)

    df["contains_news"] = df[new_columns_list].sum(axis=1)
    df["contains_news"] = df.contains_news.apply(lambda x: x if x == 0 else 1)
    df.drop(new_columns_list, axis=1, inplace=True)

    return df

def drop_contains(df, column_name, word_list):
    for string in word_list:
        df["lower"] = df["full_text"].str.lower()
        df = df[df["lower"].str.contains(string)]
        df.drop(["lower"], axis=1, inplace=True)
    return df

In [33]:
with open("news_domains.txt", "r") as f:
    news_domains = json.loads(f.read())

In [34]:
%%time
df_tweets = find_news(df_tweets, news_domains)

CPU times: user 426 ms, sys: 7.55 ms, total: 433 ms
Wall time: 433 ms


In [36]:
def news_in_qt_rt(df):
    df["all_news"] = df["contains_news"].copy()

    df_qt = df[["id", "contains_news"]].copy()
    df_qt.columns = ["quoted_status", "qt_news"]
    df = df.merge(df_qt, on="quoted_status", how="left")
    df["qt_news"] = df["qt_news"].fillna(0).astype(np.int64)
    df["all_news"] = df["qt_news"].astype(np.int64) + df["contains_news"].astype(np.int64)
    # df.drop(["quoted_status"], axis=1, inplace=True)

    df_qt = df[["id", "contains_news"]].copy()
    df_qt.columns = ["in_reply_to_status_id", "rt_news"]
    df = df.merge(df_qt, on="in_reply_to_status_id", how="left")
    df["rt_news"] = df["rt_news"].fillna(0).astype(np.int64)
    df["all_news"] = df["rt_news"].astype(np.int64) + df["all_news"].astype(np.int64)
    
    return df


In [38]:
df = df_tweets.copy()

In [39]:
df.head()

Unnamed: 0,id,user,full_text,created_at,lang,quoted_status,in_reply_to_status_id,contains_news,all_news
0,2627602600,21454322,"Went on a USO trip to Guantanamo Bay, Cuba a f...",2009-07-14T05:15:27+00:00,en,0,,0,0
1,70261648811761665,5416652,"I wish I had kept my 1,700 BTC @ $0.06 instead...",2011-05-16T22:57:37+00:00,en,0,,0,0
2,177008089394970624,5110861,"In 1996 a man took a NZ radio station hostage,...",2012-03-06T12:29:51+00:00,en,0,,1,1
3,234002950274560000,108471631,What is she thinking? https://twitter.com/MELA...,2012-08-10T19:07:06+00:00,en,0,,0,0
4,281811460718477312,16298441,did you know that the bible doesn't actually c...,2012-12-20T17:21:02+00:00,en,0,,0,0


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34524 entries, 0 to 34523
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     34524 non-null  int64 
 1   user                   34524 non-null  int64 
 2   full_text              34524 non-null  object
 3   created_at             34524 non-null  object
 4   lang                   34524 non-null  object
 5   quoted_status          34524 non-null  int64 
 6   in_reply_to_status_id  8342 non-null   object
 7   contains_news          34524 non-null  int64 
 8   all_news               34524 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 2.4+ MB


In [42]:
df["in_reply_to_status_id"] = df["in_reply_to_status_id"].fillna(0).astype(np.int64)

df_qt = df[["id", "contains_news"]].copy()
df_qt.columns = ["in_reply_to_status_id", "rt_news"]
df = df.merge(df_qt, on="in_reply_to_status_id", how="left")
df["rt_news"] = df["rt_news"].fillna(0).astype(np.int64)
df["all_news"] = df["rt_news"].astype(np.int64) + df["all_news"].astype(np.int64)

In [None]:
df = df.merge(df_qt, on="in_reply_to_status_id", how="left")

In [37]:
%%time
df_tweets = news_in_qt_rt(df_tweets)

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [None]:
print(df_tweets.shape)

In [None]:
df_tweets[df_tweets.id == 1197689942396174336]

# remove seen

In [None]:
seen_tweets = pd.read_csv("seen.csv")

In [None]:
seen_tweets.head()

In [None]:
seen_tweets.info()

In [None]:
seen_tweets.drop_duplicates(inplace=True)

In [None]:
df_tweets[df_tweets["id"].isin(seen_tweets["tweet_id"].tolist())].shape

In [None]:
%%time
print(df_tweets.shape)
df_tweets = df_tweets[~df_tweets["id"].isin(seen_tweets["tweet_id"].tolist())]

In [None]:
# fiter out non english tweets

In [None]:
df_tweets.shape

In [None]:
df_tweets = df_tweets[df_tweets["lang"] == "en"]
df_tweets.shape

# suffle them and filter out news

In [None]:
df_tweets[df_tweets["contains_news"] == 0].shape

In [None]:
def drop_contains(df, column_name, str_list, lower=True):
    for string in str_list:
        if lower:
            df["filter"] = df[column_name].str.lower().copy()
        if not lower:
            df["filter"] = df[column_name].copy()
        df = df_tweets[~df["filter"].str.contains(string)]
        df = df.drop(["filter"], axis=1).copy()
    return df

In [None]:
df_tweets.shape

In [None]:
🍿

In [None]:
def show_contains(df, column_name, str_list, lower=True):
    for string in str_list:
        if lower:
            df["filter"] = df[column_name].str.lower().copy()
        if not lower:
            df["filter"] = df[column_name].copy()
        df = df_tweets[df["filter"].str.contains(string)]
        df = df.drop(["filter"], axis=1).copy()
    return df

In [None]:
str_to_show = ["🍿"]
show_contains(df_tweets, column_name="full_text", str_list = str_to_show, lower=False)

In [None]:
str_to_drop = ["breaking:"]
df_tweets = drop_contains(df_tweets, column_name="full_text", str_list = str_to_drop)
df_tweets.shape

In [None]:
str_to_drop = ["GOP"]
df_tweets = drop_contains(df_tweets, column_name="full_text", str_list = str_to_drop, lower=False)
df_tweets.shape

In [None]:
string = "breaking:"

In [None]:
df_tweets["lower"] = df_tweets["full_text"].str.lower().copy()
df_tweets = df_tweets[~df_tweets["lower"].str.contains(string)]
df_tweets.drop(["lower"], axis=1, inplace=True)

In [None]:
df_tweets

In [None]:
df_tweets["full_text"].str.lower()

In [None]:
to_custom_news_feed = (
    df_tweets[df_tweets["contains_news"] == 0]
    .sample(frac=1)
    .reset_index(drop=True)[:1000]
)
to_custom_news_feed.shape

In [None]:
to_custom_news_feed

In [None]:
# to_custom_news_feed[["id", "user"]].to_csv("batch_to_add.csv")