# tweetfeed

# Data

In [1]:
import os
import pandas as pd
import sys
import re
from urllib.parse import urlparse
import numpy as np


In [2]:
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path + "/tweetfeed")
    
from data import load_tweets,find_news

In [3]:
import tweepy
#auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(wait_on_rate_limit=True)

class Tweet(object):
    def __init__(self, embed_str=None):
        self.embed_str = embed_str

    def _repr_html_(self):
        return self.embed_str
    
def embed_str(idx):
    return api.get_oembed(idx)["html"]

In [4]:
# df = load_tweets("../home.db", days=24)
# df.to_pickle("tweetfeed.pkl")

In [5]:
df = pd.read_pickle("tweetfeed.pkl"); df.shape

(55485, 9)

In [6]:
df.head()

Unnamed: 0,id,user,full_text,created_at,lang,retweeted_status,quoted_status,is_quote_status,in_reply_to_status_id
0,5596520,713143,soo hungry need to find my wife and head to pf...,2007-02-20T02:43:19+00:00,en,,,0,
1,154674522,7193842,Hofstadter’s Law: A task always takes longer t...,2007-07-17T20:45:46+00:00,en,,,0,
2,412145582,4829901,Adobe Updater must update itself before instal...,2007-11-13T21:45:35+00:00,en,,,0,
3,922321981,16298441,no,2008-09-15T17:25:20+00:00,und,,,0,
4,2627602600,21454322,"Went on a USO trip to Guantanamo Bay, Cuba a f...",2009-07-14T05:15:27+00:00,en,,,0,


### Retweets

Retweets should be removed - tweets that are retweeted are in the database, and contain full text of a tweet, icluding links and urls. This is not a case for retweets. 

They have `RT ` in front of the tweet, and are often cut at the end.

Retweets don't add anything to consversation, they just amplify signal.

In [7]:
# Retweet
df[df["id"] == 1340085773203193856].iloc[0].full_text

'RT @SophieCo_RT: Life = Matter + Information - chasing the formula of our existence with physicist and cosmologist, Prof. Paul Davies @Beyo…'

In [8]:
# Retweeted tweet
df[df["id"] == 1339896134291091458].iloc[0].full_text

'Life = Matter + Information - chasing the formula of our existence with physicist and cosmologist, Prof. Paul Davies @Beyond_ASU: https://www.rt.com/shows/sophieco-visionaries/510041-paul-davies-life-definition/ https://twitter.com/SophieCo_RT/status/1339896134291091458/video/1'

In [9]:
# filter out retweets - retweeted tweets are in the database
# so RT only dup
df = df[df["retweeted_status"] == "N/A"] ; df.shape

(44125, 9)

## Reviewed tweets

In [10]:
reviewed_df = pd.read_csv("../tweetfeed/data/seen.csv"); reviewed_df.shape

(7620, 2)

In [11]:
reviewed_df["err_reason"].value_counts()

no_errors          5837
not_found          1641
protected_tweet     142
Name: err_reason, dtype: int64

In [12]:
deleted_tweets = reviewed_df[reviewed_df.err_reason == "not_found"].tweet_id.tolist()
df = df[~df["id"].isin(deleted_tweets)]; df.shape

(43924, 9)

In [13]:
### let's deal only with tweet that are in English

In [14]:
df = df[df["lang"] == "en"]; df.shape

(40433, 9)

## Augmentation

### Finding news related tweets

In [15]:
with open("../tweetfeed/data/news_domains.txt", "r") as f:
    news_domains = json.loads(f.read())
# df = find_news(df, news_domains); df.shape

In [16]:
df_news = df.copy()

In [17]:
def remove_tw_urls(tweet: str) -> str:
    """removes twitter links / urls from tweet"""
    tweet = re.sub(r"https://twitter.com/\S+", "", tweet)
    tweet = re.sub(r"http://twitter.com/\S+", "", tweet)
    tweet = re.sub(r"https://api.twitter.com/\S+", "", tweet)
    tweet = re.sub(r"http://api.twitter.com/\S+", "", tweet)
    return tweet

def rem_short_links(tweet: str) -> str:
    """removes some of short links (bit.ly, buff.ly, t.co) from tweets"""
    tweet = re.sub(r"https://bit.ly/\S+", "", tweet)
    tweet = re.sub(r"http://bit.ly/\S+", "", tweet)
    tweet = re.sub(r"https://buff.ly/\S+", "", tweet)
    tweet = re.sub(r"http://buff.ly/\S+", "", tweet)
    tweet = re.sub(r"https://t.co/\S+", "", tweet)
    tweet = re.sub(r"http://t.co/\S+", "", tweet)
    return tweet

In [18]:
df_news["clean_text"] = df_news["full_text"].apply(remove_tw_urls).apply(rem_short_links)

In [19]:
df_news[df_news["clean_text"].str.contains("twitter")].iloc[0].full_text

'Your twitter timeline as an interface to your mind. Someone liking your tweets can force you to have those thoughts.'

In [20]:
df_news[df_news["clean_text"].str.contains("bit.ly")]

Unnamed: 0,id,user,full_text,created_at,lang,retweeted_status,quoted_status,is_quote_status,in_reply_to_status_id,clean_text


In [21]:
def find_url(tweet: str) -> list:
    """find all urls in string and returns a list of all urls"""
    return re.findall(r"http\S+", tweet)

In [22]:
df_news["urls"] = df_news["clean_text"].apply(find_url) 

In [23]:
df_news.drop(["clean_text"], axis=1, inplace=True)

In [24]:
def get_domain(url: str) -> str:
    """extracts domain from url, returns it"""
    domain = urlparse(url).netloc.replace("www.", "")
    dot_split = domain.split(".")
    if (len(dot_split) > 2) & (
        dot_split[-1] == "com"
    ):  # for links like "edition.cnn.com", but not like "site.co.nz"
        return ".".join(dot_split[1:])
    else:
        return domain
def remove_empty_str(string_list: list) -> list:
    """removes items that are empty strings from the list"""
    for i in string_list:
        if len(string_list) == 0:
            string_list.remove(i)
    return string_list

In [25]:
df_news["domains"] = df_news.urls.apply(lambda x: [get_domain(d) for d in x]).apply(
    remove_empty_str
)
df_news.drop(["urls"], axis=1, inplace=True)

In [26]:
# get max value of domains, expand each one to new column (unpack from list)
new_columns_list = []
max_nr_dom = df_news.domains.str.len().max()
for i in range(max_nr_dom):
    new_columns_list.append(f"domain{i+1}")
df_news.reset_index(drop=True, inplace=True)
df_news[new_columns_list] = pd.DataFrame(df_news.domains.tolist())

for col in new_columns_list:
    df_news[col] = df_news[col].isin(news_domains)

In [27]:
df_news[[
    "id", "domains", "domain1", "domain2", "domain3", "domain4", "domain5", "domain6"
]].head(10)

Unnamed: 0,id,domains,domain1,domain2,domain3,domain4,domain5,domain6
0,5596520,[],False,False,False,False,False,False
1,154674522,[],False,False,False,False,False,False
2,412145582,[],False,False,False,False,False,False
3,2627602600,[],False,False,False,False,False,False
4,35122446658965504,[],False,False,False,False,False,False
5,35466982635601920,[bitcoincharts.com],False,False,False,False,False,False
6,70261648811761665,[],False,False,False,False,False,False
7,71705067115388928,[],False,False,False,False,False,False
8,177008089394970624,[cnn.com],True,False,False,False,False,False
9,193480622533120001,[],False,False,False,False,False,False


In [28]:
df_news[df_news.domain1 == True][[
    "id", "domains", "domain1", "domain2", "domain3", "domain4", "domain5", "domain6"
]].head(10)

Unnamed: 0,id,domains,domain1,domain2,domain3,domain4,domain5,domain6
8,177008089394970624,[cnn.com],True,False,False,False,False,False
53,703286688064221184,[nature.com],True,False,False,False,False,False
69,814110461436235776,[bloom.bg],True,False,False,False,False,False
81,848731022854205440,[nyti.ms],True,False,False,False,False,False
162,986941808348409856,[theguardian.com],True,False,False,False,False,False
225,1051756275585843201,[coindesk.com],True,False,False,False,False,False
241,1065383519419912193,[philly.com],True,False,False,False,False,False
257,1083458327143739392,[ft.com],True,False,False,False,False,False
298,1105013938524106757,[phys.org],True,False,False,False,False,False
305,1109481961180008448,[barrons.com],True,False,False,False,False,False


In [29]:
df_news[df_news.domain4 == True][[
    "id", "domains", "domain1", "domain2", "domain3", "domain4", "domain5", "domain6"
]].head(10) # max domains in tweet is 6, but max *news* domains is 4.

Unnamed: 0,id,domains,domain1,domain2,domain3,domain4,domain5,domain6
8987,1337449293834113025,"[simplecast.com, dailymail.co.uk, insider.com,...",False,True,False,True,False,False
14199,1339626801513996293,"[cargo.one,, Tech.eu), tech.eu, techmeme.com]",False,False,False,True,False,False
30338,1347533968212688896,"[fbi.gov, fbi.gov, fbi.gov, fbi.gov]",True,True,True,True,False,False


In [30]:
df[df["id"] == 1344730628407881729].iloc[0].full_text

'Finally, inspiring works really worth a read:\n+ https://arxiv.org/abs/1911.05248, by @sarahookr \n+ http://arxiv.org/abs/2006.00995, https://arxiv.org/abs/1912.13283,  by @yanaiela et al\n+ https://bair.berkeley.edu/blog/2019/05/13/oltr\n+ https://arxiv.org/abs/1901.11373 by @DaniYogatama et al\n+ https://arxiv.org/abs/2009.07118 by @timo_schick et.'

In [31]:
most_links = (
    df_news.domains.apply(lambda x: len(x)).sort_values(ascending=False).index[0]
) #get index of tweet with most links

In [32]:
Tweet(embed_str(df_news.iloc[most_links]["id"]))

In [33]:
df_news.drop(["domains"], axis=1, inplace=True)
# sum it all up
df_news["contains_news"] = df_news[new_columns_list].sum(axis=1)

In [34]:
df_news["contains_news"].value_counts()

0    38042
1     1714
2      673
3        3
4        1
Name: contains_news, dtype: int64

In [35]:
# we are interest in only a True-False, so let's change that.
df_news["contains_news"] = df_news.contains_news.apply(lambda x: x if x == 0 else 1)
# now we can remove added columns
df_news.drop(new_columns_list, axis=1, inplace=True)

In [36]:
df_news["contains_news"].value_counts()

0    38042
1     2391
Name: contains_news, dtype: int64

In [37]:
def find_news(df: pd.DataFrame, news_domains_list: list) -> pd.DataFrame:
    """Takes DataFrame, and list of domains of news sites.
    Removes from DataFrame rows that contain links to sites from that list.
    Args:
        df (pd.DataFrame): DataFrame to be cleaned
        news_domains_list (list): list of domains of news sites

    Returns:
        pd.DataFrame: DataFrame without tweets linking to news
    """
    df = df.copy()
    df["clean_text"] = (
        df["full_text"].apply(remove_tw_urls).apply(rem_short_links)
    )
    df["clean_text"] = df["clean_text"].apply(rem_short_links)
    df["urls"] = df["clean_text"].apply(find_url)
    df.drop(["clean_text"], axis=1, inplace=True)
    df["domains"] = df.urls.apply(lambda x: [get_domain(d) for d in x])
    df["domains"] = df.domains.apply(remove_empty_str)
    df.drop(["urls"], axis=1, inplace=True)

    # get max value of domains, expand each one to new column (unpack from list)
    new_columns_list = []
    max_nr_dom = df.domains.str.len().max()
    for i in range(max_nr_dom):
        new_columns_list.append(f"domain{i+1}")
    df.reset_index(drop=True, inplace=True)
    df[new_columns_list] = pd.DataFrame(df.domains.tolist())

    for col in new_columns_list:
        df[col] = df[col].isin(news_domains_list)

    df.drop(["domains"], axis=1, inplace=True)
    # sum it all up
    df["contains_news"] = df[new_columns_list].sum(axis=1)
    df["contains_news"] = df.contains_news.apply(lambda x: x if x == 0 else 1)
    # remove added columns
    df.drop(new_columns_list, axis=1, inplace=True)

    return df

In [38]:
df = find_news(df, news_domains); df.shape

(40433, 10)

In [39]:
df_news["contains_news"].value_counts()

0    38042
1     2391
Name: contains_news, dtype: int64

## explore muted accounts, and muted words

In [40]:
with open("../tweetfeed/data/mute_list.txt", "r") as f:
    mute_list = json.loads(f.read())
with open("../tweetfeed/data/mute_list_cs.txt", "r") as f:
    mute_list_cs = json.loads(f.read())
with open("../tweetfeed/data/mutedacc.txt", "r") as f:
    mutedacc = json.loads(f.read())

In [41]:
mute_list_cs

['GOP', 'BREAKING', 'BLM', 'MAGA', 'Rep.']

In [42]:
mute_list[:4]

['breaking:', '🍿', '🚨', 'New Yorker']

In [43]:
mutedacc[:5]

[780557598089109504,
 97114171,
 1282696358034657281,
 1242108820362530820,
 1168694596450299906]

In [44]:
df_mute = df.copy()
m_list = mute_list
column_name = "full_text"
for item in m_list:
    df_mute[item] = df_mute[column_name].str.contains(item)
df_mute["is_mute_list"] = df_mute[mute_list].sum(axis=1)
df_mute["is_mute_list"] = df_mute["is_mute_list"].apply(lambda x: x if x == 0 else 1)
df_mute.drop(mute_list, axis=1, inplace=True)
df_mute[df_mute.is_mute_list > 0].shape

(563, 11)

In [45]:
def is_muted(df, mute_list, column_name, new_col, case=True):
    df_mute = df.copy()
    for item in mute_list:
        df_mute[item] = df_mute[column_name].str.contains(item, case)
    df_mute[new_col] = df_mute[mute_list].sum(axis=1)
    df_mute[new_col] = df_mute[new_col].apply(lambda x: x if x == 0 else 1)
    df_mute.drop(mute_list, axis=1, inplace=True)
    return df_mute

In [46]:
df = is_muted(df, mute_list, "full_text", "is_mute_list", case=False)
df = is_muted(df, mute_list_cs, "full_text", "is_mute_list_cs")
df["mutedacc"] = np.where(df["user"].isin(mutedacc), 1, 0)

In [47]:
columns_list = ["is_mute_list", "is_mute_list_cs", "mutedacc"]
df["muted"] = (
    df[columns_list]
    .sum(axis=1)
    .apply(lambda x: x if x == 0 else 1)
)
df.drop(columns_list, axis=1, inplace=True)

In [48]:
df[["id", "user", "contains_news", "muted"]].head(10)

Unnamed: 0,id,user,contains_news,muted
0,5596520,713143,0,0
1,154674522,7193842,0,0
2,412145582,4829901,0,0
3,2627602600,21454322,0,0
4,35122446658965504,5691532,0,0
5,35466982635601920,15012642,0,0
6,70261648811761665,5416652,0,0
7,71705067115388928,14182218,0,0
8,177008089394970624,5110861,1,0
9,193480622533120001,175624200,0,0


In [49]:
df[(df.contains_news > 0) | (df.muted > 0)].shape #.copy and put into new dataframe

(7456, 11)

In [50]:
reviewed_df = pd.read_csv("../tweetfeed/data/seen.csv"); reviewed_df.shape

(7620, 2)

In [51]:
Tweet(embed_str("177008089394970624"))

In [52]:
seen = reviewed_df[reviewed_df.err_reason == "no_errors"].tweet_id.tolist(); len(seen)

5837

In [53]:
df[df.id.isin(seen) & (df.contains_news > 0) & (df.muted == 0)]

Unnamed: 0,id,user,full_text,created_at,lang,retweeted_status,quoted_status,is_quote_status,in_reply_to_status_id,contains_news,muted
1864,1330677864555896834,14301074,Sex Pistols star Johnny Rotten bitten by a fle...,2020-11-23T01:01:40+00:00,en,,,0,,1,0
2199,1334612372720324611,380648579,Leading American infectious disease scientist ...,2020-12-03T21:36:00+00:00,en,,,0,,1,0
7461,1337034350043553792,3819701,World Economic Forum launches how-to guide on ...,2020-12-10T14:00:04+00:00,en,,,0,,1,0
8634,1337362745621159936,380648579,#UPDATE Pharmaceutical giant AstraZeneca's Rus...,2020-12-11T11:45:00+00:00,en,,,0,1.3373313784306115e+18,1,0
11377,1338235166821675012,2894902813,Where is the flaw in my core critique of econo...,2020-12-13T21:31:41+00:00,en,,,0,,1,0
11541,1338276857775611910,1157604793,Good article https://www.theatlantic.com/heal...,2020-12-14T00:17:21+00:00,en,,,0,,1,0
12922,1339148923240394759,15309804,D.C.-area forecast: Snow and sleet arriving la...,2020-12-16T10:02:38+00:00,en,,,0,,1,0
14107,1339612008182919168,220907703,Many of Amazon's warehouse workers struggle to...,2020-12-17T16:42:46+00:00,en,,,0,,1,0
15009,1339990100629991424,2981803053,We’re on @CNN today! Watch how our waste meter...,2020-12-18T17:45:10+00:00,en,,,0,,1,0
15078,1340065863802773518,15276573,New: A Zoom exec worked with the Chinese gover...,2020-12-18T22:46:13+00:00,en,,,0,,1,0


In [54]:
Tweet(embed_str("1330677864555896834"))

In [55]:
# I remember seeing this tweet, and  then I added `tvnz.co.nz` to domains containing news

## Exploratory Data Analysis (EDA)

In [56]:
# which accounts post most news?
# which accounts post most?




## Annotation

## Labeling
## Spliting
## Preprocessing


# Modeling
## Random
## Rule-based
## Simple ML
## CNN
## RNN
## Transformers


# Experiment tracking
## Training
## Tracking
## Viewing
## Loading

# Optimization