# Imports

In [None]:
import pandas as pd

# Load Data

In [None]:
def load_csv(filename, clickbait, threshold):
    """Load a CSV of posts and return cleaned, labelled headlines.

    The file must provide the columns ``url``, ``title``, ``subreddit``,
    ``score`` and ``created_utc`` (all are read below).

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        clickbait: Label written to the ``clickbait`` column of every row.
        threshold: Minimum ``score`` (inclusive) a row needs to be kept.

    Returns:
        A DataFrame with the columns ``created_utc`` (converted to datetime
        from epoch seconds), ``title`` (lower-cased and cleaned) and
        ``clickbait``.
    """
    df = pd.read_csv(filename)
    df.drop_duplicates(subset=['url', 'title'], inplace=True)
    df['title'] = df['title'].str.lower().str.strip()
    df['subreddit'] = df['subreddit'].str.lower().str.strip()

    # Drop posts from the excluded subreddits, and rows without a usable
    # title (missing titles would otherwise crash presplit/postsplit).
    # .copy() detaches the result so later column assignments are not made
    # on a slice of the original frame.
    subreddits = ['politics', 'worldnews', 'news', 'nottheonion', 'upliftingnews']
    keep = df['title'].notna() & ~df['subreddit'].isin(subreddits)
    df = df[keep].copy()

    # Work on a standalone Series and reassign instead of calling
    # ``df['title'].replace(..., inplace=True)``: the chained in-place form
    # is deprecated and has no effect when pandas Copy-on-Write is enabled.
    titles = df['title'].replace('&amp;', 'and', regex=True)

    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown escape sequences through.
    regs = [r'\.', ',', r'^\[ap news\]', r'^\(ap:\)', r'^\[ap\]', r'^(?:\[)(.*)(?:\])(.?)-', r'https:\/\/url4ever.com(.*)$']
    for reg in regs:
        titles = titles.replace(reg, '', regex=True)
    titles = titles.str.strip()

    news_orgs = ['forbes', 'cnbc', 'espn', 'bbc', 'huffpo', 'nyt', 'reuters', 'ap news', 'associated press', 'pbs newshour', 'pbs', 'cnn', 'sydney morning herald', 'chicago tribune', 'chicago sun-times', 'la times', 'iol', 'al jazeera', 'washington post', 'toronto star', 'telegraph', 'bbc', 'south china morning post', 'npr', 'guardian', 'the japan times', 'abc', 'fox', 'al arabiya', 'nbc', 'irish times', 'manila bulletin', 'nz herald', 'times of india', 'ap', 'sana', 'usatoday', 'nypost']
    # Per organisation, applied in this order: "| org ...", ": org ...",
    # "- org ..." attributions up to the end of the title, then
    # "org ...:" and "breaking org:" attributions at the start.
    org_templates = (
        r'\|(.?){org}(.*)$',
        r':(.?){org}(.*)$',
        r'-(.?){org}(.*)$',
        r'^{org}(.*):',
        r'^breaking(.?){org}(.?):',
    )
    for org in news_orgs:
        for template in org_templates:
            titles = titles.replace(template.format(org=org), '', regex=True)
    titles = titles.str.strip()

    other_prefixes = ['news:', 'exclusive:', 'breaking news:', 'the indicator:', 'opinion:', 'study:', 'poll:', 'special report:', 'report:', 'analysis:', 'alert:', 'fact check:', 'interview:', '[salon]', '[forbes]', 'news wrap:', '[national] -', 'authorities:', 'officials:', 'news brief:', 'watch:', 'watch live:', 'breaking:', 'the latest:']
    titles = titles.apply(lambda x: presplit(x, other_prefixes))
    other_postfixes = ['| february 24, 2021', '[ap]', '(ap)', ': coronavirus updates']
    titles = titles.apply(lambda x: postsplit(x, other_postfixes))
    df['title'] = titles

    # Discard titles mentioning 'savedyouaclick' and titles that ended up
    # empty after cleaning. na=True makes any missing value count as a
    # match so it is discarded as well.
    mentions_sub = df['title'].str.contains('savedyouaclick', regex=False, na=True)
    df = df[~mentions_sub & (df['title'] != '')]

    # NOTE(review): duplicates are removed before the score filter, so the
    # first occurrence of a title is the one tested against the threshold.
    df = df.drop_duplicates(subset='title')
    df = df[df['score'] >= threshold]
    df = df[['created_utc', 'title']].copy()
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['clickbait'] = clickbait

    return df

def presplit(title, prefixes):
    """Remove the first matching prefix from the start of ``title``.

    Args:
        title: The headline string to clean.
        prefixes: Candidate prefixes, tested in order; only the first one
            that ``title`` starts with is removed.

    Returns:
        The title without the matched prefix, stripped of surrounding
        whitespace, or ``title`` unchanged when no prefix matches.
    """
    for pre in prefixes:
        # Skip empty prefixes: every string "starts with" '' and there is
        # nothing to remove.
        if pre and title.startswith(pre):
            # Slice the prefix off rather than using ``split(pre)[1]``,
            # which would also cut the title at any later occurrence of
            # the same text.
            return title[len(pre):].strip()

    return title
    
def postsplit(title, postfixes):
    """Remove the first matching postfix from the end of ``title``.

    Args:
        title: The headline string to clean.
        postfixes: Candidate postfixes, tested in order; only the first one
            that ``title`` ends with is removed.

    Returns:
        The title without the matched postfix, stripped of surrounding
        whitespace, or ``title`` unchanged when no postfix matches.
    """
    for post in postfixes:
        # Skip empty postfixes: every string "ends with" '' and there is
        # nothing to remove.
        if post and title.endswith(post):
            # Slice the postfix off rather than using ``split(post)[0]``,
            # which would cut the title at the FIRST occurrence of the
            # text instead of the trailing one.
            return title[:len(title) - len(post)].strip()

    return title

In [None]:
# load clickbait sources
clickbait = load_csv('data/clickbait.csv.gz', 1, 5)
# Keep only titles that contain a '|' and retain the text before the first
# '|'. .copy() detaches the filtered frame so the column assignment below is
# not made on a slice of the previous frame (SettingWithCopyWarning).
clickbait = clickbait[clickbait['title'].str.contains('|', regex=False)].copy()
clickbait['title'] = clickbait['title'].str.split('|').str[0].str.strip()
clickbait = clickbait[clickbait['title'] != '']
# Trimming can make previously distinct titles identical, so dedupe again.
clickbait = clickbait.drop_duplicates(subset='title')
#display(clickbait)

# load non-clickbait sources
apnews = load_csv('data/apnews.csv.gz', 0, 0)
npr = load_csv('data/npr.csv.gz', 0, 0)
pbs = load_csv('data/pbs.csv.gz', 0, 0)
reuters = load_csv('data/reuters.csv.gz', 0, 0)
news = pd.concat([apnews, reuters, npr, pbs], ignore_index=True)
#display(news)

# ensure balance between clickbait and non-clickbait entries:
# randomly down-sample both sets to the size of the smaller one
n_per_class = min(clickbait.shape[0], news.shape[0])
clickbait = clickbait.sample(n_per_class, ignore_index=True)
news = news.sample(n_per_class, ignore_index=True)
#display(clickbait)
#display(news)

In [None]:
# combine clickbait and news datasets, then shuffle all rows
# (sample(frac=1) returns every row in random order with a fresh index)
data = (
    pd.concat([clickbait, news], ignore_index=True)
    .sample(frac=1, ignore_index=True)
)
#display(data)

In [None]:
# write the combined dataset as a gzip-compressed CSV without the index column
data.to_csv(
    'data/filtered.csv.gz',
    compression="gzip",
    index=False,
)