# Imports

In [1]:
import pandas as pd

# Load Data

In [2]:
def load_csv(filename, clickbait):
    """Load a CSV of posts and return cleaned, de-duplicated headlines.

    The file must provide ``url``, ``title``, ``created_utc`` and ``score``
    columns. Titles are lower-cased, stripped of surrounding whitespace and of
    the publisher tags listed below; rows whose title contains
    ``savedyouaclick`` are dropped, as are duplicate urls and duplicate titles.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV to read.
        clickbait: Label written to the ``clickbait`` column of every row.

    Returns:
        DataFrame with the columns ``created_utc`` (converted with
        ``pd.to_datetime(..., unit='s')``), ``title``, ``score`` and
        ``clickbait``. The row index of the source file is kept (not reset).
    """
    df = pd.read_csv(filename).drop_duplicates(subset='url')
    # Missing titles are NaN floats, not strings; drop them before the string
    # clean-up so the per-title helpers only ever see text.
    df = df.dropna(subset=['title'])

    # Work on a standalone Series and write it back once. Calling
    # replace(..., inplace=True) on df['title'] mutates a temporary selection,
    # which does not update df when pandas Copy-on-Write is active.
    title = df['title'].str.lower().str.strip()

    # Raw strings: the patterns contain backslashes such as "\[" that are not
    # valid Python string escapes.
    regex_prefixes = [r'^(?:\[)(.*)(?:\])(.?)-', r'https:\/\/url4ever.com(.*)$']
    for reg in regex_prefixes:
        title = title.replace(reg, '', regex=True)

    # Order matters: longer names must be tried before their shorter forms
    # (e.g. 'pbs newshour' before 'pbs').
    news_orgs = ['espn', 'bbc', 'huffpo', 'nyt', 'reuters', 'ap news', 'associated press', 'pbs newshour', 'pbs', 'cnn', 'sydney morning herald', 'chicago tribune', 'chicago sun-times', 'la times', 'iol', 'al jazeera', 'washington post', 'toronto star', 'telegraph', 'bbc', 'south china morning post', 'npr', 'guardian', 'the japan times', 'abc', 'fox', 'al arabiya', 'nbc', 'irish times', 'manila bulletin', 'nz herald', 'times of india', 'sana', 'usatoday', 'nypost']
    for org in news_orgs:
        org_patterns = (
            r'\|(.?)' + org + r'(.*)$',  # trailing "| org ..."
            r'-(.?)' + org + r'(.*)$',   # trailing "- org ..."
            '^' + org + '(.*):',         # leading "org ...:"
        )
        for pattern in org_patterns:
            title = title.replace(pattern, '', regex=True)

    title = title.str.strip()

    ap_prefixes = ['[ap]', '(ap:)', '[ap news]']
    title = title.apply(lambda x: presplit(x, ap_prefixes))
    ap_postfixes = ['| february 24, 2021', '[ap]', '(ap)']
    title = title.apply(lambda x: postsplit(x, ap_postfixes))

    other_prefixes = ['[salon]', '[forbes]', 'news wrap:', 'authorities:', 'officials:', 'news brief:', 'watch:', 'watch live:', 'breaking:', 'the latest:']
    title = title.apply(lambda x: presplit(x, other_prefixes))
    other_postfixes = []
    title = title.apply(lambda x: postsplit(x, other_postfixes))

    df = df.assign(title=title)
    df = df[~df['title'].str.contains('savedyouaclick', regex=False)]

    df = df.drop_duplicates(subset='title')[['created_utc', 'title', 'score']]
    return df.assign(
        created_utc=pd.to_datetime(df['created_utc'], unit='s'),
        clickbait=clickbait,
    )

def presplit(title, prefixes):
    """Remove the first matching prefix from the start of ``title``.

    Args:
        title: Headline text. Non-string values (e.g. NaN from pandas) are
            returned unchanged.
        prefixes: Iterable of literal prefixes, tried in order. Empty
            prefixes are ignored.

    Returns:
        ``title`` without the first prefix it starts with, stripped of
        surrounding whitespace; ``title`` unchanged if no prefix matches.
    """
    if not isinstance(title, str):
        return title

    for pre in prefixes:
        # An empty prefix "matches" every string but has nothing to remove.
        if pre and title.startswith(pre):
            # Slice off only the leading occurrence. split(pre)[1] would also
            # cut at any later occurrence of the prefix and lose the tail.
            return title[len(pre):].strip()

    return title
    
def postsplit(title, postfixes):
    """Remove the first matching postfix from the end of ``title``.

    Args:
        title: Headline text. Non-string values (e.g. NaN from pandas) are
            returned unchanged.
        postfixes: Iterable of literal postfixes, tried in order. Empty
            postfixes are ignored.

    Returns:
        ``title`` without the first postfix it ends with, stripped of
        surrounding whitespace; ``title`` unchanged if no postfix matches.
    """
    if not isinstance(title, str):
        return title

    for post in postfixes:
        # Skip empty postfixes: title[:-0] would be the empty string.
        if post and title.endswith(post):
            # Slice off only the trailing occurrence. split(post)[0] would cut
            # at the first occurrence and drop everything after it.
            return title[:-len(post)].strip()

    return title

In [3]:
# Fixed seed so the balanced sample is identical on every Restart & Run All.
SAMPLE_SEED = 42

# load clickbait sources
clickbait = load_csv('data/clickbait.csv.gz', 1)
# keep only titles that contain '|' and retain the text before the first '|'
clickbait = clickbait[clickbait['title'].str.contains('|', regex=False)]
clickbait = clickbait.assign(title=clickbait['title'].str.split('|').str[0].str.strip())
clickbait = clickbait.drop_duplicates(subset='title')
# A title that started with '|' is now an empty string, which notna() alone
# would let through, so reject empty titles as well.
clickbait = clickbait[clickbait['title'].notna() & (clickbait['title'] != '')]
clickbait = clickbait[clickbait['score'] > 10]
#display(clickbait)

# load non-clickbait sources
apnews = load_csv('data/apnews.csv.gz', 0)
npr = load_csv('data/npr.csv.gz', 0)
pbs = load_csv('data/pbs.csv.gz', 0)
reuters = load_csv('data/reuters.csv.gz', 0)
news = pd.concat([apnews, reuters, npr, pbs], ignore_index=True)
#display(news)

# ensure balance between clickbait and non-clickbait entries:
# both classes are down-sampled to the size of the smaller one
n_per_class = min(clickbait.shape[0], news.shape[0])
clickbait = clickbait.sample(n_per_class, random_state=SAMPLE_SEED, ignore_index=True)
news = news.sample(n_per_class, random_state=SAMPLE_SEED, ignore_index=True)
#display(clickbait)
#display(news)

In [4]:
# combine clickbait and news datasets
data = pd.concat([clickbait, news], ignore_index=True)
data = data[['title', 'clickbait']]
# shuffle the rows; the fixed seed keeps the row order of the written file
# the same across re-runs
data = data.sample(frac=1, random_state=42, ignore_index=True)
data.to_csv("data/filtered.csv.gz", compression='gzip', index=False)

display(data)

Unnamed: 0,title,clickbait
0,jon hamm just came clean about his condition a...,1
1,forest whitaker's is playing a fan favorite ch...,1
2,"why german divisions remain, 30 years after fa...",0
3,"kim jong un is dangerous and a risk-taker, but...",0
4,nearly all mass shooters have this one thing i...,1
...,...,...
20745,how school administrators and parents are find...,0
20746,nasa released photos of jupiter and everyone i...,1
20747,"typhoon in philippines leaves 13 missing, disp...",0
20748,twitter in standoff with india's government ov...,0


In [5]:
#for i in range(data.shape[0]):
#    if "[" in data['title'].iloc[i]:
#        print(data['title'].iloc[i] + '\n')