# Imports

In [52]:
# import nltk
import pandas as pd

# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load Data

In [53]:
def load_csv(filename, clickbait):
    df = pd.read_csv(filename)
    df.drop_duplicates(subset='url', inplace=True)
    df['title'] = df['title'].str.strip()
    df['title'] = df['title'].str.split('https://url4ever.com').str[0].str.strip()
    df['title'] = df['title'].str.split('- reuters').str[0].str.strip()
    
    autonp_prefixes = ['[economy] -', '[arts] -', '[op-ed] -', '[national] -', '[world] -', '[politics] -', '[local] -', '[video] -', '[sports] -', '[business] -', '[entertainment] -', '[tech] -', '[science] -', '[health] -']
    df['title'] = df['title'].apply(lambda x: presplit(x, autonp_prefixes))
    autonp_postfixes = ['| pbs', '| cnn', '| sydney morning herald', '| chicago tribune', '| chicago sun-times', 'la times', 'iol', '| al jazeera', '| washington post', '| toronto star', '| telegraph', '| bbc', '| south china morning post', '| npr', '| guardian', '| the japan times', '| abc', '| fox', '| al arabiya', '| nbc', '| irish times', '| manila bulletin', '| nz herald', '| times of india', '| sana', '| usatoday', '| nypost']
    df['title'] = df['title'].apply(lambda x: postsplit(x, autonp_postfixes))
    
    ap_prefixes = ['(ap:)', 'ap report:', 'ap investigation:', 'ap poll:', 'ap analysis:', 'ap sources:', 'ap source:', 'ap fact check:', '[ap news]', 'ap photos:', 'ap news:', 'ap:', 'ap interview:', 'the ap interview:']
    df['title'] = df['title'].apply(lambda x: presplit(x, ap_prefixes))
    ap_postfixes = ['- associated press', '| ap news', '| february 24, 2021', '[ap]', '(ap)']
    df['title'] = df['title'].apply(lambda x: postsplit(x, ap_postfixes))
    
    other_prefixes = ['news brief:', 'bbc:', 'watch:', 'reuters:', 'watch live:', 'breaking:', 'the latest:']
    df['title'] = df['title'].apply(lambda x: presplit(x, other_prefixes))
    other_postfixes = []
    df['title'] = df['title'].apply(lambda x: postsplit(x, other_postfixes))
    
    banned = ['ap poll', 'reuters', 'pbs', 'npr', 'apnews', 'associated press', 'savedyouaclick']
    for term in banned:
        df = df[df['title'].str.contains(term, regex=False) == False]
    
    df.drop_duplicates(subset='title', inplace=True)
    df = df[['created_utc', 'title', 'score']]
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['clickbait'] = clickbait
    
    return df

def presplit(title, prefixes):
    for pre in prefixes:
        if title.startswith(pre):
            return title.split(pre)[1].strip()
    
    return title
    
def postsplit(title, postfixes):
    for post in postfixes:
        if title.endswith(post):
            return title.split(post)[0].strip()
    
    return title

In [54]:
# load clickbait sources
clickbait = load_csv('clickbait.csv.gz', 1)
clickbait = clickbait[clickbait['title'].str.contains('|', regex=False)]
clickbait['title'] = clickbait['title'].str.split('|').str[0].str.strip()
clickbait.drop_duplicates(subset='title', inplace=True)
clickbait = clickbait[clickbait['title'].notna()]
clickbait = clickbait[clickbait['score'] > 5]
#display(clickbait)

# load non-clickbait sources
apnews = load_csv('apnews.csv.gz', 0)
npr = load_csv('npr.csv.gz', 0)
pbs = load_csv('pbs.csv.gz', 0)
reuters = load_csv('reuters.csv.gz', 0)
news = pd.concat([apnews, reuters, npr, pbs], ignore_index=True)
#display(news)

# ensure balance between clickbait and non-clickbait entries
clickbait = clickbait.sample(min(clickbait.shape[0], news.shape[0]), ignore_index=True)
news = news.sample(min(clickbait.shape[0], news.shape[0]), ignore_index=True)
#display(clickbait)
#display(news)

In [55]:
data = pd.concat([clickbait, news], ignore_index=True)
data = data[['title', 'clickbait']]
data = data.sample(frac=1, ignore_index=True)

data.to_csv("filtered.csv.gz", compression='gzip', index=False)

display(data)

Unnamed: 0,title,clickbait
0,Why Song of the South is the Movie Disney Does...,1
1,There's one good reason to update to macOS Hig...,1
2,He's Been Secretly Taking Pictures Of His Best...,1
3,Ukrainian sailors tried to block a Russian oli...,0
4,Death toll rises after Ida’s remnants hit Nort...,0
...,...,...
21713,"EXCLUSIVE U.S. asks Japan, China, others to co...",0
21714,SK Innovation to invest $4.3 bln in U.S. batte...,0
21715,"[National] - In ‘Scranton Lace,’ nostalgia for...",0
21716,Word Leaks Out About Hillary's Post-Election C...,1
