# Imports

In [1]:
import nltk
import pandas as pd

nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\CJ\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CJ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Data

In [2]:
def load_csv(filename, clickbait):
    df = pd.read_csv(filename)
    df.drop_duplicates(subset='url', inplace=True)
    df['title'] = df['title'].str.lower().str.strip()
    df['title'] = df['title'].str.split('https://url4ever.com').str[0].str.strip()
    df['title'] = df['title'].str.split('- reuters').str[0].str.strip()
    
    autonp_prefixes = ['[economy] -', '[arts] -', '[op-ed] -', '[national] -', '[world] -', '[politics] -', '[local] -', '[video] -', '[sports] -', '[business] -', '[entertainment] -', '[tech] -', '[science] -', '[health] -']
    df['title'] = df['title'].apply(lambda x: presplit(x, autonp_prefixes))
    autonp_postfixes = ['| pbs', '| cnn', '| sydney morning herald', '| chicago tribune', '| chicago sun-times', 'la times', 'iol', '| al jazeera', '| washington post', '| toronto star', '| telegraph', '| bbc', '| south china morning post', '| npr', '| guardian', '| the japan times', '| abc', '| fox', '| al arabiya', '| nbc', '| irish times', '| manila bulletin', '| nz herald', '| times of india', '| sana', '| usatoday', '| nypost']
    df['title'] = df['title'].apply(lambda x: postsplit(x, autonp_postfixes))
    
    ap_prefixes = ['(ap:)', 'ap report:', 'ap investigation:', 'ap poll:', 'ap analysis:', 'ap sources:', 'ap source:', 'ap fact check:', '[ap news]', 'ap photos:', 'ap news:', 'ap:', 'ap interview:', 'the ap interview:']
    df['title'] = df['title'].apply(lambda x: presplit(x, ap_prefixes))
    ap_postfixes = ['- associated press', '| ap news', '| february 24, 2021', '[ap]', '(ap)']
    df['title'] = df['title'].apply(lambda x: postsplit(x, ap_postfixes))
    
    other_prefixes = ['news brief:', 'bbc:', 'watch:', 'reuters:', 'watch live:', 'breaking:', 'the latest:']
    df['title'] = df['title'].apply(lambda x: presplit(x, other_prefixes))
    other_postfixes = []
    df['title'] = df['title'].apply(lambda x: postsplit(x, other_postfixes))
    
    banned = ['reuters', 'pbs', 'npr', 'apnews', 'associated press', 'savedyouaclick']
    for term in banned:
        df = df[df['title'].str.contains(term, regex=False) == False]
    
    df = df[['created_utc', 'title', 'score']]
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['clickbait'] = clickbait
    
    return df

def presplit(title, prefixes):
    for pre in prefixes:
        if title.startswith(pre):
            return title.split(pre)[1].strip()
    
    return title
    
def postsplit(title, postfixes):
    for post in postfixes:
        if title.endswith(post):
            return title.split(post)[0].strip()
    
    return title

In [3]:
clickbait = load_csv('clickbait.csv.gz', 1)
clickbait = clickbait[clickbait['title'].str.contains('|', regex=False)]
clickbait['title'] = clickbait['title'].str.split('|').str[0].str.strip()
clickbait.drop_duplicates(subset='title', inplace=True)

#clickbait = clickbait[clickbait['score'] > 10]
#display(clickbait)

apnews = load_csv('apnews.csv.gz', 0)
apnews = apnews[apnews['title'].str.contains('ap poll', regex=False) == False]
apnews.drop_duplicates(subset='title', inplace=True)

reuters = load_csv('reuters.csv.gz', 0)
reuters.drop_duplicates(subset='title', inplace=True)

npr = load_csv('npr.csv.gz', 0)
npr.drop_duplicates(subset='title', inplace=True)

pbs = load_csv('pbs.csv.gz', 0)
pbs.drop_duplicates(subset='title', inplace=True)

news = pd.concat([apnews, reuters, npr, pbs], ignore_index=True)
#news = news[news['score'] > 10]
#display(news)

clickbait = clickbait.sample(min(clickbait.shape[0], news.shape[0]))
news = news.sample(min(clickbait.shape[0], news.shape[0]))

#display(clickbait)
#display(news)

In [4]:
data = pd.concat([clickbait, news], ignore_index=True)
data = data[data['score'] > 10]
data = data[['title', 'clickbait']]
data = data.sample(frac=1, ignore_index=True)
display(data)

Unnamed: 0,title,clickbait
0,how high can runaway helium balloons fly?,1
1,"guy pours salt on the beach, what comes out is...",1
2,what alex from target is doing now is unbeliev...,1
3,uno just officially declared this popular move...,1
4,look what happens when you shred toilet paper!,1
...,...,...
10747,8 reasons why men are so much in love with bre...,1
10748,busboy who held dying robert f. kennedy shares...,1
10749,you’ll never guess where dolly parton likes to...,1
10750,the least stressful jobs of 2015,1
