# Imports

In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import time

#nltk.config_megam('megam.opt')
# Download each NLTK resource used by the cells below
# (tokenizer, stopword list and WordNet lemmatizer data).
for resource in ('omw-1.4', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load Data

In [None]:
# Load the training data; later cells read its 'title' and 'clickbait' columns.
filtered = pd.read_csv('data/filtered.csv.gz')
#display(filtered)

# Prepare Data

In [None]:
# English stopword list from NLTK; read as a global by filter_stopwords.
stopwords = nltk.corpus.stopwords.words("english")

def filter_stopwords(wordlist):
    """Return a new list with the words of `wordlist` that are not stopwords.

    Order is preserved. Membership is checked against the module-level
    `stopwords` list.
    """
    return [token for token in wordlist if token not in stopwords]

def lemmatize_words(wordlist):
    """Return a new list holding the WordNet lemma of each word in `wordlist`.

    Order and length are preserved; the input is not modified.
    """
    # One lemmatizer serves the whole list; creating a fresh instance for
    # every word is loop-invariant work.
    lemmatize = nltk.stem.wordnet.WordNetLemmatizer().lemmatize

    return [lemmatize(word) for word in wordlist]

def prepare_titles(data):
    """Normalise, tokenise and lemmatise the 'title' column of `data`.

    Titles are lower-cased and stripped of surrounding whitespace. Rows whose
    title is missing or empty after stripping are dropped. Each remaining
    title is tokenised with nltk.word_tokenize and passed through
    lemmatize_words.

    The work is done on a copy, so the DataFrame passed in by the caller is
    left unchanged.

    Returns:
        A new DataFrame with the surviving rows, in which 'title' holds the
        list of lemmatised tokens for each row.
    """
    titles = data['title'].str.lower().str.strip()

    # NaN != '' evaluates to True, so missing titles need their own test.
    keep = titles.notna() & (titles != '')

    # Copy the surviving rows so the assignment below neither writes into a
    # slice of `data` nor alters the caller's frame.
    prepared = data.loc[keep].copy()

    tokens = titles[keep].apply(nltk.word_tokenize)
    #tokens = tokens.apply(filter_stopwords) # lowers accuracy
    prepared['title'] = tokens.apply(lemmatize_words)

    return prepared

In [None]:
def find_feature(wordlist):
    """Build the feature dict for one tokenised title.

    Every word in the global `all_features` becomes a key whose value is
    True when that word occurs in `wordlist` and False otherwise.
    """
    return {word: word in wordlist for word in all_features}

def create_featuresets(data, train, num_features=100):
    """Turn tokenised titles into feature dicts.

    Args:
        data: DataFrame whose 'title' column holds lists of tokens. When
            `train` is true it must also have a 'clickbait' column.
        train: When true, the vocabulary is rebuilt from `data` and stored in
            the global `all_features`, and the labels are returned alongside
            the features. When false, the existing `all_features` is reused,
            so a training call must have happened first.
        num_features: Number of most frequent words kept as features. Only
            used when `train` is true.

    Returns:
        An object ndarray with one row per row of `data`. Column 0 holds the
        feature dict produced by find_feature; when `train` is true, column 1
        holds the 'clickbait' label.
    """
    global all_features

    if train:
        frequencies = nltk.FreqDist(
            word for title in data['title'] for word in title
        )
        # FreqDist iterates in first-seen order, so slicing list(FreqDist)
        # would pick the earliest words rather than the most frequent ones.
        all_features = [
            word for word, _ in frequencies.most_common(num_features)
        ]
        columns = ['title', 'clickbait']
    else:
        columns = ['title']

    featuresets = np.array(data[columns])

    # vectorizing using numpy shaves off about 40% of the processing time for this section
    # otypes is given explicitly so that empty input is accepted and
    # find_feature is not called an extra time just to infer the output type.
    find_vector = np.vectorize(find_feature, otypes=[object])
    featuresets[:, 0] = find_vector(featuresets[:, 0])

    return featuresets

In [None]:
def prepare_data(data, train, num_feaures=100):
    """Run the preprocessing pipeline on `data`.

    The titles are first cleaned by prepare_titles, then converted into
    featuresets by create_featuresets, to which `train` and `num_feaures`
    are forwarded unchanged. The result of create_featuresets is returned.
    """
    titles_ready = prepare_titles(data)

    # used for error analysis below
    #global temp_datacopy
    #temp_datacopy = titles_ready.copy()

    return create_featuresets(titles_ready, train, num_feaures)

In [None]:
# Build featuresets from the 500 most useful words, then split them in
# their existing order: the first 70% for training, the rest for testing.
train_ratio = 0.7
prepared = prepare_data(filtered, True, 500)
split_at = int(len(prepared) * train_ratio)
train = prepared[:split_at]
test = prepared[split_at:]

# Train Models

In [None]:
# Start time for the (commented-out) duration print below.
time0 = time.time()
# Fit a Naive Bayes classifier on the training rows of (features, label).
nbclassifier = nltk.NaiveBayesClassifier.train(train)
#print('time: {:.2f}s'.format(time.time() - time0))

In [None]:
# trains too slowly
#time0 = time.time()
#meclassifier_iis = nltk.MaxentClassifier.train(train, algorithm='iis', max_iter=5)
#print('\ntraining duration: {:.2f}s'.format(time.time() - time0))

# trains too slowly
#time0 = time.time()
#meclassifier_gis = nltk.MaxentClassifier.train(train, algorithm='gis', max_iter=5)
#print('\ntraining duration: {:.2f}s'.format(time.time() - time0))

# much less accurate than NaiveBayesClassifier
#time0 = time.time()
#meclassifier_megam = nltk.MaxentClassifier.train(train, algorithm='megam')
#print('\ntraining duration: {:.2f}s'.format(time.time() - time0))

# Test Models

In [None]:
#time0 = time.time()
# Accuracy of the Naive Bayes classifier on the held-out test rows.
# The value is stored but not displayed while the print below is commented out.
accuracy = nltk.classify.accuracy(nbclassifier, test)
#print('accuracy: {:.4f}, time: {:.2f}s'.format(accuracy, time.time() - time0))

In [None]:
# used for error analysis

#errors = []
#index = 0

#for (words, clickbait) in train:
#    prediction = nbclassifier.classify(words)
    
#    if prediction != clickbait:
#        errors.append((clickbait, temp_datacopy['title'].iloc[index], filtered['title'].iloc[index]))

#    index += 1
    
#for e in errors:
#    if e[0] == 1:
#        print('clickbait: yes')
#    else:
#        print('clickbait: no')

#    print(e[1])
#    print(e[2] + '\n')

In [None]:
# trains too slowly
#time0 = time.time()
#print(nltk.classify.accuracy(meclassifier_iis, test))
#print('\ntesting duration: {:.2f}s'.format(time.time() - time0))

# trains too slowly
#time0 = time.time()
#print(nltk.classify.accuracy(meclassifier_gis, test))
#print('\ntesting duration: {:.2f}s'.format(time.time() - time0))

# much less accurate than NaiveBayesClassifier
#time0 = time.time()
#print(nltk.classify.accuracy(meclassifier_megam, test))
#print('\ntesting duration: {:.2f}s'.format(time.time() - time0))

# Classify

In [None]:
# Subreddit dumps to classify; the r/news dump is currently left out.
files = [
    'data/nottheonion_lg.csv.gz',
    'data/politics_lg.csv.gz',
    'data/upliftingnews_lg.csv.gz',
    'data/worldnews_lg.csv.gz',
    #'data/news_lg.csv.gz',
]

In [None]:
# Read every listed dump into its own DataFrame, in the order of `files`.
dfs = [pd.read_csv(file) for file in files]

In [None]:
# Reduce every dump to posts with a score above 50 created after 2013,
# keeping only the creation time (as a datetime) and the title.
filtered_dfs = []

for df in dfs:
    # .copy() yields an independent frame, so the column assignment below
    # is not a write into a slice of the original dump.
    df = df.loc[df['score'] > 50, ['created_utc', 'title']].copy()

    # created_utc is stored either as a formatted string or as epoch
    # seconds. An empty frame has no first row to inspect, so it falls
    # through to the epoch branch instead of raising IndexError.
    first_created = df['created_utc'].iloc[0] if len(df) else None

    if isinstance(first_created, str):
        df['created_utc'] = pd.to_datetime(df['created_utc'], format='%Y-%m-%d %H:%M:%S')
    else:
        df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

    df = df[df['created_utc'].dt.year > 2013]

    filtered_dfs.append(df)

In [None]:
# One featureset array per filtered dump, built with the vocabulary that
# the earlier training call stored (train=False reuses it).
featuresets = [prepare_data(df, False) for df in filtered_dfs]

In [None]:
def custom_classify(featureset):
    """Return the label the global `nbclassifier` assigns to `featureset`."""
    return nbclassifier.classify(featureset)

In [None]:
# Element-wise wrapper so custom_classify can be applied to a whole array
# of feature dicts in one call.
vector_classify = np.vectorize(custom_classify)

In [None]:
# Copy every frame: list.copy() alone is shallow and would add the
# prediction column to the frames held in filtered_dfs as well.
df_predictions = [df.copy() for df in filtered_dfs]

for i, (frame, features) in enumerate(zip(df_predictions, featuresets)):
    # NOTE(review): this assumes prepare_data kept exactly one feature row
    # per row of `frame`; if it dropped rows (missing or empty titles) the
    # assignment below raises a length-mismatch error - confirm upstream.
    predictions = vector_classify(np.asarray(features))

    # The features arrive as an (n, 1) column; flatten to one label per row.
    frame['nb_predictions'] = predictions.ravel()
    frame.to_csv(f'data/nltk_predictions_{i}.csv.gz', index=False, compression="gzip")

# Visualize Results

In [None]:
def visualize_results(data, thresh):
    """Compute the yearly share of titles predicted as clickbait.

    Args:
        data: DataFrame with a datetime 'created_utc' column and an
            'nb_predictions' column holding 0/1 labels. It is not modified.
        thresh: A year is reported only when it has more than `thresh`
            predictions in total.

    Returns:
        DataFrame indexed by year with one 'nb_ratio' column: the number of
        1-predictions divided by the number of all predictions in that year.
    """
    years = data['created_utc'].dt.year

    result_types = ['nb']
    result = pd.DataFrame()

    for rt in result_types:
        # crosstab fills absent year/label combinations with 0, and the
        # reindex guarantees both label columns exist. A year (or a whole
        # dataset) without any prediction of one class therefore yields a
        # ratio of 0 or 1 instead of NaN or a KeyError.
        counts = pd.crosstab(years, data[rt + '_predictions'])
        counts = counts.reindex(columns=[0, 1], fill_value=0)

        total = counts[0] + counts[1]
        ratio = (counts[1] / total).rename(rt + '_ratio')

        result = result.join(ratio[total > thresh].to_frame(), how='right')

    return result

In [None]:
# Draw one line per subreddit showing its yearly clickbait percentage.
plt.figure(figsize=(12,8))

for df in df_predictions:
    # Only years with more than 150 predictions are plotted.
    result = visualize_results(df, 150)
    # Ratios are fractions; scale to percent for the y axis.
    plt.plot(result * 100)

plt.title('Percentage of clickbait titles in selected news subreddits (2014-2021)')
# Legend entries are positional: they must follow the order of `files`.
plt.legend(['r/NotTheOnion', 'r/politics', 'r/UpliftingNews', 'r/worldnews'])
plt.xlabel('Year')
plt.ylabel('Percentage (%)')
plt.savefig('nltk_analysis.png')