In [1]:
from Data import *
from Tools import *
from Naive_Bayes import *

import random
import time

### Load data

In [2]:
# Column of the dataset that holds the label (topic) each model has to predict.
topic_col = "Topic_2digit"

# Load the NYT front-page dataset. "Article_ID" is handed to get_data together
# with the CSV path -- presumably the column used as the DataFrame index
# (rows are later looked up by news id via data.loc); confirm in Data.get_data.
data = get_data(
    "./Boydstun_NYT_FrontPage_Dataset_1996-2006_0.csv",
    "Article_ID",
)

### N-fold validation

In [6]:
def get_error_model(data, selected_words, dict_features, news_per_topic, words_topic_dictionary, model, debug=False, topic_column=None):
    """Return the fraction of news in ``dict_features`` that ``model`` misclassifies.

    Parameters
    ----------
    data : DataFrame
        Frame indexed by news id; the real topic of a news item is read from
        ``data.loc[news_id, topic_column]``.
    selected_words, words_topic_dictionary
        Passed through untouched to ``model``.
    dict_features : dict
        Maps each news id to be evaluated to its features, which are handed
        to ``model``.
    news_per_topic : dict
        Maps each known topic to a count; passed to ``model`` and used to
        initialise the per-topic hit counters.
    model : callable
        Called as ``model(news_per_topic, words_topic_dictionary,
        selected_words, features)``; must return the predicted topic.
    debug : bool, optional
        When true, print the hit rate obtained for every topic.
    topic_column : str, optional
        Column of ``data`` holding the real topic. Defaults to the
        notebook-level ``topic_col`` variable.

    Returns
    -------
    float
        Number of wrong predictions divided by the number of evaluated news.

    Raises
    ------
    ValueError
        If ``dict_features`` is empty (no error rate can be computed).
    """
    if topic_column is None:
        # Fall back to the notebook-level setting, as the original code did.
        topic_column = topic_col
    if not dict_features:
        raise ValueError("dict_features is empty: there are no news to evaluate")

    error = 0
    hits = {topic: 0 for topic in news_per_topic}       # key: topic, value: number of hits
    evaluated = {topic: 0 for topic in news_per_topic}  # key: topic, value: number of evaluated news

    for n_id, features in dict_features.items():
        result_topic = model(news_per_topic, words_topic_dictionary, selected_words, features)
        # Single .loc lookup instead of chained indexing (row first, then column).
        real_topic = data.loc[n_id, topic_column]

        # .get() keeps the counters working for topics missing from news_per_topic.
        evaluated[real_topic] = evaluated.get(real_topic, 0) + 1
        if result_topic == real_topic:
            hits[real_topic] = hits.get(real_topic, 0) + 1
        else:
            error += 1

    if debug:
        for topic, count in evaluated.items():
            topic_hits = hits.get(topic, 0)
            # The rate is relative to the news actually evaluated for the topic,
            # not to news_per_topic, which may describe a different data split.
            hit_pct = topic_hits / count if count else 0.0
            print(f"Tópico:{topic} / Aciertos:{topic_hits} / Notícias:{count} ---> {hit_pct * 100:.2f}%")

    return error / len(dict_features)

In [7]:
def n_fold(data, n, model, top_n=15, seed=None):
    """Estimate the mean error of ``model`` with n-fold cross-validation.

    Every row of ``data`` is randomly assigned to one of ``n`` folds of
    (almost) equal size. Each fold is used once as the test set while the
    remaining rows are used to build the training dictionaries.

    Parameters
    ----------
    data : DataFrame
        Full dataset. It is not modified: the fold ids are stored in a
        ``"mod"`` column of a copy.
    n : int
        Number of folds; must satisfy ``2 <= n <= len(data)``.
    model : callable
        Classifier forwarded to ``get_error_model``.
    top_n : int, optional
        Number of most representative words selected per topic (default 15).
    seed : optional
        When given, the fold assignment is drawn from ``random.Random(seed)``
        so the split is reproducible; otherwise the global ``random`` state
        is used.

    Returns
    -------
    float
        Average of the per-fold error rates returned by ``get_error_model``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 2 or larger than the number of rows.
    """
    n_rows = len(data)
    if not 2 <= n <= n_rows:
        raise ValueError(f"n must be between 2 and the number of rows ({n_rows}), got {n}")

    # Balanced fold ids (0, 1, ..., n-1, 0, 1, ...) shuffled so rows land in a
    # random fold. Unlike taking `index value % n`, this does not require
    # integer index values and keeps fold sizes within one row of each other.
    shuffler = random if seed is None else random.Random(seed)
    fold_ids = [position % n for position in range(n_rows)]
    shuffler.shuffle(fold_ids)

    # assign() returns a new frame, so the caller's DataFrame gets no extra column.
    folded = data.assign(mod=fold_ids)

    total_error = 0
    for i in range(n):
        is_test = folded["mod"] == i
        data_test = folded[is_test]        # contains about 1/n of the data
        data_training = folded[~is_test]   # contains about (n-1)/n of the data

        # 1. build the auxiliary dictionaries from the training data
        news_per_topic = get_topic_news_count(data_training, topic_col)
        words_dictionary = get_word_count(data_training)
        words_topic_dictionary = get_word_topic_count(data_training)

        # 2. pick the 'top_n' most representative words of each topic and
        #    merge them into a single list (without repetition)
        top_words = get_top_N_words(data_training, words_dictionary, words_topic_dictionary, news_per_topic, top_n)
        selected_words = get_selected_words(top_words)

        # 3. build the features of each news item in the test data
        dict_features = create_features(data_test, selected_words)

        # 4. error of the given model on the test data
        total_error += get_error_model(data_test, selected_words, dict_features, news_per_topic, words_topic_dictionary, model)

    return total_error / n

### Naïve-Bayes model

In [8]:
# Cross-validate the Naive Bayes classifier and report wall-clock time and mean error.
n = 50  # number of folds

t_init = time.perf_counter()
error_n_fold = n_fold(data, n, get_naive_bayes_topic)
t_end = time.perf_counter()

elapsed_ms = (t_end - t_init) * 1000  # perf_counter() returns seconds
print("Processing time: %0.3f ms" % elapsed_ms)
print(f"ERROR {n}-fold: %.2f%%" % (error_n_fold * 100))

Processing time: 1677254.830 ms
ERROR 50-fold: 57.37%


### Random model

In [9]:
def get_random_topic(news_per_topic, * args):
    """Baseline model: return a topic chosen uniformly at random.

    Parameters
    ----------
    news_per_topic : dict
        Mapping whose keys are the candidate topics.
    *args
        Ignored. Accepted so the function can be called with the same
        arguments as the other models handed to ``n_fold``.

    Returns
    -------
    One of the keys of ``news_per_topic``.

    Raises
    ------
    IndexError
        If ``news_per_topic`` is empty.
    """
    # random.choice() indexes its argument, and a dict keys view is not
    # subscriptable in Python 3, so materialise the topics as a list first.
    return random.choice(list(news_per_topic))

In [10]:
# Cross-validate the random baseline and report wall-clock time and mean error.
n = 50  # number of folds

t_init = time.perf_counter()
error_n_fold = n_fold(data, n, get_random_topic)
t_end = time.perf_counter()

elapsed_ms = (t_end - t_init) * 1000  # perf_counter() returns seconds
print("Processing time: %0.3f ms" % elapsed_ms)
print(f"ERROR {n}-fold: %.2f%%" % (error_n_fold * 100))

Processing time: 650125.633 ms
ERROR 50-fold: 96.33%
