# Ejercicio 2

## Leer datos

In [74]:
import pandas as pd
import numpy as np
import operator

# news_data = pd.read_excel('datasets/Noticias_argentinas.xlsx', na_values=['NA'], usecols="A:D")
news_data = pd.read_excel('datasets/Noticias_argentinas.xlsx')

news_data = news_data.loc[news_data['categoria'].notnull()]

In [69]:
import numpy as np
EXPECTED_BY_CATEGORY= {
    'Internacional': 3850,
    'Nacional': 3860,
    'Destacadas': 3859,
    'Deportes': 3855,
    'Salud': 3840,
    'Ciencia y Tecnologia': 3856,
    'Entretenimiento': 3850,
    'Economia': 3850,
    'Noticias destacadas': 133819
}

print(len(news_data[news_data['categoria'] == 'Nacional']))
for k,v in EXPECTED_BY_CATEGORY.items():
    print(f"Categoria: {k}")
    actual = len(news_data[news_data['categoria'] == k])
    print(f"Expected: {v}, Actual: {actual}, Diff: {abs(actual -v)}")

# Total da raro pero el resto como debe
print("Total")
print(f"Expected: {EXPECTED_TOTAL}, Actual: {len(news_data)}, Diff: {abs(len(news_data) - EXPECTED_TOTAL)}")

3860
Categoria: Internacional
Expected: 3850, Actual: 3850, Diff: 0
Categoria: Nacional
Expected: 3860, Actual: 3860, Diff: 0
Categoria: Destacadas
Expected: 3859, Actual: 3859, Diff: 0
Categoria: Deportes
Expected: 3855, Actual: 3855, Diff: 0
Categoria: Salud
Expected: 3840, Actual: 3840, Diff: 0
Categoria: Ciencia y Tecnologia
Expected: 3856, Actual: 3856, Diff: 0
Categoria: Entretenimiento
Expected: 3850, Actual: 3850, Diff: 0
Categoria: Economia
Expected: 3850, Actual: 3850, Diff: 0
Categoria: Noticias destacadas
Expected: 133819, Actual: 133819, Diff: 0
Total
Expected: 160789, Actual: 164639, Diff: 3850


In [65]:
class Category:

    def __init__(self, name):
        self.name = name
        self.headline_count = 0
        self.probability = 0
        self.words = {}
        self.relative_frequencies = {}
        self.probability_of_no_data = 0

    def add_words_from_headline(self, headline):
        
        self.headline_count += 1

        keys = headline.lower().split()
        key_set = list(set(keys))

        for key in key_set:
        	# processed[key] = keys.count(key)
        	if key not in self.words.keys():
	            self.words[key] = 0
	        self.words[key] = self.words.get(key) + keys.count(key)  


    def learn(self, total_headlines):
	    # probability of each word
	    k = len(self.words.keys())
	    total = sum(self.words.values())
	    self.probability = self.headline_count / total_headlines
	    self.probability_of_no_data = 1 / (total + k)

	    for word, cardinal in self.words.items():
	    	self.relative_frequencies[word] = (cardinal + 1) / (total + k)


    def get_productorial(self, headline):
        words = headline.lower().split()

        prod = 1

        for word in words:
            prod = prod * self.probability_of_no_data if word not in self.relative_frequencies.keys() \
                else self.relative_frequencies.get(word)

        # for word in words:
        # 	if word not in self.relative_frequencies.keys():
        # 		prod = prod * self.probability_of_no_data
        # 	else:
        # 		prod = prod * self.relative_frequencies.get(word)

        return prod * self.probability


class Bayes:

    def __init__(self, df):
        self.categories = {}
        self.confusion_matrix = {}

        for headline, category in zip(df['titular'], df['categoria']):
	        if category not in self.categories.keys():
	            self.categories[category] = Category(category)
	        self.categories[category].add_words_from_headline(headline)

    def learn(self, total_headlines):
    	for category in self.categories.values():
    		category.learn(total_headlines)
    	print("Bayes learning finished.")


    def classify(self, testing_data):

        self.confusion_matrix = {
            category: {category: 0 for category in self.categories.keys()}
            for category in self.categories.keys()
        }

        for i in range(len(testing_data)):
            row = testing_data.iloc[i]
            headline = row.titular
            category = row.categoria

            productorial = {name: category.get_productorial(headline) for name, category in self.categories.items()}
            
            winner = max(productorial.items(), key = operator.itemgetter(1))[0]

            self.confusion_matrix[category][winner] += 1


In [66]:
def split_dataframe(df, percentage):

    msk = np.random.rand(len(df)) < percentage
    training_data = df[msk]
    testing_data = df[~msk]

    return training_data, testing_data


In [71]:

categories_filter = [
    'Salud',
    'Deportes',
    'Economia',
    'Ciencia y Tecnologia',
    'Entretenimiento',
    'Nacional',
    'Internacional'
]

news_data = news_data.loc[news_data['categoria'].isin(categories_filter)]

print(len(news_data.index))

26961


In [72]:
training_data, testing_data = split_dataframe(news_data, percentage = 0.99)

In [86]:
bayes = Bayes(training_data)

bayes.learn(len(news_data.index))

bayes.classify(testing_data)
confusion_matrix = bayes.confusion_matrix
pd.DataFrame.from_dict(confusion_matrix, orient='index')

Bayes learning finished.


Unnamed: 0,Nacional,Deportes,Salud,Ciencia y Tecnologia,Entretenimiento,Economia,Internacional
Nacional,24,1,0,3,1,1,1
Deportes,1,37,4,0,0,4,2
Salud,3,0,35,0,0,3,1
Ciencia y Tecnologia,1,0,0,27,0,1,0
Entretenimiento,1,0,1,3,27,0,3
Economia,0,0,0,2,0,36,4
Internacional,6,2,2,0,1,3,22


## Metricas

In [95]:
def true_positive(matrix, class_name):
    return matrix[class_name][class_name]

def true_negative(matrix, class_name):
    count = 0
    for i in matrix.keys():
        if i != class_name:
            for j in matrix[i].keys():
                if j != class_name:
                    count += matrix[i][j]
    return count

def false_negative(matrix, class_name):
    count = 0
    for i in matrix[class_name].keys():
        if i != class_name:
            count += matrix[i][class_name]
    return count

def false_positive(matrix, class_name):
    count = 0
    for i in matrix[class_name].keys():
        if i != class_name:
            count += matrix[class_name][i]
    return count

def accuracy(tp,tn,fp, fn):
    return (tp+tn)/(tp+tn+fn+fp)

def precision(tp,fp):
    return tp/(tp+fp)

def recall(tp,fn):
    return tp/(tp+fn)

def f1_score(p, r):
    return (2*p*r)/(p+r)

def tp_rate(tp, fn):
    return tp/(tp+fn)

def fp_rate(fp, tn):
    return fp/(fp+tn)


def metrics_table(matrix):
    to_return={}
    
    for k in matrix.keys():
        to_return[k]={}
        to_return[k]['tp'] = tp = true_positive(matrix, k)
        to_return[k]['tn'] = tn = true_negative(matrix, k)
        to_return[k]['fp'] = fp = false_positive(matrix, k)
        to_return[k]['fn'] = fn = false_negative(matrix, k)
        to_return[k]['accuracy'] = accuracy(tp, tn, fp, fn)
        to_return[k]['precision']= p = precision(tp, fp)
        to_return[k]['recall'] = r = recall(tp, fn)
        to_return[k]['f1_score'] = f1_score(p, r)
        to_return[k]['tp_rate'] = tp_rate(tp, fn)
        to_return[k]['fp_rate'] = fp_rate(fp, tn)
    return to_return

metrics = metrics_table(confusion_matrix)
pd.DataFrame.from_dict(metrics, orient='index')

Unnamed: 0,tp,tn,fp,fn,accuracy,precision,recall,f1_score,tp_rate,fp_rate
Nacional,24,220,7,12,0.927757,0.774194,0.666667,0.716418,0.666667,0.030837
Deportes,37,212,11,3,0.946768,0.770833,0.925,0.840909,0.925,0.049327
Salud,35,214,7,7,0.946768,0.833333,0.833333,0.833333,0.833333,0.031674
Ciencia y Tecnologia,27,226,2,8,0.961977,0.931034,0.771429,0.84375,0.771429,0.008772
Entretenimiento,27,226,8,2,0.961977,0.771429,0.931034,0.84375,0.931034,0.034188
Economia,36,209,6,12,0.931559,0.857143,0.75,0.8,0.75,0.027907
Internacional,22,216,14,11,0.904943,0.611111,0.666667,0.637681,0.666667,0.06087


## ROC