# FDS Final Project - Template
I'm dropping a template here for easy implementation.

In [1]:
import pandas as pd
import numpy as np

import pickle
import time

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

## Datasets

So here, we can prepare the datasets to test out the models. We can use several of them during our analysis and see how the models perform on different datasets. If there is no variation in the performance, then we may remove all but one before the final submission, or use the agreement as evidence of robustness.

I am intentionally not changing the columns for now, so you can see the data as it is. Later on, we will have one column of **text** and one column of **label**.

In [120]:
# Dataset 1 from Huffington Post (JSON Lines file: one record per line).
# NOTE: dropna() runs before the column selection, so a row is discarded when
# ANY column of the raw file is missing, not only the two columns kept here.
# drop=True discards the old index instead of inserting it as a throw-away
# column, and rename() returns a fresh frame, so the later in-place cast of
# `text` does not operate on a slice of the raw data.
huffpost = (
    pd.read_json('data/huff_post.json', lines=True)
    .dropna()
    .reset_index(drop=True)
    .loc[:, ['short_description', 'category']]
    .rename(columns={'short_description': 'text', 'category': 'label'})
)
huffpost.head(1)

Unnamed: 0,text,label
0,Health experts said it is too early to predict...,U.S. NEWS


In [121]:
# Dataset 2 from Bancolombia.
# NOTE: dropna() runs before the column selection, so rows with a missing
# value in any raw column are discarded.
# drop=True avoids inserting the old index as a throw-away column; rename()
# returns a fresh frame that is safe to modify in place later on.
bancolombia = (
    pd.read_csv('data/bancolombia.csv')
    .dropna()
    .reset_index(drop=True)
    .loc[:, ['news', 'Type']]
    .rename(columns={'news': 'text', 'Type': 'label'})
)
bancolombia.head(1)

Unnamed: 0,text,label
0,Durante el foro La banca articulador empresari...,Otra


In [122]:
# Dataset 3 from Folha de Sao Paulo.
# NOTE: dropna() runs before the column selection, so rows with a missing
# value in any raw column are discarded.
# drop=True avoids inserting the old index as a throw-away column; rename()
# returns a fresh frame that is safe to modify in place later on.
# The text column is already called `text`, so only `category` is renamed.
folha = (
    pd.read_csv('data/folha.csv')
    .dropna()
    .reset_index(drop=True)
    .loc[:, ['text', 'category']]
    .rename(columns={'category': 'label'})
)
folha.head(1)

Unnamed: 0,text,label
0,DA BBC BRASIL O trimestre de maio a julho des...,sobretudo


In [123]:
# Dataset 4 from BBC (noted by the authors as having no null values, so
# dropna() is expected to be a no-op here — not verified in this notebook).
# drop=True avoids inserting the old index as a throw-away column; rename()
# returns a fresh frame that is safe to modify in place later on.
bbc = (
    pd.read_csv('data/bbc.csv')
    .dropna()
    .reset_index(drop=True)
    .loc[:, ['content', 'category']]
    .rename(columns={'content': 'text', 'category': 'label'})
)
bbc.head(1)

Unnamed: 0,text,label
0,Quarterly profits at US media giant TimeWarner...,business


In [124]:
# Dataset 5 from NOS (noted by the authors as having no null values, so
# dropna() is expected to be a no-op here — not verified in this notebook).
# drop=True avoids inserting the old index as a throw-away column; rename()
# returns a fresh frame that is safe to modify in place later on.
nos = (
    pd.read_csv('data/nos.csv')
    .dropna()
    .reset_index(drop=True)
    .loc[:, ['content', 'category']]
    .rename(columns={'content': 'text', 'category': 'label'})
)
nos.head(1)

Unnamed: 0,text,label
0,De enige kerncentrale van Litouwen is oudjaars...,Buitenland


In [125]:
# Dataset 6 from Geeks4Geeks (noted by the authors as having no null values,
# so dropna() is expected to be a no-op here — not verified in this notebook).
# Only the article title is used as text.
# drop=True avoids inserting the old index as a throw-away column; rename()
# returns a fresh frame that is safe to modify in place later on.
geeks4geeks = (
    pd.read_csv('data/geeks4geeks.csv')
    .dropna()
    .reset_index(drop=True)
    .loc[:, ['title', 'category']]
    .rename(columns={'title': 'text', 'category': 'label'})
)
geeks4geeks.head(1)

Unnamed: 0,text,label
0,5 Best Practices For Writing SQL Joins,easy


In [126]:
# Collect the frames and their display names in the same order; the plotting
# and training cells pair the two lists up by position.
datasets = [huffpost, bancolombia, folha, bbc, nos, geeks4geeks]

# Human-readable names used for subplot titles and result-table row labels.
# A lower-case variant of this list used to be assigned first and was
# overwritten on the very next line; that dead assignment is dropped.
# NOTE(review): 'Sao Paolo' is a misspelling of 'Sao Paulo'. It is kept because
# the saved outputs and row labels already use this exact string.
dataset_names = ['Huffington Post', 'Bancolombia', 'Folha de Sao Paolo', 'BBC', 'NOS', 'Geeks4Geeks']

In [100]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import itertools

# One pie chart per dataset on a 2 x 3 grid, showing the class distribution and,
# as an in-plot title, the mean number of whitespace-separated words per text.
N_ROWS, N_COLS = 2, 3

fig = make_subplots(
    rows=N_ROWS, cols=N_COLS,
    specs=[[{'type': 'pie'} for _ in range(N_COLS)] for _ in range(N_ROWS)],
    subplot_titles=dataset_names)

for k, (dataset, name) in enumerate(zip(datasets, dataset_names)):
    # Row-major placement: dataset k goes to grid cell (k // N_COLS, k % N_COLS).
    row, col = divmod(k, N_COLS)

    # Count the labels once and reuse the result for both labels and values.
    class_counts = dataset['label'].value_counts()
    mean_words = dataset['text'].apply(lambda x: len(str(x).split())).mean()

    fig.add_trace(go.Pie(
        labels=class_counts.index,
        values=class_counts.values,
        name=name,
        textposition='inside', textinfo='percent+label',
        title={'text': f'Average text length: {mean_words:.2f} words',
               'font': {'size': 10}}
            ),
        row=row + 1, col=col + 1)  # plotly grid positions are 1-based


fig.update_layout(title_text='Distribution of classes in the datasets',
                    showlegend=False)

# The target directory 'plots/' must already exist.
fig.write_html('plots/distribution_of_classes.html')
fig.show()

## Classifiers

So here we set up the classifiers we are going to use. Since scikit-learn has a consistent interface, we can just use a list of classifiers and iterate over them. For example, they all have a `fit` method and a `predict` method, so we can call them in a loop and collect the results. That is what I've seen in the other notebooks.

In [4]:
# we will import Naive Bayes, softmax classifier, and svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [5]:
# scikit-learn baselines keyed by their display name; the two parallel lists
# below are derived from this mapping, so names and models cannot drift apart.
# NOTE: with l1_ratio=1 the elastic-net penalty reduces to a pure L1 penalty.
_sklearn_baselines = {
    'Naive Bayes': MultinomialNB(),
    'Softmax ElasticNet': LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=1, C=0.1),
    'Softmax L2': LogisticRegression(penalty='l2', C=0.1),
    'SVM': SVC(),
}

classifiers_old = list(_sklearn_baselines.values())

classifier_names = list(_sklearn_baselines)

In [137]:
import nbc

# Four independent instances of the same nbc.ClassicalNaiveBayes class. The
# variants differ only in how fit_and_score trains them:
#   cnbc  -> fit(...)                      classical Naive Bayes
#   wnbc  -> fit(..., weight=True)         weighted Naive Bayes
#   nbL2  -> fit_L2(...)                   Naive Bayes with L2 regularization
#   wnbL2 -> fit_L2(..., weight=True)      weighted Naive Bayes with L2 regularization
cnbc, wnbc, nbL2, wnbL2 = (nbc.ClassicalNaiveBayes() for _ in range(4))

# Display names, in the same order as `classifiers`.
classifier_names = ['ClassicalNB', 'WeightedNB', 'NBL2', 'WeightedNBL2']
classifiers = [cnbc, wnbc, nbL2, wnbL2]

def fit_and_score(X_train, y_train, X_test, y_test):
    """Fit the four module-level Naive Bayes models and report their accuracy.

    The models (``cnbc``, ``wnbc``, ``nbL2``, ``wnbL2``), the ``classifiers``
    list and ``classifier_names`` are read from the notebook's global
    namespace. A progress line is printed before each fit.

    Parameters
    ----------
    X_train, y_train : training features and labels, passed to each fit call.
    X_test, y_test : held-out features and labels, used only for scoring.

    Returns
    -------
    (training_acc, test_acc) : two lists holding the value of ``score`` for
        every model in ``classifiers``, in that list's order.
    """
    # (model, fitting method, extra keyword arguments), in classifier_names order.
    fit_plan = (
        (cnbc, 'fit', {}),
        (wnbc, 'fit', {'weight': True}),
        (nbL2, 'fit_L2', {}),
        (wnbL2, 'fit_L2', {'weight': True}),
    )
    for position, (model, method_name, extra) in enumerate(fit_plan):
        print(f'Fitting {classifier_names[position]}')
        getattr(model, method_name)(X_train, y_train, **extra)

    def accuracies(features, targets):
        # Score every registered classifier on the given split.
        return [model.score(features, targets) for model in classifiers]

    training_acc = accuracies(X_train, y_train)
    test_acc = accuracies(X_test, y_test)

    return training_acc, test_acc

## Fit and Predict

So here we fit the models and predict the results. I am just printing the accuracy for the training set and the test set, but we can also print other metrics. scikit-learn also has a `classification_report` function that prints the precision, recall, and F1-score for each class; just writing this down so I don't forget.

In [6]:
# we will import train_test_split to split our data
from sklearn.model_selection import train_test_split

In [105]:
# load stopwords
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One pickled stop-word collection per language.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
spanish_stopwords = _read_pickle('stopwords/spanish')

english_stopwords = _read_pickle('stopwords/english')

portuguese_stopwords = _read_pickle('stopwords/portuguese.pkl')

dutch_stopwords = _read_pickle('stopwords/dutch.pkl')

In [128]:
# Cast the `text` column of every dataset to str, modifying each frame in place
# (presumably so the vectorizers only ever receive strings — TODO confirm).
for dataset in datasets:
    dataset['text'] = dataset['text'].astype(str)
# Last expression of the cell: display the dataset names.
dataset_names

['Huffington Post',
 'Bancolombia',
 'Folha de Sao Paolo',
 'BBC',
 'NOS',
 'Geeks4Geeks']

In [130]:
# we will import CountVectorizer to vectorize our text
from sklearn.feature_extraction.text import CountVectorizer

def _count_vectorizer(stop_words):
    """Build a bag-of-words vectorizer that ignores terms found in fewer than 5 documents."""
    return CountVectorizer(min_df=5, stop_words=stop_words)


# One vectorizer per dataset, each with the stop words of that dataset's language.
vectorizer_huffpost = _count_vectorizer(english_stopwords)
vectorizer_bancolombia = _count_vectorizer(spanish_stopwords)
vectorizer_folha = _count_vectorizer(portuguese_stopwords)
vectorizer_bbc = _count_vectorizer(english_stopwords)
vectorizer_nos = _count_vectorizer(dutch_stopwords)
vectorizer_geeks4geeks = _count_vectorizer(english_stopwords)

# Same order as `datasets`.
vectorizers = [
    vectorizer_huffpost,
    vectorizer_bancolombia,
    vectorizer_folha,
    vectorizer_bbc,
    vectorizer_nos,
    vectorizer_geeks4geeks,
]

# NOTE(review): each vocabulary is fitted on the complete dataset, i.e. before
# any train/test split is made.
X = [vectorizer.fit_transform(frame['text']) for vectorizer, frame in zip(vectorizers, datasets)]
Y = [frame['label'] for frame in datasets]

# save vectorizers and X
for path, payload in (('vectorizers.pkl', vectorizers), ('X.pkl', X)):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)

In [138]:
# we will import train_test_split to split our data
from sklearn.model_selection import train_test_split

# Number of simulations for each dataset
# Number of simulations for each dataset
# NOTE(review): M is not used by the loop below — every dataset is split and
# fitted exactly once. Kept in case other cells rely on it.
M = 5

# Results are gathered as {row label: [one accuracy per classifier]} and turned
# into a DataFrame once at the end, instead of enlarging an empty frame row by row.
accuracy_rows = {}

for x, y, dataset_name in zip(X, Y, dataset_names):
    # Fixed random_state: the same 80/20 split is produced on every run.
    # X_train / X_test / y_train / y_test stay bound to the last dataset after
    # the loop, and cells further down reuse them.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    print(f'Fitting {dataset_name} dataset')
    training_acc, test_acc = fit_and_score(X_train, y_train, X_test, y_test)

    accuracy_rows[f'{dataset_name} Train'] = training_acc
    accuracy_rows[f'{dataset_name} Test'] = test_acc

# One row per dataset/split, one column per classifier.
accuracy = pd.DataFrame.from_dict(accuracy_rows, orient='index', columns=classifier_names)

Fitting ClassicalNB
Fitting WeightedNB
Fitting NBL2
Fitting WeightedNBL2
Fitting ClassicalNB
Fitting WeightedNB
Fitting NBL2
Fitting WeightedNBL2
Fitting ClassicalNB
Fitting WeightedNB
Fitting NBL2
Fitting WeightedNBL2
Fitting ClassicalNB
Fitting WeightedNB
Fitting NBL2
Fitting WeightedNBL2
Fitting ClassicalNB
Fitting WeightedNB
Fitting NBL2
Fitting WeightedNBL2
Fitting ClassicalNB
Fitting WeightedNB
Fitting NBL2
Fitting WeightedNBL2


In [139]:
accuracy

Unnamed: 0,ClassicalNB,WeightedNB,NBL2,WeightedNBL2
Huffington Post Train,0.528907,0.528907,0.16971,0.019753
Huffington Post Test,0.43452,0.43452,0.170739,0.019711
Bancolombia Train,0.951696,0.951696,0.996917,0.996917
Bancolombia Test,0.864754,0.864754,0.758197,0.758197
Folha de Sao Paolo Train,0.920128,0.920128,0.730919,0.730919
Folha de Sao Paolo Test,0.889076,0.889076,0.726828,0.726828
BBC Train,0.99382,0.99382,1.0,1.0
BBC Test,0.973034,0.973034,0.941573,0.941573
NOS Train,0.712266,0.712266,0.338108,0.338108
NOS Test,0.684452,0.684452,0.337377,0.337377


## Evaluation

Here we can look deeper into the models. As an example, below I am printing the most important words (those with the highest conditional probability given a class) for Naive Bayes.

In [23]:
# Fit scikit-learn's multinomial Naive Bayes on whatever split is currently
# bound to X_train / y_train in the kernel (fit returns the estimator itself).
sknbc = MultinomialNB().fit(X_train, y_train)
# Log-probability of each feature given a class: one row per class.
phi = sknbc.feature_log_prob_
phi.shape

(7, 7661)

In [24]:
print('Most informative features for each class for Naive Bayes', end='\n\n')

# Fetch the vocabulary once instead of rebuilding it for every printed word.
# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only provide get_feature_names(), so use whichever is available.
# NOTE(review): `vectorizer` is not defined in this cell; it must be the fitted
# vectorizer that matches the data `sknbc` was trained on.
_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = _get_names()

for i, cls in enumerate(sknbc.classes_):
    # 'Otra' is short, so it gets a second tab to line up with the other labels.
    padding = '\t\t' if cls == 'Otra' else '\t'
    print(f'{cls}:{padding}', end=' ')
    # argsort is ascending: the last ten indices are the ten largest log-probabilities.
    top_ten = phi[i].argsort()[-10:]
    print(', '.join(feature_names[j] for j in top_ten))

Most informative features for each class for Naive Bayes

Alianzas:	 parte, presidente, año, mercado, empresas, 000, país, millones, alianza, colombia
Innovacion:	 tecnología, forma, través, innovación, digital, empresas, banco, datos, clientes, bbva
Macroeconomia:	 aumento, economía, alimentos, 2022, mayor, crecimiento, bbva, año, precios, inflación
Otra:		 000, crédito, financiera, entidad, año, clientes, colombia, millones, banco, bbva
Regulaciones:	 año, si, servicios, gobierno, mercado, país, dijo, empresas, colombia, regulación
Reputacion:	 mejor, puesto, colombia, país, 10, millones, posición, sector, reputación, empresas
Sostenibilidad:	 puede, millones, cada, además, cambio, agua, sostenibilidad, sostenible, energía, bbva


In [30]:
import nbc # our own implementation of Naive Bayes

# Four independent instances of our own nbc.ClassicalNaiveBayes; the variant is
# decided by how each one is fitted in fit_and_score:
#   cnbc    -> fit(...)                    generic Naive Bayes with Laplace smoothing
#   wnbc    -> fit(..., weight=True)       Laplace smoothing with word weights
#   nbc_L2  -> fit_L2(...)                 L2 regularization
#   nbc_L2w -> fit_L2(..., weight=True)    L2 regularization with word weights
cnbc, wnbc, nbc_L2, nbc_L2w = (nbc.ClassicalNaiveBayes() for _ in range(4))

def fit_and_score(X_train, y_train, X_test, y_test):
    """Fit the four module-level Naive Bayes models and report their accuracy.

    Uses the global instances ``cnbc``, ``wnbc``, ``nbc_L2`` and ``nbc_L2w``.
    Running this cell rebinds the name ``fit_and_score``, replacing any
    definition made earlier in the kernel.

    Parameters
    ----------
    X_train, y_train : training features and labels, passed to each fit call.
    X_test, y_test : held-out features and labels, used only for scoring.

    Returns
    -------
    (training_acc, test_acc) : two lists with the value of ``score`` for
        cnbc, wnbc, nbc_L2 and nbc_L2w, in that order.
    """
    # fit all classifiers
    cnbc.fit(X_train, y_train)
    wnbc.fit(X_train, y_train, weight=True)
    nbc_L2.fit_L2(X_train, y_train)
    nbc_L2w.fit_L2(X_train, y_train, weight=True)

    fitted_models = (cnbc, wnbc, nbc_L2, nbc_L2w)
    training_acc = [model.score(X_train, y_train) for model in fitted_models]
    test_acc = [model.score(X_test, y_test) for model in fitted_models]

    return training_acc, test_acc

In [32]:
# Fit and score the four Naive Bayes variants on the split currently bound to
# X_train / X_test / y_train / y_test in the kernel (not defined in this cell;
# the result rows below label it as Bancolombia — confirm before re-running).
training_acc, test_acc = fit_and_score(X_train, y_train, X_test, y_test)

Starting gradient ascent with alpha = 1e-06 and l2 = 1000
Starting gradient ascent with alpha = 1e-06 and l2 = 1000


In [34]:
# Build the result table in one constructor call (one row per split, one
# column per model) instead of enlarging an empty frame row by row.
# NOTE: this rebinds `accuracy`, replacing any table of that name already in the kernel.
accuracy = pd.DataFrame(
    [training_acc, test_acc],
    index=['Bancolombia-training', 'Bancolombia-test'],
    columns=['ClassicalNB', 'WNBC', 'L2NBC', 'L2WNBC'],
)

accuracy

Unnamed: 0,ClassicalNB,WNBC,L2NBC,L2WNBC
Bancolombia-training,0.951696,0.951696,0.996917,0.996917
Bancolombia-test,0.864754,0.864754,0.758197,0.758197


In [36]:
print('Most informative features for each class for Naive Bayes', end='\n\n')

# Fetch the vocabulary once instead of rebuilding it for every printed word.
# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only provide get_feature_names(), so use whichever is available.
# NOTE(review): `vectorizer` is not defined in this cell; it must be the fitted
# vectorizer that matches the data the models were trained on.
_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = _get_names()

# Class labels are taken from wnbc and used to index both models
# (assumes cnbc and wnbc order their classes identically — TODO confirm).
for i, cls in enumerate(wnbc.classes):
    # 'Otra' is short, so it gets a second tab to line up with the other labels.
    prefix = f'{cls}:\t\t' if cls == 'Otra' else f'{cls}:\t'

    # Two lines per class: the classical model first, then the weighted one.
    for model in (cnbc, wnbc):
        print(prefix, end=' ')
        # phi.T[i] is indexed by class position; the last ten argsort indices
        # are the ten largest entries for that class.
        print(', '.join(feature_names[j] for j in model.phi.T[i].argsort()[-10:]))

Most informative features for each class for Naive Bayes

Alianzas:	 parte, presidente, año, mercado, empresas, 000, país, millones, alianza, colombia
Alianzas:	 parte, año, presidente, mercado, empresas, 000, país, millones, alianza, colombia
Innovacion:	 tecnología, forma, través, innovación, digital, empresas, banco, datos, clientes, bbva
Innovacion:	 tecnología, forma, través, innovación, digital, empresas, banco, datos, clientes, bbva
Macroeconomia:	 aumento, economía, alimentos, 2022, mayor, crecimiento, bbva, año, precios, inflación
Macroeconomia:	 economía, aumento, alimentos, 2022, mayor, crecimiento, bbva, año, precios, inflación
Otra:		 000, crédito, financiera, entidad, año, clientes, colombia, millones, banco, bbva
Otra:		 000, crédito, financiera, entidad, año, colombia, clientes, millones, banco, bbva
Regulaciones:	 año, si, servicios, gobierno, mercado, país, dijo, empresas, colombia, regulación
Regulaciones:	 sector, si, servicios, gobierno, mercado, país, dijo, empres