# FDS Final Project - Cemplate
Ima drop some template here for easy implementation

In [1]:
import pandas as pd
import numpy as np

import pickle
import time

# surpess warnings
import warnings
warnings.filterwarnings('ignore')

## Datasets

So here, we can prepare the datasets to test out the models. We can use several of them during our analysis, and see how the models perform on different datasets. If there is no variation in the performance, then we may remove all but one before the final submission. Or use it as an evidence of robustness. 

I am intentionally not changing the columns for now, so you can see the data as it is. Later on, we will have one column of **text** and one column of **label**.

In [331]:
# Dataset 1 from Huffington Post
huffpost = pd.read_json('data/huff_post.json', lines=True).dropna().reset_index()
huffpost = huffpost[['short_description', 'category']]
huffpost.columns = ['text', 'label']
huffpost.head(1)

Unnamed: 0,text,label
0,Health experts said it is too early to predict...,U.S. NEWS


In [332]:
# Dataset 2 from Bancolombia
bancolombia = pd.read_csv('data/bancolombia.csv').dropna().reset_index()
bancolombia = bancolombia[['news', 'Type']]
bancolombia.columns = ['text', 'label']
bancolombia.head(1)

Unnamed: 0,text,label
0,Durante el foro La banca articulador empresari...,Otra


In [345]:
# Dataset 3 from Folha de Sao Paulo
# https://www.kaggle.com/datasets/marlesson/news-of-the-site-folhauol
folha = pd.read_csv('data/folha.csv').dropna().reset_index()
folha = folha[['text', 'category']]
folha.columns = ['text', 'label']
folha.head(1)

Unnamed: 0,text,label
0,DA BBC BRASIL O trimestre de maio a julho des...,sobretudo


In [334]:
# Dataset 4 from BBC, no null values
bbc = pd.read_csv('data/bbc.csv').dropna().reset_index()
bbc = bbc[['content', 'category']]
bbc.columns = ['text', 'label']
bbc.head(1)

Unnamed: 0,text,label
0,Quarterly profits at US media giant TimeWarner...,business


In [335]:
# Dataset 4 from NOS
# https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles
nos = pd.read_csv('data/nos.csv').dropna().reset_index()
nos = nos[['content', 'category']]
nos.columns = ['text', 'label']
nos.head(1)

Unnamed: 0,text,label
0,De enige kerncentrale van Litouwen is oudjaars...,Buitenland


In [336]:
# Dataset 5 from Geeks4Geeks
# https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles
geeks4geeks = pd.read_csv('data/geeks4geeks.csv').dropna().reset_index()
geeks4geeks = geeks4geeks[['title', 'category']]
geeks4geeks.columns = ['text', 'label']
geeks4geeks.head(1)

Unnamed: 0,text,label
0,5 Best Practices For Writing SQL Joins,easy


In [337]:
# Now we put the datasets in a list
datasets = [huffpost, bancolombia, folha, bbc, nos, geeks4geeks]
dataset_names = ['huffpost', 'bancolombia', 'folha', 'bbc', 'nos', 'geeks4geeks']
dataset_names = ['Huffington Post', 'Bancolombia', 'Folha de Sao Paolo', 'BBC', 'NOS', 'Geeks4Geeks']

In [344]:
folha['label'].value_counts()

colunas                  21619
esporte                   2859
tv                        2123
sobretudo                 1057
poder                      939
saopaulo                   471
paineldoleitor             260
empreendedorsocial         150
o-melhor-de-sao-paulo       71
cotidiano                   35
multimidia                  27
Name: label, dtype: int64

In [339]:
for i in range(len(datasets)):
    print(dataset_names[i], datasets[i].shape, len(datasets[i]['label'].value_counts()))

Huffington Post (209527, 2) 42
Bancolombia (1217, 2) 7
Folha de Sao Paolo (29611, 2) 11
BBC (2225, 2) 5
NOS (246457, 2) 11
Geeks4Geeks (34551, 2) 5


In [346]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import itertools

fig = make_subplots(rows=2, cols=3, specs=[[{'type':'pie'}, {'type':'pie'}, {'type':'pie'}], [{'type':'pie'}, {'type':'pie'},{'type':'pie'}]],
                    subplot_titles=dataset_names)

k = 0
for i,j in itertools.product(range(2), range(3)):
    fig.add_trace(go.Pie(
        labels=datasets[k]['label'].value_counts().index, 
        values=datasets[k]['label'].value_counts().values,
        name=dataset_names[k],
        textposition='inside', textinfo='percent+label',
        title={'text': f'Average text length: {datasets[k]["text"].apply(lambda x: len(str(x).split())).mean():.2f} words',
               'font': {'size': 10}}
            ),
        row = i+1, col = j+1)

    fig.update
    k += 1


fig.update_layout(title_text='Distribution of classes in the datasets',
                    showlegend=False)

fig.write_html('plots/distribution_of_classes.html')
fig.show()

## Classifiers

So here we set up the classifiers we are going to use. Since SKLearn has a constant interface, we can just use a list of classifiers and iterate over them. For example, they all have a `fit` method, and a `predict` method. So we can just call them in a loop, and get the results. That is what I've seen in the other notebooks.

In [44]:
import nbc

cnbc = nbc.ClassicalNaiveBayes() # Classical Naive Bayes
wnbc = nbc.ClassicalNaiveBayes() # Weighted Naive Bayes
nbL2 = nbc.ClassicalNaiveBayes() # Naive Bayes with L2 regularization
wnbL2 = nbc.ClassicalNaiveBayes() # Weighted Naive Bayes with L2 regularization

classifiers = [cnbc, wnbc, nbL2, wnbL2]
classifier_names = ['ClassicalNB', 'WeightedNB', 'NBL2', 'WeightedNBL2']

def fit_and_score(X_train, y_train, X_test, y_test):
    # fit all classifiers
    cnbc.fit(X_train, y_train)
    wnbc.fit(X_train, y_train, weight=True)
    nbL2.fit_L2(X_train, y_train)
    wnbL2.fit_L2(X_train, y_train, weight=True)

    training_acc = [cls.score(X_train, y_train) for cls in classifiers]
    test_acc     = [cls.score(X_test, y_test) for cls in classifiers]

    return training_acc, test_acc

## Fit and Predict

So here we fit the model, and predict the results. I am just printint the accuracy for the training set, and the test set. But we can also print other metrics. Also SKLearn has a `classification_report` function that can be used to print the precision, recall, and f1-score for each class, just writing this down so I don't forget.

In [45]:
# we will import train_test_split to split our data
from sklearn.model_selection import train_test_split

In [46]:
# load stopwords
with open('stopwords/spanish', 'rb') as f:
    spanish_stopwords = pickle.load(f)

with open('stopwords/english', 'rb') as f:
    english_stopwords = pickle.load(f)

with open('stopwords/portuguese.pkl', 'rb') as f:
    portuguese_stopwords = pickle.load(f)

with open('stopwords/dutch.pkl', 'rb') as f:
    dutch_stopwords = pickle.load(f)

In [47]:
for dataset in datasets:
    dataset['text'] = dataset['text'].astype(str)
dataset_names

['Huffington Post',
 'Bancolombia',
 'Folha de Sao Paolo',
 'BBC',
 'NOS',
 'Geeks4Geeks']

### Term Frequency

In [70]:
# we will import CountVectorizer to vectorize our text
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_huffpost    = CountVectorizer(min_df=10, max_df=0.2, max_features=10000, stop_words=english_stopwords)
vectorizer_bancolombia = CountVectorizer(min_df=10, max_df=0.2, max_features=10000, stop_words=spanish_stopwords)
vectorizer_folha       = CountVectorizer(min_df=10, max_df=0.2, max_features=10000, stop_words=portuguese_stopwords)
vectorizer_bbc         = CountVectorizer(min_df=10, max_df=0.2, max_features=10000, stop_words=english_stopwords)
vectorizer_nos         = CountVectorizer(min_df=10, max_df=0.2, max_features=10000, stop_words=dutch_stopwords)
vectorizer_geeks4geeks = CountVectorizer(min_df=10, max_df=0.2, max_features=10000, stop_words=english_stopwords)

vectorizers = [vectorizer_huffpost, vectorizer_bancolombia, vectorizer_folha, vectorizer_bbc, vectorizer_nos, vectorizer_geeks4geeks]

X = [vect.fit_transform(dataset['text']) for vect, dataset in zip(vectorizers, datasets)]
Y = [dataset['label'] for dataset in datasets]

# save vectorizers and X
with open('vectorizers.pkl', 'wb') as f:
    pickle.dump(vectorizers, f)

with open('X.pkl', 'wb') as f:
    pickle.dump(X, f)

In [71]:
for x in X:
    print(x.shape)

(209527, 10000)
(1217, 4435)
(942, 4022)
(2225, 5636)
(246457, 10000)
(34551, 2092)


In [91]:
# we will import train_test_split to split our data
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer as tfidf
from tqdm import tqdm
# Number of simulations for each dataset
M = 10

accuracy = pd.DataFrame(columns=[*classifier_names, 'dataset', 'train/test', 'simulation', 'vectorizer'])

k = 0
for x, y, dataset_name in zip(X, Y, dataset_names):
    for i in (pbar := tqdm(range(M))):
        pbar.set_description(f'Fitting {dataset_name} dataset')

        # split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i+1)

        # fit and score for count
        training_acc, test_acc = fit_and_score(X_train, y_train, X_test, y_test)

        # save results
        accuracy.loc[k] = [*training_acc, dataset_name, 'train', i+1, 'count']; k += 1
        accuracy.loc[k] = [*test_acc, dataset_name, 'test', i+1, 'count']; k += 1

        # transform X to tfidf
        X_train_idf, X_test_idf = tfidf().fit_transform(X_train), tfidf().fit_transform(X_test)

        # fit and score for tfidf
        training_acc, test_acc = fit_and_score(X_train_idf, y_train, X_test_idf, y_test)

        # save results
        accuracy.loc[k] = [*training_acc, dataset_name, 'train', i+1, 'tfidf']; k += 1
        accuracy.loc[k] = [*test_acc, dataset_name, 'test', i+1, 'tfidf']; k += 1

Fitting Huffington Post dataset: 100%|██████████| 10/10 [39:47<00:00, 238.76s/it]
Fitting Bancolombia dataset: 100%|██████████| 10/10 [00:20<00:00,  2.09s/it]
Fitting Folha de Sao Paolo dataset: 100%|██████████| 10/10 [00:19<00:00,  1.90s/it]
Fitting BBC dataset: 100%|██████████| 10/10 [00:21<00:00,  2.11s/it]
Fitting NOS dataset: 100%|██████████| 10/10 [42:38<00:00, 255.86s/it]
Fitting Geeks4Geeks dataset: 100%|██████████| 10/10 [00:55<00:00,  5.57s/it]


In [93]:
accuracy

# save results
accuracy.to_csv('results/accuracy.csv', index=False)

In [96]:
accuracy[accuracy['train/test'] == 'train'].groupby(['dataset', 'vectorizer']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,ClassicalNB,WeightedNB,NBL2,WeightedNBL2,simulation
dataset,vectorizer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BBC,count,0.990169,0.990169,0.999157,0.999157,5.5
BBC,tfidf,0.985,0.985,0.999551,0.999551,5.5
Bancolombia,count,0.937205,0.937205,0.992395,0.992395,5.5
Bancolombia,tfidf,0.843371,0.843371,0.992497,0.992497,5.5
Folha de Sao Paolo,count,0.971713,0.971713,0.99907,0.99907,5.5
Folha de Sao Paolo,tfidf,0.902125,0.902125,1.0,1.0,5.5
Geeks4Geeks,count,0.43009,0.43009,0.437051,0.437051,5.5
Geeks4Geeks,tfidf,0.437786,0.437786,0.437066,0.437066,5.5
Huffington Post,count,0.508921,0.508921,0.169982,0.020094,5.5
Huffington Post,tfidf,0.421451,0.421451,0.47227,0.47227,5.5


In [101]:
accuracy[accuracy['train/test'] == 'test'].groupby(['dataset', 'vectorizer']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,ClassicalNB,WeightedNB,NBL2,WeightedNBL2,simulation
dataset,vectorizer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BBC,count,0.975506,0.975506,0.940674,0.940674,5.5
BBC,tfidf,0.973933,0.973933,0.940899,0.940899,5.5
Bancolombia,count,0.844262,0.844262,0.738115,0.738115,5.5
Bancolombia,tfidf,0.754918,0.754918,0.75082,0.75082,5.5
Folha de Sao Paolo,count,0.863492,0.863492,0.848677,0.848677,5.5
Folha de Sao Paolo,tfidf,0.859259,0.859259,0.859259,0.859259,5.5
Geeks4Geeks,count,0.354464,0.354464,0.359239,0.359239,5.5
Geeks4Geeks,tfidf,0.369006,0.369006,0.36795,0.36795,5.5
Huffington Post,count,0.422567,0.422567,0.169651,0.019801,5.5
Huffington Post,tfidf,0.39019,0.39019,0.362146,0.362146,5.5


## Evaluation

Here we can look deeper into the models. As an example, below I am printing the most important words (that have the highest conditonal probability given a class) for the Naive Bayes.

In [110]:
# index of Bancolombia
dataset_names.index('Bancolombia'), dataset_names

(1,
 ['Huffington Post',
  'Bancolombia',
  'Folha de Sao Paolo',
  'BBC',
  'NOS',
  'Geeks4Geeks'])

In [111]:
cnbc.fit(X[1], Y[1])
nbL2.fit_L2(X[1], Y[1])

In [164]:
print('Most informative features for each class for Naive Bayes', end='\n\n')
important_words = set()

for i, cls in enumerate(cnbc.classes):
    if cls == 'Otra':
        print(f'{cls}:\t\t', end=' ')
    else:
        print(f'{cls}:\t', end=' ')

    if cls in ['Innovacion', 'Macroeconomia']:
        important_words.update([(j, vectorizer_bancolombia.get_feature_names()[j]) for j in cnbc.phi.T[i].argsort()[-10:]])
        important_words.update([(j, vectorizer_bancolombia.get_feature_names()[j]) for j in nbL2.phi.T[i].argsort()[-10:]])

    print(', '.join([vectorizer_bancolombia.get_feature_names()[j] for j in cnbc.phi.T[i].argsort()[-10:]]))

Most informative features for each class for Naive Bayes

Alianzas:	 grupo, dijo, acceso, programa, medio, digital, empresa, compañía, usuarios, alianza
Innovacion:	 cliente, soluciones, capital, seguridad, permite, digitales, tecnología, españa, innovación, digital
Macroeconomia:	 tasas, alza, variación, anual, economista, méxico, puntos, ipc, tasa, alimentos
Otra:		 sostenibilidad, digitales, sostenible, 2019, banca, tasa, grupo, consumo, tarjetas, crédito
Regulaciones:	 comisión, usuarios, uber, ley, industria, trabajo, plataformas, gobierno, dijo, regulación
Reputacion:	 lugar, compañías, ranking, empresa, grupo, 10, puesto, posición, marca, reputación
Sostenibilidad:	 uso, climático, consumo, carbono, españa, emisiones, sostenibilidad, agua, sostenible, energía


In [241]:
# Get indexes of Macroeconomia and Innovacion
def get_idx(l, items):
    return [i for i, x in enumerate(l) if x in items]

# Get indexes of Macroeconomia and Innovacion
get_idx(cnbc.classes, ['Macroeconomia', 'Innovacion'])

[1, 2]

In [357]:
import networkx as nx
from pyvis.network import Network
import seaborn as sns

important_words = set()

top_words = 5

allword = True

classes = ['Macroeconomia', 'Innovacion', 'Sostenibilidad']
clidx = get_idx(cnbc.classes, classes)

if allword:
    important_words = set([(j, word) for j, word in enumerate(vectorizer_bancolombia.get_feature_names())])
else:
    for i, cls in enumerate(cnbc.classes):

        if cls in ['Macroeconomia']:
            important_words.update([(j, vectorizer_bancolombia.get_feature_names()[j]) for j in cnbc.phi.T[i].argsort()[-top_words:]])
            important_words.update([(j, vectorizer_bancolombia.get_feature_names()[j]) for j in nbL2.phi.T[i].argsort()[-top_words:]])

G = nx.Graph()

# add important words and classes as nodes

weights1 = np.array([[cnbc.phi[i, j] for i, word in important_words] for j, cls in enumerate(cnbc.classes) if cls in classes])
weights2 = np.array([[nbL2.phi[i, j] for i, word in important_words] for j, cls in enumerate(cnbc.classes) if cls in classes])
changes = weights2 - weights1

maxchange = np.max(np.abs(changes))
minchange = np.min(np.abs(changes))

changes[changes > 0] = 50 + 48 * (changes[changes > 0] - minchange) / (maxchange - minchange)
changes[changes < 0] = 50 + 48 * (changes[changes < 0] + minchange) / (maxchange - minchange)

# filter the changes with top 10% and bottom 10%
maxchange = np.percentile(np.abs(changes), 99)
minchange = np.percentile(np.abs(changes), 1)

print(maxchange, minchange)

for j, (i, word) in enumerate(important_words):
    G.add_node(word, color=sns.color_palette("rocket", n_colors=100).as_hex()[
        int(99 * max((weights1[:,j] - np.min(weights1, axis=1)) / (np.max(weights1,axis=1) - np.min(weights1,axis=1))))
        ]
        )

G.add_nodes_from(classes, color='#00FF00')

print('Finished adding nodes')
change_bool = True

# add edges between important words and classes from cnbc.phi
for k, (i, word) in enumerate(important_words):

    for j, cls in enumerate(cnbc.classes):
        if j not in clidx:
            continue

        if change_bool:
            G.add_edge(word, cls, color=sns.color_palette("Spectral_r", n_colors=100).as_hex()[int(changes[k])])

print('Finished adding edges')

net = Network(bgcolor='#222222', font_color='white')
net.from_nx(G)
net.set_edge_smooth('dynamic')
net.show_buttons(filter_=['physics'])
net.show(f'important_words.html')

50.47038261939856 45.02810810045058


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [364]:
len(weights1), len(weights2), len(weights1[0]), np.min(weights1, axis=1)

(3, 3, 4435, array([2.19630581e-05, 1.44402247e-05, 2.37574836e-05]))

In [328]:
change = nbL2.phi - cnbc.phi
 
np.min(change), np.max(change)

(-0.00867074655143806, 0.00021467967422580578)

In [315]:
# import nbc from sklearn
from sklearn.naive_bayes import MultinomialNB as skNBC

# create a new NBC
skNBC = skNBC()

# fit the model
skNBC.fit(X[1], bancolombia['label'])

MultinomialNB()