In [None]:
import pandas as pd
import numpy as np
 
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from utils_data import filter_by_category

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Global seaborn theme for the plots below: "whitegrid" style, "talk" context.
# The trailing comment keeps a two-colour palette that is currently not applied.
sns.set(style="whitegrid", context='talk') #palette=['#D44D5C', '#43AA8B']

In [None]:
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset.
data_es = load_dataset('head_qa', 'es' )
# Bare expression: the notebook displays the loaded object as the cell output.
data_es

In [None]:
# Pull the three splits out of the loaded dataset by their keys.
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [None]:
# Show the training split as a DataFrame (bare expression = cell output).
pd.DataFrame(data=training)

In [None]:
def _count_by_category(split):
    """Return a DataFrame with one row per category and the number of 'qid' values in it."""
    frame = pd.DataFrame(split)
    grouped = frame.groupby(by='category', as_index=False)
    return grouped.agg({'qid': 'count'})


# Same aggregation for each split; the 'qid' column holds the instance count.
training_categ_count = _count_by_category(training)
validation_categ_count = _count_by_category(validation)
testing_categ_count = _count_by_category(testing)

In [None]:
# Instances per category in the training split. The counts are already
# aggregated, so they are passed to the histogram as weights.
category_labels = ['Biología', 'Química', 'Medicina', 'Enfermería', 'Farmacología', 'Psicología']
g = sns.histplot(data=training_categ_count, x="category", weights='qid')
g.set_xticklabels(labels=category_labels, rotation=45)
g.set_xlabel("Train", fontsize=20)
g.set_ylabel("Cantidad de instancias", fontsize=16)

In [None]:
# Instances per category in the validation split, drawn with a red palette.
sns.set(context='talk', style="whitegrid", palette=['#FF2E2E'])
category_labels = ['Biología', 'Química', 'Medicina', 'Enfermería', 'Farmacología', 'Psicología']
g = sns.histplot(data=validation_categ_count, x="category", weights='qid')
g.set_xticklabels(labels=category_labels, rotation=45)
g.set_xlabel("Dev", fontsize=22)
g.set_ylabel("Cantidad de instancias", fontsize=16)

In [None]:
# Instances per category in the test split, drawn with a green palette.
sns.set(context='talk', style="whitegrid", palette=['#43AA8B'])
category_labels = ['Biología', 'Química', 'Medicina', 'Enfermería', 'Farmacología', 'Psicología']
g = sns.histplot(data=testing_categ_count, x="category", weights='qid')
g.set_xticklabels(labels=category_labels, rotation=45)
g.set_xlabel("Test", fontsize=22)
g.set_ylabel("Cantidad de instancias", fontsize=16)

In [None]:
# g = sns.displot(validation_categ_count, x="Categoria", weights ='qid', )
# g.set_xticklabels(rotation=45)
# plt.xlabel("", fontsize=22)
# plt.ylabel("Cantidad de instancias",fontsize=16)

In [None]:
# Figure-level variant of the test-split category plot, weighted by the
# precomputed 'qid' counts.
g = sns.displot(testing_categ_count, x="category", weights ='qid', )
g.set_xticklabels(rotation=45)
# The pyplot calls below act on whichever axes is current at this point.
# NOTE(review): presumably the single axes created by displot - confirm.
plt.xlabel("", fontsize=22)
plt.ylabel("Cantidad de instancias",fontsize=16)

In [None]:
import spacy
# Load the 'es_core_news_lg' spaCy pipeline used to tokenize the texts below.
nlp = spacy.load('es_core_news_lg')  
# Stop-word set: the pipeline's defaults plus a few extra words to exclude.
all_stopwords = nlp.Defaults.stop_words.union({'a', 'y', 'o', 'u', 'siguientes'})

In [None]:
def get_word_counter(dataset, nlp_pipeline=None, stopwords=None):
    """Count the non-punctuation, non-stop-word tokens of a dataset.

    For every row, the question text (``row['qtext']``) and the text of each
    answer (``answer['atext']`` for each entry of ``row['answers']``) are
    tokenized, lower-cased and tallied.

    Args:
        dataset: Iterable of rows, each a mapping with a ``'qtext'`` string
            and an ``'answers'`` iterable of mappings holding ``'atext'``.
        nlp_pipeline: Callable that turns a string into an iterable of tokens
            exposing ``is_punct`` and a ``str()`` form. Defaults to the
            notebook-level ``nlp`` pipeline.
        stopwords: Container of lower-cased words to skip. Defaults to the
            notebook-level ``all_stopwords`` set.

    Returns:
        collections.Counter mapping each kept lower-cased token to its number
        of occurrences.
    """
    # Fall back to the notebook globals so existing single-argument calls
    # behave exactly as before.
    tokenize = nlp if nlp_pipeline is None else nlp_pipeline
    excluded = all_stopwords if stopwords is None else stopwords

    def _content_words(text):
        # Yield the lower-cased form of each token that is neither
        # punctuation nor a stop word.
        for token in tokenize(text):
            if token.is_punct:
                continue
            word = str(token).lower()
            if word not in excluded:
                yield word

    word_counter = Counter()
    for row in dataset:
        answers = row['answers']
        # Counter.update tallies every yielded word; missing keys start at 0.
        word_counter.update(_content_words(row['qtext']))
        for answer in answers:
            word_counter.update(_content_words(answer['atext']))
    return word_counter

In [None]:
# Token frequencies for each split, computed in train / validation / test order.
training_counter, validation_counter, testing_counter = map(
    get_word_counter, (training, validation, testing)
)

In [None]:
# The 20 most frequent (token, count) pairs of the training split.
t_most_common = training_counter.most_common(20)
# Bare expression: display the list as the cell output.
t_most_common

In [None]:
# Split the (token, count) pairs into parallel lists for plotting.
# The previous loop appended the `items` list to itself instead of the
# current token, so the labels were never the actual words.
items = [item for item, _ in t_most_common]
values = [value for _, value in t_most_common]

In [None]:
# Quick bar chart: `items` along the x axis, `values` as bar heights.
plt.bar(items, values)

In [None]:
# Labelled bar chart of the most frequent training tokens.
plt.bar(items, values)
# Take the count from the data: the list holds 20 entries, so the old
# hard-coded "10" in the title was wrong.
plt.title(f"{len(items)} most frequent tokens in description")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()

In [None]:
# g = sns.histplot(data=items)
# g.set_xticklabels(rotation=45, labels=values)
# plt.xlabel("Train", fontsize=20)
# plt.ylabel("Cantidad de instancias",fontsize=16)

In [None]:
# Tabulate the training token counts, most frequent first.
# `collections.Counter` has no `to_df` method (the old call raised
# AttributeError), so the DataFrame is built from the (token, count) pairs.
tdf = pd.DataFrame(training_counter.most_common(), columns=['token', 'count'])

# Instances per category in the training split, weighted by the precomputed counts.
g = sns.histplot(training_categ_count, x="category", weights='qid')
g.set_xticklabels(rotation=45, labels=['Biología', 'Química', 'Medicina', 'Enfermería', 'Farmacología', 'Psicología'])
plt.xlabel("Train", fontsize=20)
plt.ylabel("Cantidad de instancias", fontsize=16)

In [None]:
# Display the 20 most frequent (token, count) pairs of the validation split.
validation_counter.most_common(20)

In [None]:
# Display the 20 most frequent (token, count) pairs of the test split.
testing_counter.most_common(20)

In [None]:
# Word cloud of the training-split token frequencies.
cloud_builder = WordCloud(width=2000, height=1000, background_color="white")
wc = cloud_builder.generate_from_frequencies(training_counter)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Word cloud of the test-split token frequencies.
cloud_builder = WordCloud(width=2000, height=1000, background_color="white")
wc = cloud_builder.generate_from_frequencies(testing_counter)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Word cloud of the validation-split token frequencies.
cloud_builder = WordCloud(width=2000, height=1000, background_color="white")
wc = cloud_builder.generate_from_frequencies(validation_counter)
plt.axis('off')
plt.imshow(wc)

### Por categorías

In [None]:
# Biology: word cloud of the token counts for the 'biology' selection of the
# training split.
data = filter_by_category(training, 'biology')
biology_training_wc = get_word_counter(data)
cloud_builder = WordCloud(width=1000, height=500, background_color="white")
wc = cloud_builder.generate_from_frequencies(biology_training_wc)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Nursing ('nursery' category): word cloud of the token counts for that
# selection of the training split.
data = filter_by_category(training, 'nursery')
# Stored under its own name. The cell used to write into
# `biology_training_wc`, which overwrote the biology counts.
nursery_training_wc = get_word_counter(data)
wc = WordCloud(background_color="white", width=1000, height=500).generate_from_frequencies(nursery_training_wc)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Pharmacology: word cloud of the token counts for that selection of the
# training split.
data = filter_by_category(training, 'pharmacology')
# Stored under its own name. The cell used to write into
# `medicine_training_wc`, which mislabelled these counts as medicine.
pharmacology_training_wc = get_word_counter(data)
wc = WordCloud(background_color="white", width=1000, height=500).generate_from_frequencies(pharmacology_training_wc)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Medicine: word cloud of the token counts for the 'medicine' selection of the
# training split.
data = filter_by_category(training, 'medicine')
medicine_training_wc = get_word_counter(data)
cloud_builder = WordCloud(width=1000, height=500, background_color="white")
wc = cloud_builder.generate_from_frequencies(medicine_training_wc)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Psychology: word cloud of the token counts for that selection of the
# training split.
data = filter_by_category(training, 'psychology')
# Stored under its own name. The cell used to write into
# `medicine_training_wc`, which overwrote the medicine counts.
psychology_training_wc = get_word_counter(data)
wc = WordCloud(background_color="white", width=1000, height=500).generate_from_frequencies(psychology_training_wc)
plt.axis('off')
plt.imshow(wc)

In [None]:
# Chemistry: word cloud of the token counts for that selection of the
# training split.
data = filter_by_category(training, 'chemistry')
# Stored under its own name. The cell used to write into
# `medicine_training_wc`, which overwrote the medicine counts.
chemistry_training_wc = get_word_counter(data)
wc = WordCloud(background_color="white", width=1000, height=500).generate_from_frequencies(chemistry_training_wc)
plt.axis('off')
plt.imshow(wc)