In [None]:
import json
import joblib
import pandas as pd

from utils.tokenizers import tokenizador
from sklearn.feature_extraction.text import CountVectorizer
from typing import List

In [26]:
# Load the three per-section article dumps. Each file is opened in a `with`
# block so its handle is closed as soon as the JSON has been parsed, instead of
# being left open until the garbage collector reclaims it.
with open("economy-articles.json", encoding="utf-8") as articles_file:
    economy = json.load(articles_file)
with open("society-articles.json", encoding="utf-8") as articles_file:
    society = json.load(articles_file)
with open("world-articles.json", encoding="utf-8") as articles_file:
    world = json.load(articles_file)

In [29]:
# Build one DataFrame per section, then stack them row-wise into one corpus.
df_economy = pd.DataFrame.from_dict(economy)
df_society = pd.DataFrame.from_dict(society)
df_world = pd.DataFrame.from_dict(world)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Row labels are kept as-is (no ignore_index),
# matching what the chained .append() calls produced.
df_news = pd.concat([df_economy, df_society, df_world])
df_news.shape

(9900, 8)

In [30]:
def leer_stopwords(path: str, encoding: str = "utf-8") -> List[str]:
    """Read a stop-word list from a text file with one word per line.

    Each line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are discarded.

    Args:
        path: Location of the stop-word file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The normalized stop words, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(path, "r", encoding=encoding) as stopwords_file:
        normalized = (line.strip().lower() for line in stopwords_file)
        return [stopword for stopword in normalized if stopword]
    
# Stop-word list handed to the CountVectorizer as `stop_words`.
# NOTE(review): the file name suggests the words are stored without accents,
# which would line up with strip_accents='unicode' in the vectorizer — confirm.
mi_lista_stopwords = leer_stopwords("utils/stopwords_es_sin_acentos.txt")

In [35]:
# Bag-of-words vectorizer over unigrams and bigrams.
vectorizer = CountVectorizer(
    # Project tokenizer; tokenizador() is called here and its result is used
    # as the tokenizer (presumably a callable str -> list of tokens — confirm).
    tokenizer=tokenizador(),
    stop_words=mi_lista_stopwords,
    # Text normalization applied before tokenizing.
    strip_accents='unicode',
    lowercase=True,
    decode_error='ignore',
    # Vocabulary: 1- and 2-grams seen in at least 3 documents and in at most
    # 80% of the documents.
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.8,
)

In [36]:
# Learn the vocabulary from the article texts and build the document-term
# count matrix in one pass.
vectores = vectorizer.fit_transform(df_news['article'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when available and convert its ndarray to a
# plain list so the persisted artifact keeps the same type as before; fall
# back to the legacy method on older scikit-learn releases.
if hasattr(vectorizer, "get_feature_names_out"):
    features_names = vectorizer.get_feature_names_out().tolist()
else:
    features_names = vectorizer.get_feature_names()

In [37]:
# Persist the training artifacts in the working directory:
#   vectores.joblib - document-term count matrix
#   targets.joblib  - the 'section' column of df_news
#   features.joblib - vocabulary terms
joblib.dump(value=vectores, filename="vectores.joblib")
joblib.dump(value=df_news['section'], filename="targets.joblib")
# Left as the final expression so the cell still echoes the written file name.
joblib.dump(value=features_names, filename="features.joblib")

['features.joblib']

In [38]:
# Show the matrix summary (shape, dtype, number of stored elements) as a sanity check.
vectores

<9900x201614 sparse matrix of type '<class 'numpy.int64'>'
	with 3448523 stored elements in Compressed Sparse Row format>