In [1]:
import pandas as pd
# Do not truncate column contents in DataFrame displays (None = no width limit),
# so long text cells are shown in full.
pd.set_option('display.max_colwidth', None)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


from sklearn.feature_extraction.text import TfidfVectorizer

import swifter

from joblib import dump, load

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from tokenizer import Clean

In [4]:
# Load the labelled dataset from Excel; its 'sdg' column is used as the target
# when the data is split into train and test sets.
data = pd.read_excel("../data/cat_345.xlsx")

In [5]:
# Separate the feature columns from the target column ('sdg').
X = data.drop(columns=['sdg'])
Y = data['sdg']

# Stratified 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1234,
    stratify=Y,
)
print(f"Total de filas en entrenamiento {X_train.shape[0]} y test {X_test.shape[0]}")

Total de filas en entrenamiento 2400 y test 600


In [6]:
# Load the pre-cleaned train and test texts from Excel ("con_lema" in the file
# names suggests they were lemmatised — TODO confirm).
# NOTE(review): predictions made from these frames are later scored against
# Y_train / Y_test, so their rows are assumed to be in the same order as the
# split produced by train_test_split — confirm.
X_train_clean = pd.read_excel("../data/cat_345_clean_con_lema.xlsx")
X_test_clean = pd.read_excel("../data/cat_345_clean_con_lema_test.xlsx")

In [7]:
def toke(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` using the Spanish setting.

    Args:
        text: The string to split into tokens.

    Returns:
        The tokens produced by ``word_tokenize`` for ``text``.
    """
    tokens = word_tokenize(text, language="spanish")
    return tokens

In [8]:
# TF-IDF configuration: sublinear term-frequency scaling, terms kept only if they
# occur in at least 5 documents and in at most 50% of them, NLTK's Spanish stop
# words, and the custom `toke` tokenizer.
vectorizer = TfidfVectorizer(
    tokenizer=toke,
    stop_words=stopwords.words('spanish'),
    min_df=5,
    max_df=0.5,
    sublinear_tf=True,
)

# Fit on the training texts only, then apply the fitted vectorizer to the test texts.
X_train_TFID = vectorizer.fit_transform(X_train_clean['texto_limpio'])
X_test_TFID = vectorizer.transform(X_test_clean['texto_limpio'])



## Loading the best model (already trained)

In [None]:
# Load the previously trained classifier from disk.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while loading,
# so only load model files that come from a trusted source.
best_model = load('best_model.joblib')

In [None]:
# Score the loaded model on the training split and print the per-class metrics.
y_train_pred = best_model.predict(X_train_TFID)
print(
    classification_report(y_true=Y_train, y_pred=y_train_pred)
)

In [None]:
# Score the loaded model on the held-out test split and print the per-class metrics.
y_test_pred = best_model.predict(X_test_TFID)
print(
    classification_report(y_true=Y_test, y_pred=y_test_pred)
)

In [None]:
# Display the true 'sdg' labels of the test split.
Y_test

In [None]:
# Display the labels predicted by the model for the test split.
y_test_pred

In [None]:
# Show the first row of the test features produced by the train/test split.
X_test.head(1)

In [None]:
# Show the first row of the pre-cleaned test data loaded from Excel.
X_test_clean.head(1)

## Pipeline creation

In [None]:
# Assemble the end-to-end inference pipeline: text cleaning -> TF-IDF -> classifier.
# The first step uses `Clean`, the class imported from the local `tokenizer`
# module; no name `Tokenizer` is imported or defined in this notebook.
# NOTE(review): assumes `Clean` is a scikit-learn compatible transformer
# (fit/transform) that takes no constructor arguments — confirm.
# The vectorizer (fitted above on the training texts) and the loaded model are
# reused as-is, so the pipeline can predict without being re-fitted.
pipe = Pipeline([
    ('Tokenizer', Clean()),
    ('Vectorizer', vectorizer),
    ('Ridge Classifier', best_model),
])

In [None]:
# Persist the assembled pipeline to disk with joblib.
# NOTE: the vectorizer step references the `toke` function defined in this
# notebook; pickle stores functions by reference, so whatever process loads this
# file must expose a compatible `toke` under the same module path.
dump(pipe, 'thePipeline.joblib')

In [None]:
# Build a one-row DataFrame holding a sample sentence to smoke-test the pipeline.
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.
text = ['las mujeres son el futuro de la humanidad']
df = pd.DataFrame(text, columns=['Text'])

# Leave the frame as the cell's last expression so Jupyter renders it with its
# rich display instead of plain printed text.
df

In [None]:
# Shell escape: download spaCy's small Spanish model (es_core_news_sm).
# NOTE(review): presumably required by the text-cleaning step of the pipeline — confirm.
# `python` here is whichever interpreter is first on PATH, which may differ
# from the one running this kernel.
!python -m spacy download es_core_news_sm

In [None]:
# Run the full pipeline on the sample sentence(s) in the 'Text' column.
pipe.predict(df['Text'])

In [None]:
# Load another dataset from Excel ("SinEtiqueta" in the file name suggests it is
# unlabelled — TODO confirm).
# NOTE: this rebinds `df`, replacing the one-row sample frame built earlier.
df = pd.read_excel("../data/SinEtiquetatest_cat_345.xlsx")

In [None]:
# Run the full pipeline on the 'Textos_espanol' column of the loaded file.
pipe.predict(df['Textos_espanol'])