## Libraries

In [81]:
import pandas as pd
import numpy as np
import pickle

## Data

In [82]:
# Read the three task-Q splits (train / validation / test), each indexed by its "id" column.
Q_train, Q_val, Q_test = (
    pd.read_excel(xlsx_path, index_col="id")
    for xlsx_path in (
        "../../data/train_task_Q.xlsx",
        "../../data/val_task_Q.xlsx",
        "../../data/test_task_Q.xlsx",
    )
)

## Linguistic features

spaCy

In [83]:
%%capture
# Install/upgrade spaCy from within the notebook; %%capture hides the installer output.
!pip install -U spacy
import spacy
# Download the `es_core_news_md` model and load it as the notebook-wide `nlp` pipeline
# (used by the feature-extraction function defined in a later cell).
!python -m spacy download es_core_news_md
nlp = spacy.load('es_core_news_md')

In [84]:
def get_lf(df, text_col="Q", pipeline=None):
    """Count per-row token attributes for the texts in ``df[text_col]``.

    Every text is lower-cased and passed through the NLP pipeline. For each
    token six attributes are read (``lemma_``, ``tag_``, ``dep_``, ``shape_``,
    ``is_alpha``, ``is_stop``); each distinct ``"<attribute><&><value>"`` pair
    becomes one integer column counting how many tokens of a row carried that
    value. Columns appear in first-seen order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the texts. Rows are processed by position, so
        duplicate index labels are supported.
    text_col : str, default "Q"
        Name of the column holding the raw text.
    pipeline : callable, optional
        Callable mapping a string to an iterable of token objects exposing the
        six attributes listed above. When omitted, the notebook-level ``nlp``
        pipeline is used.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index values, index named ``"id"``)
        and one integer count column per observed attribute/value pair. Rows
        whose text is missing (NaN/None) contain only zeros.
    """
    if pipeline is None:
        # Resolved lazily so the function can also be used with an injected pipeline.
        pipeline = nlp

    attr_names = ("lemma", "tag", "dep", "shape", "is_alpha", "is_stop")
    n_rows = df.shape[0]
    counts = {}

    # Walk the column by position instead of `df.loc[label]`: a label lookup
    # returns several rows when index labels repeat, which breaks `.lower()`.
    for row_pos, raw_text in enumerate(df[text_col]):
        if pd.isna(raw_text):
            # No text -> no tokens; the row keeps zero counts everywhere.
            continue
        # str() guards against cells that were read as numbers rather than text.
        for token in pipeline(str(raw_text).lower()):
            token_values = (
                token.lemma_,
                token.tag_,
                token.dep_,
                token.shape_,
                token.is_alpha,
                token.is_stop,
            )
            for attr, value in zip(attr_names, token_values):
                feature = f"{attr}<&>{value}"
                if feature not in counts:
                    counts[feature] = np.zeros(n_rows, dtype=int)
                counts[feature][row_pos] += 1

    return pd.DataFrame(counts, index=pd.Index(df.index, name="id"))

In [85]:
%%time
# Token-attribute count matrix for the training questions (one row per question id).
lf_Q_train = get_lf(Q_train, text_col="Q")

Wall time: 1.77 s


In [86]:
%%time
# Token-attribute count matrix for the validation questions (one row per question id).
lf_Q_val = get_lf(Q_val, text_col="Q")

Wall time: 458 ms


In [87]:
%%time
# Token-attribute count matrix for the test questions (one row per question id).
lf_Q_test = get_lf(Q_test, text_col="Q")

Wall time: 1.76 s


In [88]:
# Feature-column names of each split, as sets, so the splits can be compared.
cols_train, cols_val, cols_test = (
    set(split_features.columns)
    for split_features in (lf_Q_train, lf_Q_val, lf_Q_test)
)

In [89]:
# Columns seen in validation / test but never in training.
drop_val = cols_val - cols_train
drop_test = cols_test - cols_train

In [90]:
# Training columns that are absent from validation / test.
add_val = cols_train - cols_val
add_test = cols_train - cols_test

In [91]:
# The training features define the reference column set and order.
# Note: this is an alias of `lf_Q_train`, not a copy.
olf_Q_train = lf_Q_train

In [92]:
# Align the validation features with the training feature space in one step:
# columns unknown to training are dropped, training columns never seen in
# validation are added as all-zero counts, and the training column order is kept.
# Unlike concatenating a list of zero columns, `reindex` also works when there
# is nothing to add (an empty list makes `pd.concat` raise) or when no
# validation column survives the drop.
olf_Q_val = lf_Q_val.reindex(columns=olf_Q_train.columns, fill_value=0)

In [93]:
# Align the test features with the training feature space in one step:
# columns unknown to training are dropped, training columns never seen in
# test are added as all-zero counts, and the training column order is kept.
# Unlike concatenating a list of zero columns, `reindex` also works when there
# is nothing to add (an empty list makes `pd.concat` raise) or when no
# test column survives the drop.
olf_Q_test = lf_Q_test.reindex(columns=olf_Q_train.columns, fill_value=0)

## Save features

In [94]:
# Write each aligned feature matrix to its own Excel workbook.
for feature_frame, xlsx_path in (
    (olf_Q_train, "../features/lf_features_train_task_Q.xlsx"),
    (olf_Q_val, "../features/lf_features_val_task_Q.xlsx"),
    (olf_Q_test, "../features/lf_features_test_task_Q.xlsx"),
):
    feature_frame.to_excel(xlsx_path)