In [1]:
import csv
import os
import unicodedata
from collections import defaultdict
from multiprocessing import Pool

import fasttext
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from spacy.lang.es import Spanish
from spacy.tokenizer import Tokenizer

In [2]:
# Load the raw data: the training file is pipe-separated, the test file uses
# commas (read_csv's default separator).
data_train = pd.read_csv(
    './train.csv',
    sep='|',
)
data_test = pd.read_csv('./test_santander.csv')

In [3]:
# Drop every training row whose intent is 'Cat_104'.
# NOTE(review): presumably this class is too small for the stratified split
# done later — confirm.
data_train = data_train.loc[data_train["Intencion"] != 'Cat_104']

In [4]:
def normalize_text(text, nlp):
    """Lower-case, tokenize and clean a question for fastText.

    Stop words, single-character tokens and tokens that are not purely
    alphabetic are dropped; accents are stripped from the remaining non-ASCII
    tokens.

    Args:
        text: Raw question. Values that are not strings (e.g. ``None`` or the
            ``NaN`` pandas uses for missing cells) are treated as empty.
        nlp: Object exposing a spaCy-style ``tokenizer`` callable whose tokens
            provide ``text``, ``is_stop``, ``is_alpha``, ``is_digit`` and
            ``is_ascii``.

    Returns:
        The kept tokens joined by single spaces, or the placeholder
        ``"emptystring"`` when nothing survives the filtering.
    """
    # Missing cells arrive as float NaN / None and have no .lower(); map them
    # to the same placeholder used for questions without usable tokens.
    if not isinstance(text, str):
        return "emptystring"

    kept = []
    for tok in nlp.tokenizer(text.lower()):
        if tok.is_stop or not tok.is_alpha or tok.is_digit or len(tok.text) == 1:
            continue
        if tok.is_ascii:
            kept.append(tok.text)
        else:
            # Decompose accented characters and drop the combining marks
            # (category 'Mn'), e.g. "canción" -> "cancion".
            decomposed = unicodedata.normalize('NFD', tok.text.lower())
            kept.append(''.join(c for c in decomposed if unicodedata.category(c) != 'Mn'))

    return ' '.join(kept) if kept else "emptystring"

In [5]:
# Module-level Spanish language object. In this notebook only the
# commented-out scratch cell below refers to it; preprocess() and
# preprocess_test() create their own instances.
nlp_es = Spanish()

In [6]:
# Scratch cell (disabled): print the token flags used by normalize_text for a few sample tokens.
#tk = nlp_es.tokenizer('para, 1983, de con 18te ')
#for t in tk:
#    print("Text:", t.text)
#    print("Is Stop: ", t.is_stop)
#    print("Is Ascii: ", t.is_ascii)
#    print("Is Alpha: ", t.is_alpha)
#    print("Is Digit: ", t.is_digit)

In [7]:
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df``, using worker processes.

    Args:
        df: DataFrame to process.
        func: Callable that takes a DataFrame chunk and returns a DataFrame.
            It must be picklable whenever more than one chunk is processed.
        n_cores: Maximum number of worker processes (and chunks).

    Returns:
        The per-chunk results concatenated in the original row order.
    """
    # Never create more chunks than rows (and always at least one), so no
    # worker is started just to process an empty frame.
    n_chunks = max(1, min(int(n_cores), len(df)))

    # Split the row positions and slice with iloc instead of handing the
    # DataFrame itself to NumPy.
    row_groups = np.array_split(np.arange(len(df)), n_chunks)
    chunks = [df.iloc[rows] for rows in row_groups]

    if n_chunks == 1:
        # Nothing to parallelize: run in-process. A copy is passed so that
        # func cannot modify the caller's frame, just as it cannot when the
        # chunks are pickled to worker processes.
        return pd.concat([func(chunks[0].copy())])

    # The context manager tears the pool down even when func raises.
    with Pool(n_chunks) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

In [8]:
def preprocess(df):
    """Build the fastText columns for a chunk of the labelled data.

    Args:
        df: DataFrame with a ``Pregunta`` column (text passed to
            ``normalize_text``) and an ``Intencion`` column (string labels).

    Returns:
        A new DataFrame, indexed like ``df``, with the columns ``label``
        (``Intencion`` prefixed with ``__label__``) and ``tokens`` (the
        normalized ``Pregunta``). ``df`` itself is not modified.
    """
    # Created per call rather than shared — presumably so that every worker
    # process builds its own instance.
    nlp_es = Spanish()
    tokens = df["Pregunta"].apply(normalize_text, args=(nlp_es,))
    labels = df["Intencion"].apply(lambda intent: '__label__' + intent)
    # Assemble a fresh frame instead of adding columns to the caller's one.
    return pd.DataFrame({"label": labels, "tokens": tokens})

In [9]:
def preprocess_test(df):
    """Build the prediction input for a chunk of the unlabelled test data.

    Args:
        df: DataFrame with an ``id`` column and a ``Pregunta`` column (text
            passed to ``normalize_text``).

    Returns:
        A new DataFrame, indexed like ``df``, with the columns ``id`` and
        ``tokens`` (the normalized ``Pregunta``). ``df`` itself is not
        modified.
    """
    # Created per call rather than shared — presumably so that every worker
    # process builds its own instance.
    nlp_es = Spanish()
    tokens = df["Pregunta"].apply(normalize_text, args=(nlp_es,))
    # Assemble a fresh frame instead of adding a column to the caller's one.
    return pd.DataFrame({"id": df["id"], "tokens": tokens})

In [10]:
def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it is missing."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)


def _write_pairs(path, first, second, sep):
    """Write one ``<first><sep><second>`` line per pair to ``path`` (UTF-8)."""
    _ensure_parent_dir(path)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{a}{sep}{b}\n" for a, b in zip(first, second))


def create_fasttext_split_files(train_df, test_df, outputfiles):
    """Preprocess the data and write the train / validation / test files.

    The labelled data is preprocessed and split 80/20 (stratified by label,
    ``random_state=42``).

    Args:
        train_df: Labelled DataFrame accepted by ``preprocess``.
        test_df: Unlabelled DataFrame accepted by ``preprocess_test``, or
            ``None`` to skip the test file.
        outputfiles: Sequence of output paths ``[train, validation, test]``;
            the third entry is only used when ``test_df`` is given.

    Files written:
        * train: ``<label> <tokens>`` per line (fastText supervised format).
        * validation: ``<label>|<tokens>`` per line.
        * test: headerless CSV rows ``id,tokens`` terminated by ``\\n``.
    """
    # train and validation set files
    train = parallelize_dataframe(train_df, preprocess)
    X_train, X_val, y_train, y_val = train_test_split(
        train["tokens"], train["label"],
        test_size=0.2, random_state=42, stratify=train["label"],
    )
    # The lines are written directly rather than through to_csv, which needed
    # an empty quotechar and a space as escapechar to emit unquoted text.
    _write_pairs(outputfiles[0], y_train, X_train, sep=' ')
    _write_pairs(outputfiles[1], y_val, X_val, sep='|')

    # test set file
    if test_df is not None:
        test = parallelize_dataframe(test_df, preprocess_test)
        _ensure_parent_dir(outputfiles[2])
        # csv.writer with an explicit terminator avoids to_csv's
        # line-terminator keyword, whose name differs between pandas versions.
        with open(outputfiles[2], "w", encoding="utf-8", newline="") as handle:
            csv.writer(handle, lineterminator="\n").writerows(
                test[["id", "tokens"]].itertuples(index=False, name=None)
            )

In [11]:
%%time 
create_fasttext_split_files(data_train, data_test, ['./fastextData/train.csv','./fastextData/val.csv','./fastextData/test.csv'])

CPU times: user 164 ms, sys: 120 ms, total: 284 ms
Wall time: 1.05 s


In [14]:
%%time 
model = fasttext.train_supervised(input="./fastextData/train.csv",autotune-validation='./fastextData/val.csv',
                                  epoch=300, lr=0.1, wordNgrams=1, dim=300,
                                  thread=8)

SyntaxError: keyword can't be an expression (<unknown>, line 1)

In [None]:
# Read the validation split back ("label|tokens" per line, no header).
# keep_default_na=False keeps tokens such as "nan" or "null" as text instead
# of turning them into missing values, which could not be passed to the model.
val_data = pd.read_csv('./fastextData/val.csv', sep='|', names=['label', 'tokens'],
                       dtype=str, keep_default_na=False)

In [None]:
# Preview the first validation rows (label, tokens).
val_data.head()

In [None]:
# Top-1 prediction for every validation question.
val_preds = model.predict(val_data["tokens"].tolist(), k=1)

In [None]:
# val_preds[0] holds one list of predicted labels per question; predictions
# were requested with k=1, so take that single label directly instead of
# coercing the whole (labels, probabilities) result into one array.
# (balanced_accuracy_score is already imported in the first cell.)
balanced_accuracy_score(val_data.label, [labels[0] for labels in val_preds[0]])

In [None]:
# Persist the trained model.
# NOTE(review): the prediction section loads "./models/fasttext_baseline.bin",
# not this file — confirm which model is meant to be used.
model.save_model("./models/fasttext_model.bin")

# Predict

In [None]:
# Load the model used for the test predictions.
# NOTE(review): this path differs from the one the training section saves to
# ("./models/fasttext_model.bin") — confirm this file exists and is the
# intended model.
model = fasttext.load_model("./models/fasttext_baseline.bin")

In [None]:
# test.csv holds headerless "id,tokens" rows. Name both columns and use the id
# as index explicitly rather than relying on pandas to infer the index from a
# shorter `names` list. keep_default_na=False keeps tokens such as "nan" or
# "null" as text instead of turning them into missing values.
test_data = pd.read_csv('./fastextData/test.csv', names=['id', 'tokens'], index_col='id',
                        dtype={'tokens': str}, keep_default_na=False)

In [None]:
# Preview the first test rows.
test_data.head()

In [None]:
%time 
predictions = model.predict(test_data["tokens"].values.tolist())

In [None]:
# predictions[0] holds the predicted labels per row; keep the top label and
# strip its leading '__label__Cat_' so only the part after it remains.
predictions = pd.Series(
    [labels[0][len('__label__Cat_'):] for labels in predictions[0]]
)
# The ids were read into the index of test_data.
submission = pd.DataFrame({
    'id': test_data.index.values,
    'pred': predictions,
})

In [None]:
# Preview the first submission rows (id, pred).
submission.head()

In [None]:
# Write the submission as CSV rows of `id,pred`, without header or index column.
submission.to_csv("./submissions/fastext_baseline_subm.csv",header=False, index=False)