In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# All intermediate files live in one directory; keep the prefix in a single place
# so the three paths cannot drift apart (the test path used to contain '//').
DATA_DIR = '../data/'

total_file = DATA_DIR + 'letras_total_ft.csv'
train_file = DATA_DIR + 'letras_train.csv'
test_file = DATA_DIR + 'letras_test.csv'

In [None]:
# Load the raw table and add a 'texto' column: the title, one space, then the lyrics.
data = pd.read_csv('../data/tango_rock.csv')
data = data.assign(texto=data['titulo'] + ' ' + data['letra'])

In [None]:
# Keep only the two columns used downstream: the combined text and the genre.
ft = data.loc[:, ['texto', 'genero']]

In [None]:
# Let's do some cleaning of this text
# Per-character rewrite rules used by clean_it. Extend or trim this table to
# change how punctuation is handled.
_CLEAN_TABLE = str.maketrans({
    # separators that become a single space
    ',': ' ', ':': ' ', ';': ' ',
    # characters that are dropped
    '"': '', "'": '', '/': '', '|': '',
    # punctuation that is kept, padded with spaces so it becomes its own token
    '.': ' . ', '(': ' ( ', ')': ' ) ', '!': ' ! ', '?': ' ? ',
    '¿': ' ¿ ', '¡': ' ¡ ',
})


def clean_it(text,normalize=True):
    """Return a lower-cased, punctuation-normalised copy of ``text``.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        inputs are cleaned as their string representation.
    normalize : bool, default True
        When true, the result is NFKD-decomposed and every non-ASCII code
        point is dropped, so accented letters lose their accent
        ('canción' -> 'cancion') and characters with no ASCII form
        (e.g. '¿', '¡') disappear.

    Returns
    -------
    str
        The cleaned text.
    """
    import unicodedata

    s = str(text).translate(_CLEAN_TABLE).lower()

    if normalize:
        # A plain str has no .normalize()/.str accessor (that is pandas Series
        # syntax); unicodedata provides the same NFKD decomposition for one string.
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')

    return s

# Now let's define a small function that applies the cleaning above to a whole dataset
def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__label__'):
    """Build a two-column frame ('texto', 'genero') with prefixed labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'texto' and 'genero'. It is not modified.
    cleanit : bool, default False
        When true, every 'texto' value is passed through ``clean_it``.
    shuffleit : bool, default False
        When true, the rows are returned in random order with a fresh
        0..n-1 index.
    encodeit : bool, default False
        Forwarded to ``clean_it`` as its ``normalize`` argument; only used
        when ``cleanit`` is true.
    label_prefix : str, default '__label__'
        Text prepended to each 'genero' value.

    Returns
    -------
    pandas.DataFrame
        Columns 'texto' and 'genero', where 'genero' is
        ``label_prefix + <genre> + ' '``.
    """
    df = data[['texto']].copy(deep=True)
    df['genero'] = label_prefix + data['genero'].astype(str) + ' '

    if cleanit:
        df['texto'] = df['texto'].apply(lambda x: clean_it(x, encodeit))

    if shuffleit:
        # sample() returns a new frame; the result has to be kept, otherwise the
        # shuffle is silently lost.
        df = df.sample(frac=1).reset_index(drop=True)

    return df

In [None]:
# Sentinel meaning "stratify on y". A dedicated object is used so that callers
# can still pass strat=None explicitly to disable stratification.
_STRATIFY_ON_Y = object()


def get_train_test_indexes(X=None,
                           y=None,
                           size = 0.33,
                           random_state=777,
                           strat=_STRATIFY_ON_Y):
    """Split ``X``/``y`` and return the index labels of each partition.

    Parameters
    ----------
    X : pandas.Series or pandas.DataFrame, optional
        Features to split. Defaults to ``ft_cleaned['texto']``, looked up when
        the function is called (not when it is defined).
    y : pandas.Series, optional
        Targets to split. Defaults to ``ft_cleaned['genero']``, looked up at
        call time.
    size : float, default 0.33
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 777
        Passed to ``train_test_split`` so the split is reproducible.
    strat : array-like or None, optional
        Passed to ``train_test_split`` as ``stratify``. When omitted, ``y`` is
        used; pass ``None`` for a non-stratified split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_index, test_index)``: the index *labels* of ``X`` that fell
        into the train and test partitions. These are labels, not positions.
    """
    # Resolve defaults lazily: binding them in the signature would freeze whatever
    # ft_cleaned was when this cell ran, and fail outright if it was not defined yet.
    if X is None:
        X = ft_cleaned['texto']
    if y is None:
        y = ft_cleaned['genero']
    if strat is _STRATIFY_ON_Y:
        strat = y

    # train_test_split returns X_train, X_test, y_train, y_test; only the X
    # partitions are needed to recover the row labels.
    X_train, X_test, _, _ = train_test_split(X, y, test_size=size,
                                             random_state=random_state, stratify=strat)

    return(X_train.index.values, X_test.index.values)

In [None]:
# Compute the split once. The helper returns index *labels*, so rows are selected
# with .loc; .iloc would reinterpret those labels as positions.
train_index, test_index = get_train_test_indexes()
train_ft = ft_cleaned.loc[train_index, :]
test_ft = ft_cleaned.loc[test_index, :]

In [None]:
# Write files to disk as fastText classifier API reads files from disk.
# fastText tokenises on whitespace and expects one example per line in the form
# "<label> <text>". DataFrame.to_csv would put a comma between the two columns and
# may wrap the text in quotes, both of which end up glued to the first word.
def _write_fasttext_file(frame, path):
    """Write ``frame`` to ``path`` as one '<genero> <texto>' line per row."""
    labels = frame['genero'].astype(str).str.strip()
    # Collapse every whitespace run (including embedded newlines) into one space
    # so that each example stays on a single line.
    texts = frame['texto'].astype(str).str.split().str.join(' ')
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(label + ' ' + text + '\n' for label, text in zip(labels, texts))


_write_fasttext_file(train_ft, train_file)

_write_fasttext_file(test_ft, test_file)

In [4]:
from fasttext import  train_unsupervised, train_supervised

In [5]:
## Using fastText for feature extraction and training
"""Train unsupervised fastText word vectors from the text file at total_file.

train_unsupervised reads its training text from disk. The keyword arguments
used below, and the other available ones, are described in:
https://pypi.org/project/fasttext/
"""
model = train_unsupervised(
    input=total_file,
    minn=2,
    maxn=4,
    dim=300,
    lr=0.025,
    epoch=300,
    wordNgrams=1,
    thread=16,
    verbose=100,
)

In [6]:
# Probe words, grouped by theme, used to query the embedding's nearest neighbours.
key_words = {
    "sentidos": [
        "sentido", "sensación", "sentir", "suave", "duro", "frío",
        "caliente", "oler", "asqueroso", "gusto", "dulce", "amargo",
        "oír", "sonar", "silencio", "fuerte", "ver", "mirar",
        "vislumbrar", "ligero", "oscuro", "brillante",
    ],
    "creencia": [
        "espíritu", "imaginar", "sabiduría", "sabio", "corazonada", "mente",
        "sospecha", "creer", "pensar", "confiar", "fe", "verdad",
        "creencia", "duda", "esperanza", "miedo", "vida", "alma",
        "cielo", "eterno", "mortal", "santo", "dios", "orar",
        "sobrenatural", "misterio", "sabio",
    ],
    "cuerpo": [
        "cabeza", "nariz", "boca", "oreja", "cabello", "hombro",
        "pecho", "vientre", "pierna", "mano", "brazo", "dedo",
    ],
    "pronombres": [
        "yo", "nosotros", "tú", "él", "ella",
        "ellos", "mío", "tu", "nuestro", "suyo",
    ],
    "actividades": [
        "caminar", "dormir", "gritar", "sentarse",
        "reír", "comer", "beber", "sonreír",
    ],
}


In [11]:
# One {pronoun: nearest-neighbour list} mapping per entry of key_words["pronombres"].
[
    {pronoun: model.get_nearest_neighbors(pronoun)}
    for pronoun in key_words["pronombres"]
]

[{'yo': [(0.6709863543510437, 'que'),
   (0.6623286604881287, 'me'),
   (0.6494157910346985, 'pero'),
   (0.6473472714424133, 'y'),
   (0.6452637910842896, 'no'),
   (0.6270315051078796, 'mi'),
   (0.6248949766159058, 'te'),
   (0.6119467616081238, 'soy'),
   (0.594132661819458, 'lo'),
   (0.5923551917076111, 'hoyyo')]},
 {'nosotros': [(0.9373599886894226, ',nosotros'),
   (0.8846564292907715, '“nosotros'),
   (0.7834941148757935, 'vosotros'),
   (0.47439202666282654, 'potros'),
   (0.470106840133667, 'lejísimo'),
   (0.41293758153915405, 'wait`s'),
   (0.4056229293346405, 'otros'),
   (0.4041799008846283, 'aramos'),
   (0.38781091570854187, 'nos'),
   (0.3806556165218353, 'robamos')]},
 {'tú': [(0.5542171001434326, 'túeso'),
   (0.4421008825302124, 'oyelé'),
   (0.40519675612449646, 'ámalo'),
   (0.4003676772117615, 'pídeme'),
   (0.38878604769706726, 'heaven'),
   (0.38592860102653503, 'túyo'),
   (0.38236942887306213, 'electrográfico'),
   (0.3804987967014313, 'gurú'),
   (0.3774909

In [None]:
# Ask for the 100 nearest neighbours of 'ella' (the second argument is the count k).
model.get_nearest_neighbors('ella', k=100)