In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [100]:
data = pd.read_csv('../data/tango_rock.csv')
data['texto'] = data['titulo'] + ' ' + data['letra']

In [101]:
ft = data[['texto','genero']]

In [102]:
# Lets do some cleaning of this text
def clean_it(text,normalize=True):
    # Replacing possible issues with data. We can add or reduce the replacemtent in this chain
    s = str(text).replace(',',' ').replace('"','').replace('\'','').replace('.',' . ').replace('(',' ( ').\
            replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').\
            replace('/','').replace('|','').replace('¿',' ¿ ').replace('¡',' ¡ ').lower()
    
    # normalizing / encoding the text
    if normalize:
        s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')
    
    return s

# Now lets define a small function where we can use above cleaning on datasets
def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__label__'):
    # Defining the new data
    df = data[['texto']].copy(deep=True)
    df['genero'] = label_prefix + data['genero'].astype(str) + ' '
    
    # cleaning it
    if cleanit:
        #df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))
        df['texto'] = df['texto'].apply(lambda x: clean_it(x,encodeit))
    
    # shuffling it
    if shuffleit:
        df.sample(frac=1).reset_index(drop=True)
            
    return df

In [103]:
%%time
# Transform the datasets using the above clean functions
ft_cleaned = clean_df(ft, True, True)
#ft_test_cleaned = clean_df(df_test, True, True)

CPU times: user 333 ms, sys: 23.9 ms, total: 356 ms
Wall time: 355 ms


In [104]:
def get_train_test_indexes(X=ft_cleaned['texto'],
                           y=ft_cleaned['genero'],
                           size = 0.33,
                           random_state=199,
                         strat=ft_cleaned['genero']):
    lista = train_test_split(X, y, test_size=size,
                                       random_state=random_state, stratify=strat)
    
    train_index = lista[0].index
    test_index = lista[1].index
    
    #return(pd.Series(train_index == lista[2].index))
    #return(pd.Series(test_index == lista[3].index))
    
    return(train_index.values, test_index.values)

In [105]:
train_ft = ft_cleaned.iloc[get_train_test_indexes()[0],:]
test_ft = ft_cleaned.iloc[get_train_test_indexes()[1],:]

In [106]:
train_ft['genero'].value_counts()

__label__rock      13583
__label__tango      3841
Name: genero, dtype: int64

In [107]:
test_ft['genero'].value_counts()

__label__rock      6691
__label__tango     1892
Name: genero, dtype: int64

In [108]:
train_ft

Unnamed: 0,texto,genero
19337,algo contigo hace falta que te diga que me mu...,__label__rock
10840,espejismo tus ojos nena se parecen a un espe...,__label__rock
3780,no me dejan salir estoy verde no me dejan sal...,__label__rock
2952,la reacción necesito estar un poco mas adentro...,__label__rock
12428,tan sólo un instante pensaba en los buenos mom...,__label__rock
...,...,...
7638,ahí voy de nuevo ahi voy de nuevo de nuevo sal...,__label__rock
22754,humo ronda gris fantasma de humo que me obsed...,__label__tango
18246,sobredosis de tv estoy desesperado soy tan ...,__label__rock
21237,casamiento hoy se realiza la cosa y ya se ha ...,__label__tango


In [118]:
# Write files to disk as fastText classifier API reads files from disk.
train_file = '../data/' + 'letras_train.csv'
train_ft.to_csv(train_file, header=None, index=False, columns=['genero', 'texto'])

test_file = '../data/' + '/letras_test.csv'
test_ft.to_csv(test_file, header=None, index=False, columns=['genero', 'texto'])

In [None]:
%%time
## Using fastText for feature extraction and training
from fasttext import train_supervised 
"""fastText expects and training file (csv), a model name as input arguments.
label_prefix refers to the prefix before label string in the dataset.
default is __label__. In our dataset, it is __class__. 
There are several other parameters which can be seen in: 
https://pypi.org/project/fasttext/
"""
model = train_supervised(input='../data/letras_train.csv',  lr=0.001, epoch=200, wordNgrams=5, 
                         dim=300, thread=4, verbose=2)

In [124]:
for k in range(1,6):
    results = model.test(test_file,k=k)
    print(f"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}")

Test Samples: 8583 Precision@1 : 98.6485 Recall@1 : 98.6485
Test Samples: 8583 Precision@2 : 50.0000 Recall@2 : 100.0000
Test Samples: 8583 Precision@3 : 50.0000 Recall@3 : 100.0000
Test Samples: 8583 Precision@4 : 50.0000 Recall@4 : 100.0000
Test Samples: 8583 Precision@5 : 50.0000 Recall@5 : 100.0000


In [125]:
model.test(test_file)

(8583, 0.9864849120354189, 0.9864849120354189)