In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#Classificadores
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#Otimização 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer



from glob import glob
import csv
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [None]:
def read_data(path, sep):
    """Load a delimited text file and discard pandas' auto-named columns.

    Parameters
    ----------
    path : str or file-like
        Forwarded unchanged as the first argument of ``pd.read_csv``.
    sep : str
        Field delimiter forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame without any column whose name starts with
        ``Unnamed``.
    """
    frame = pd.read_csv(path, sep=sep)
    # Boolean mask over the column names: True where the name begins with "Unnamed".
    is_unnamed = frame.columns.str.contains('^Unnamed')
    kept_columns = ~is_unnamed
    return frame.loc[:, kept_columns]

def classesRetrieval(dataframe,labels):
    """Attach ``labels`` to ``dataframe`` as its ``label`` column.

    The frame is modified in place (an existing ``label`` column is
    overwritten) and the very same object is returned.
    """
    column = 'label'
    dataframe[column] = labels
    return dataframe

def drop(df,label):
    """Remove the column(s) named by ``label`` from ``df`` in place.

    Nothing is returned; the caller's frame itself loses the column(s).
    """
    df.drop(columns=label, inplace=True)

def encoder(data,labelfield):
    """Encode the class column of ``data`` as a numeric vector.

    Parameters
    ----------
    data : mapping or DataFrame indexable by ``labelfield``
    labelfield : key of the column holding the class labels

    Returns
    -------
    tuple
        ``(y_encoded, classes)``: the output of ``LabelEncoder.fit_transform``
        on that column, and the fitted encoder's ``classes_`` attribute.
    """
    label_encoder = LabelEncoder()
    codes = label_encoder.fit_transform(data[labelfield])
    return codes, label_encoder.classes_

In [None]:
def two_places(value):
    """Round ``value`` to two decimals by formatting it and parsing it back.

    Returns a plain ``float`` regardless of the numeric type passed in.
    """
    text = "{:.2f}".format(value)
    return float(text)

def write(row):
    """Append ``row`` as one comma-separated line to ``./results_ml_pop.txt``.

    Every element is converted with ``str`` and the pieces are joined with
    ``,``. No CSV quoting or escaping is applied, so an element that itself
    contains a comma produces extra fields on that line.

    Parameters
    ----------
    row : iterable
        Values for a single output line.
    """
    line = ','.join(str(cell) for cell in row) + "\n"
    # The context manager closes the handle even when the write itself fails,
    # which the previous open()/close() pair did not guarantee.
    with open("./results_ml_pop.txt", "a") as output:  # append mode
        output.write(line)

def write_header(target_names):
    """Write the column-name line of the results file.

    The header starts with the fixed run-level columns, then carries four
    columns per entry of ``target_names`` (precision, recall, f1 score and
    sample count, in that order) and finishes with ``total_samples``.
    The assembled list is handed to ``write``.
    """
    header = ['model','params','representation','accuracy','f1_macro','f1_weighted']

    per_class_templates = ('{}_precision', '{}_recall', '{}_f1_score', '{}_samples')
    header.extend(
        template.format(target)
        for target in target_names
        for template in per_class_templates
    )

    header.append('total_samples')
    write(header)
    
def compile_results(nn,nn_params,rep,y_test, y_pred, target_names):
    """Build one results line for a finished run and append it via ``write``.

    Parameters
    ----------
    nn : identifier of the model that produced ``y_pred``
    nn_params : description of the parameters used by that model
    rep : name of the feature representation
    y_test : true labels
    y_pred : predicted labels
    target_names : class names, passed to ``classification_report`` and used
        as keys into the report it returns

    The column order mirrors the header produced by ``write_header``:
    run-level fields, overall accuracy / macro F1 / weighted F1, four
    per-class fields for each target, then the total sample count.
    """
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)

    # Run-level identification followed by the aggregate scores.
    results_row = [
        nn,
        nn_params,
        rep,
        two_places(report['accuracy']),
        two_places(report['macro avg']['f1-score']),
        two_places(report['weighted avg']['f1-score']),
    ]

    for target in target_names:
        per_class = report[target]
        results_row.extend([
            two_places(per_class['precision']),
            two_places(per_class['recall']),
            two_places(per_class['f1-score']),
            per_class['support'],  # sample count is written unrounded
        ])

    # The support of the macro average is used as the total sample count.
    results_row.append(report['macro avg']['support'])

    write(results_row)

In [None]:
# One CSV per feature representation.
bag = read_data('../input/pop-rock-subgenres/pop_mar21_bow.csv', sep=",")
pos = read_data('../input/pop-rock-subgenres/pop_mar21_pos.csv', sep=",")
cet = read_data('../input/pop-rock-subgenres/pop_mar21_cet.csv', sep=",")

# Encode the class labels (taken from the first frame only) and keep the class names.
labels, classes = encoder(bag,'label')

# Remove the label column from every frame so the representations can be combined.
for representation_frame in (bag, pos, cet):
    drop(representation_frame,'label')

In [None]:
# Single-representation feature sets.
# Copies are taken on purpose: a later cell attaches a `label` column to each
# of these frames in place. With plain aliases that column would also land in
# `bag`, `pos` and `cet`, and re-running this cell would then concatenate
# several `label` columns into the combined frames below.
c1 = bag.copy()
c2 = pos.copy()
c3 = cet.copy()

# Column-wise combinations of the representations (rows matched on the index).
c4 = pd.concat([bag, pos, cet], axis=1, join='inner')
c5 = pd.concat([bag, pos], axis=1, join='inner')
c6 = pd.concat([bag, cet], axis=1, join='inner')
c7 = pd.concat([pos, cet], axis=1, join='inner')

In [None]:
# Display name of each representation -> its feature frame.
features = {
    "BoW": c1,
    "PoS": c2,
    "CET": c3,
    "BoW|PoS|CET": c4,
    "BoW|PoS": c5,
    "BoW|CET": c6,
    "PoS|CET": c7,
}

# Put the encoded labels back on every frame (each frame is modified in place).
for feature_frame in features.values():
    classesRetrieval(feature_frame,labels)

representations_files = features

In [None]:
def n_clf(clf):
    """Instantiate a fresh, unfitted classifier for the short key ``clf``.

    Parameters
    ----------
    clf : str
        One of ``'knn'``, ``'gnb'``, ``'mnb'``, ``'bnb'``, ``'cnb'``,
        ``'lr'``, ``'svm'``, ``'dt'`` or ``'rf'``.

    Returns
    -------
    A new estimator instance built with its default arguments (the decision
    tree is created with ``max_depth=None``).

    Raises
    ------
    ValueError
        If ``clf`` is not a known key. Previously an unknown key fell through
        every branch and returned ``None``, which only surfaced later as an
        unrelated-looking failure inside the grid search.
    """
    # Factories are wrapped in lambdas so a new object is built on every call
    # and only the requested class is touched.
    factories = {
        'knn': lambda: KNeighborsClassifier(),
        'gnb': lambda: GaussianNB(),
        'mnb': lambda: MultinomialNB(),
        'bnb': lambda: BernoulliNB(),
        'cnb': lambda: ComplementNB(),
        'lr': lambda: LogisticRegression(),
        'svm': lambda: SVC(),
        'dt': lambda: DecisionTreeClassifier(max_depth=None),
        'rf': lambda: RandomForestClassifier(),
    }

    try:
        factory = factories[clf]
    except KeyError:
        raise ValueError(
            "Unknown classifier key {!r}; expected one of {}".format(clf, sorted(factories))
        ) from None

    return factory()

# Hyper-parameter grids for the grid search, keyed by the same short names
# that n_clf understands. An empty dict means "evaluate the defaults only".

parameters = {}
parameters['knn'] = {'n_neighbors' : [3,5,7,9,11,13,15,17,19,21], 'weights':['uniform','distance'], 'metric' : ['euclidean','manhattan','cosine']}
parameters['gnb'] = {}
parameters['bnb'] = {}
parameters['mnb'] = {}
parameters['cnb'] = {}
# n_clf builds LogisticRegression() with its default solver (lbfgs), which
# rejects penalty="l1". With a single grid every l1 candidate failed to fit
# and was silently scored as NaN, so only l2 was ever really searched.
# The grid is therefore split in two: l2 keeps the default solver, l1 is
# routed to "saga", which supports it.
parameters['lr'] = [
    {"C": np.logspace(-3,3,7), "penalty": ["l2"]},
    {"C": np.logspace(-3,3,7), "penalty": ["l1"], "solver": ["saga"]},
]
parameters['svm'] = dict(C=[1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5], kernel=['sigmoid','rbf','linear','poly'], max_iter=[25000])
parameters['dt'] = dict(criterion=['gini', 'entropy'])
parameters['rf'] = dict(n_estimators=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], min_samples_split=[2, 3, 4, 5, 6, 7, 8, 9, 10])

# Metric optimised by the grid search.
score = 'accuracy'

# Classifiers included in the run. 'knn' has a grid above but is currently left out.
classifiers_list = ['gnb','mnb','bnb','cnb','lr','svm','dt','rf']

In [None]:
# Run every classifier on every feature representation and append one results
# line per (classifier, representation) pair to the results file.
write_header(classes)

for classifier in classifiers_list:
    for representation in representations_files:
        print("Running {} classification with {}".format(classifier,representation))

        frame = representations_files[representation]
        y = frame.label
        X = frame.drop(labels='label', axis=1)

        # Split BEFORE scaling so the scaler never sees the held-out rows.
        # (Fitting it on the full data, as was done before, leaks test-set
        # statistics into training.)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

        # Scaling: the multinomial/complement NB models get Normalizer,
        # presumably because they need non-negative inputs (TODO confirm);
        # every other model gets a quantile transform to a normal distribution.
        if classifier in ['mnb','cnb']:
            scaler = Normalizer()
        else:
            scaler = QuantileTransformer(output_distribution='normal')

        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # `parameters[classifier]` is the grid for this model. The previous
        # code referenced an undefined name `params`, which raises NameError
        # on a fresh kernel.
        search = GridSearchCV(
            n_clf(classifier), parameters[classifier], scoring=score,
            cv=StratifiedKFold(n_splits=10), n_jobs=-1)

        search.fit(X_train, y_train)

        # Store the best parameters; commas are replaced because the results
        # file is comma-separated.
        best_params = str(search.best_params_).replace(',','|').replace("'",'')

        # With the default refit=True the best estimator has already been
        # retrained on the whole training split, so no extra fit is needed.
        clf = search.best_estimator_

        y_true, y_pred = y_test, clf.predict(X_test)

        compile_results(classifier,best_params,representation,y_true, y_pred, classes)

In [None]:
# Read back the results file produced by the experiment loop.
# `sep` must be given by keyword: since pandas 2.0 every read_csv argument
# after the path is keyword-only, so the positional form raises TypeError.
data = pd.read_csv('./results_ml_pop.txt', sep=',')
data.head()