In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import itertools
import glob
import pickle
import matplotlib.pyplot as plt
import qmin

In [None]:
def getTrainData(path_files):
    """Load every training CSV under a path prefix and build the feature table.

    The original note on this loader describes the files as balanced mineral data.

    Parameters
    ----------
    path_files : str
        Prefix that is concatenated with ``"*.csv"`` to build the glob pattern,
        so a directory has to be passed with its trailing separator
        (e.g. ``'../data_train/'``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df, df_all)``. ``df_all`` is the row-wise concatenation of all
        matched files with a fresh integer index; ``df`` is the same data
        without the identifier / label columns dropped below.

    Raises
    ------
    ValueError
        If no CSV file matches the pattern.
    KeyError
        Raised by pandas if one of the columns to drop is absent.
    """
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # row order of the concatenated frame stable from run to run.
    input_files = sorted(glob.glob(path_files + "*.csv"))
    if not input_files:
        # Same exception type pd.concat would raise on an empty list, but with
        # a message that points at the actual problem.
        raise ValueError("No CSV files found matching " + path_files + "*.csv")

    frames = [
        pd.read_csv(filename, index_col=None, header=0, encoding="ISO-8859-1")
        for filename in input_files
    ]
    df_all = pd.concat(frames, axis=0, ignore_index=True)

    df = df_all.drop(columns=['Unnamed: 0', 'X1', 'id', 'SAMPLE', 'GROUP', 'MINERAL', 'ROCK', 'X1_1'])
    return df, df_all

    
def test_acc(X,y, model,plot=True,modelName='rf'):
    """Print forest statistics and feature importances, then score `model` on (X, y).

    Parameters
    ----------
    X : object with a ``columns`` attribute (e.g. a DataFrame)
        Samples to predict; the column names label the feature importances.
    y : array-like
        True labels for ``X``.
    model : fitted ensemble
        Must expose ``estimators_`` (each with a ``tree_``),
        ``feature_importances_`` and ``predict``.
    plot : bool, optional
        When true, draw the normalized confusion matrix through
        ``qmin.plot_confusion_matrix`` and print the classification report.
    modelName : str, optional
        For any value other than ``'rf'`` an ``output`` path under
        ``../figures/`` is forwarded to the plotting helper
        (presumably where the figure is saved -- confirm in qmin).

    Returns
    -------
    The value of ``accuracy_score(y, model.predict(X))``, which is also printed.
    """
    # Size statistics of the individual trees in the ensemble
    trees = [estimator.tree_ for estimator in model.estimators_]
    mean_nodes = np.mean([tree.node_count for tree in trees])
    mean_depth = np.mean([tree.max_depth for tree in trees])
    print(f'Average number of nodes {int(mean_nodes)}')
    print(f'Average maximum depth {int(mean_depth)}')

    # (column name, importance rounded to 2 decimals), most important first
    ranked = sorted(
        ((name, round(score, 2)) for name, score in zip(X.columns, model.feature_importances_)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name, score in ranked:
        print('Variable: {:20} Importance: {}'.format(name, score))

    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)

    if plot:
        plot_kwargs = dict(classes=np.unique(y), title='Confusion Matrix', normalize=True)
        if modelName != 'rf':
            # Only non-default model names get an explicit output path
            plot_kwargs['output'] = '../figures/rf'+modelName+'_confusion_matrix.png'
        qmin.plot_confusion_matrix(cm, **plot_kwargs)
        print(classification_report(y,y_pred))

    accuracy = accuracy_score(y, y_pred)
    print(accuracy)
    return accuracy

def save_Model(model, model_name, path='../model_py/'):
    """Pickle `model` into a file and print where it was written.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted classifier).
    model_name : str
        File name of the pickle, including its extension.
    path : str, optional
        Directory the file is written to. Defaults to ``'../model_py/'``;
        the directory must already exist.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing.
    """
    import os  # local import: the notebook's import cell does not bring in os

    target = os.path.join(path, model_name)
    # The context manager flushes and closes the handle even if pickling fails;
    # a bare open() here would leave the file open until garbage collection.
    with open(target, 'wb') as handle:
        pickle.dump(model, handle)
    print("Model Saved ...\n"+target)
    
def randomForestBuilder(trainData,labels, acc_test=True, saveModel=True, nameModel='RF',
                        random_state=None):
    """Fit a 50-tree random forest on a stratified 70/30 split and score it.

    Parameters
    ----------
    trainData : DataFrame
        Feature table (needs ``columns`` for the importance report).
    labels : array-like
        Class label per row of ``trainData``; also used to stratify the split.
    acc_test : bool, optional
        When true, the hold-out evaluation also plots the confusion matrix and
        prints the classification report; when false only the statistics and
        the accuracy are printed.
    saveModel : bool, optional
        When true, pickle the fitted model as ``nameModel + '.pkl'``.
    nameModel : str, optional
        Name used for the pickle file and forwarded to ``test_acc``.
    random_state : int or None, optional
        Seed forwarded to both the split and the forest. ``None`` (default)
        keeps the previous non-deterministic behaviour.

    Returns
    -------
    tuple
        ``(model, accuracy)`` where accuracy is measured on the 30% hold-out.
    """
    # Hold out 30% for testing, train on the remaining 70%
    train, test, train_labels, test_labels = train_test_split(
        trainData, labels,
        stratify=labels,
        test_size=0.3,
        random_state=random_state,
    )
    model = RandomForestClassifier(n_estimators=50,
                                   max_features='sqrt',
                                   n_jobs=-1, verbose=1,
                                   oob_score=True,
                                   random_state=random_state)

    # Fit on training data
    model.fit(train, train_labels)

    # Evaluate exactly once: previously the hold-out set was scored twice when
    # acc_test was true, printing every statistic a second time.
    acc = test_acc(test, test_labels, model, plot=acc_test, modelName=nameModel)

    if saveModel:
        save_Model(model, nameModel+'.pkl')

    return model, acc

def _cleanDataFrame(df):
    """Return a new frame without the non-feature columns listed below.

    'MINERAL' is not in the list and is therefore left in place. pandas raises
    KeyError if any listed column is missing from `df`.
    """
    non_feature_columns = ['Unnamed: 0', 'X1', 'id', 'SAMPLE', 'GROUP', 'ROCK', 'X1_1']
    return df.drop(non_feature_columns, axis='columns')

def createMineralModel(data,namegroup):
    """Train a mineral-level random forest on the rows of a single group.

    Parameters
    ----------
    data : DataFrame
        Rows of one group; must contain the 'MINERAL' target column plus the
        columns removed by ``_cleanDataFrame``. The frame is not modified.
    namegroup : str
        Group name, forwarded as the model name.

    Returns
    -------
    tuple
        ``(mineralModel, acc)`` as returned by ``randomForestBuilder``
        (called with ``acc_test=True`` and ``saveModel=False``).
    """
    # Read the target without pop(): pop() would delete 'MINERAL' from the
    # caller's frame as a hidden side effect.
    labels = np.array(data['MINERAL'])
    # _cleanDataFrame leaves 'MINERAL' in place, so drop the target explicitly
    trainData = _cleanDataFrame(data).drop(columns=['MINERAL'])
    mineralModel, acc = randomForestBuilder(trainData,labels,acc_test=True,
                                     saveModel=False, nameModel=namegroup)
    return mineralModel,acc
    
    

def modelCreate(data_path='../data_train/'):
    """Train the group classifier and one mineral classifier per group.

    Parameters
    ----------
    data_path : str, optional
        Path prefix handed to ``getTrainData``. Defaults to
        ``'../data_train/'``.

    Returns
    -------
    tuple
        ``(models, trainFeatures)``. ``models`` maps ``'GROUP'`` to the
        group-level model and each group name to its mineral-level model;
        ``trainFeatures`` is the column index of the training feature table.

    Side effects: the group model is pickled as ``RF_GROUP.pkl`` (via
    ``randomForestBuilder(saveModel=True)``); progress and the per-group
    accuracies are printed.
    """
    models = {}

    trainData, allData = getTrainData(data_path)
    trainFeatures = trainData.columns
    # Read-only access: allData keeps its 'GROUP' column for the per-group
    # filtering below, so there is no need to pop and re-insert it.
    labels = np.array(allData['GROUP'])

    # Group-level classifier
    groupModel, _ = randomForestBuilder(trainData,labels,acc_test=False,
                                        saveModel=True, nameModel='RF_GROUP')
    models['GROUP'] = groupModel

    # One mineral-level classifier per group found in the data
    accs = []
    for group in allData['GROUP'].unique():
        print('Training .... '+group)
        mineralData = allData[allData['GROUP'] == group]
        models[group], acc = createMineralModel(mineralData,group)
        accs.append([group,acc])

    print(accs)
    return models,trainFeatures


# Train all models when this cell runs. `models` and `trainFeatures` become
# notebook-level globals that test_cprm_datasets reads.
models, trainFeatures = modelCreate()

In [None]:
def test_cprm_datasets(filename, mineral_group='SULFIDE'):
    """Classify a labelled CSV with the trained models and report mineral metrics.

    Reads the notebook globals ``models`` and ``trainFeatures`` created by
    ``modelCreate()``.

    Parameters
    ----------
    filename : str
        CSV file to classify. It must contain a 'MINERAL' column with the true
        labels, which is used for the printed report.
    mineral_group : str, optional
        Key in ``models`` of the mineral-level model applied to every row.
        Defaults to ``'SULFIDE'``.

    Side effects
    ------------
    Writes the input rows plus the predicted 'GROUP_ClASS' and 'MINERAL_CLASS'
    columns to ``<filename without extension>_rf_python.csv`` and prints the
    classification report and the accuracy.
    """
    import os  # local import: os is not part of the notebook's import cell

    # Features used in training
    Qmin_RF_features = trainFeatures.values.tolist()
    df_w = pd.read_csv(filename, encoding = "ISO-8859-1")
    # Independent copy: preparing the feature matrix must never alter the
    # frame that is written back out (a plain assignment would only alias it).
    df = df_w.copy()

    # Remove columns that were not used to train the RF model
    for col in df_w.columns:
        # 'FEO' is treated as the trained 'FEOT' feature. Skip the rename when
        # 'FEOT' already exists: duplicate labels would make reindex() fail.
        if col == 'FEO' and 'FEOT' not in df_w.columns:
            df = df.rename(columns={'FEO':'FEOT'})
            continue
        if col not in Qmin_RF_features:
            print('Data not in trained RF Model: ',col)
            df = df.drop(columns=col)

    # Add training features absent from the file, filled with zeros
    for feature in Qmin_RF_features:
        if feature not in df.columns:
            print('Missing... ', feature)
            df[feature] = 0.0

    # Same dtype and column order as the training table
    df = df.astype('float64').reindex(columns=Qmin_RF_features)

    group_class = models['GROUP'].predict(df)
    mineral_class = models[mineral_group].predict(df)
    # NOTE(review): 'GROUP_ClASS' (lowercase "l") looks like a typo; it is kept
    # unchanged so existing output files and their readers stay compatible.
    df_w['GROUP_ClASS'] = group_class
    df_w['MINERAL_CLASS'] = mineral_class

    # splitext copes with any extension length, unlike slicing off 4 characters
    root, _ext = os.path.splitext(filename)
    df_w.to_csv(root+'_rf_python.csv')
    print(classification_report(df_w['MINERAL'],mineral_class))
    print(accuracy_score(df_w['MINERAL'],mineral_class))

# Anderson test: classify the Anderson dataset and print the mineral-level report
test_cprm_datasets('../data_test/anderson_output.csv')


In [None]:
# Evandro test: classify the Evandro dataset and print the mineral-level report
test_cprm_datasets('../data_test/evandro_output.csv')
