In [32]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import itertools
import glob
import pickle
import matplotlib.pyplot as plt
import qmin

In [39]:
def getTrainData(path_files):
    #Import Mineral Balanced Data!!!!
    
    input_files = glob.glob(path_files + "*.csv")

    min_in = []

    for filename in input_files:
        df = pd.read_csv(filename, index_col=None, header=0, encoding = "ISO-8859-1")
        min_in.append(df)

    df_all = pd.concat(min_in, axis=0, ignore_index=True)
    df = df_all.drop(columns = ['Unnamed: 0', 'X1', 'id', 'SAMPLE', 'GROUP', 'MINERAL', 'ROCK','X1_1'])
    df = df.drop(columns = ["H20"])

    return df, df_all

    
def test_acc(X,y, model,plot=True,modelName='rf'):
    # Function to test the accuracy
    n_nodes = []
    max_depths = []

    for ind_tree in model.estimators_:
        n_nodes.append(ind_tree.tree_.node_count)
        max_depths.append(ind_tree.tree_.max_depth)

    print(f'Average number of nodes {int(np.mean(n_nodes))}')
    print(f'Average maximum depth {int(np.mean(max_depths))}')
    
    feature_list = list(X.columns)
    # Get numerical feature importances
    importances = list(model.feature_importances_)
    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # Print out the feature and importances 
    [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

    y_pred = model.predict(X)
    
    cm = confusion_matrix(y, y_pred)
    if plot:
        if modelName == 'rf':
            qmin.plot_confusion_matrix(cm, classes = np.unique(y),
                                  title = 'Confusion Matrix',normalize=True)
        else:
            qmin.plot_confusion_matrix(cm, classes = np.unique(y),
                                  title = 'Confusion Matrix',normalize=True,
                                 output='../figures/RF_'+modelName+'_confusion_matrix.png')
        print(classification_report(y,y_pred))
    print(accuracy_score(y, y_pred))
    return accuracy_score(y, y_pred)

def save_Model(model, model_name):
    path = '../model_py/'
    pickle.dump(model, open(path+model_name, 'wb'))
    print("Model Saved ...\n"+path+model_name)
    
def randomForestBuilder(trainData,labels, acc_test=True, saveModel=True, nameModel='RF'):
    # Separate 30% for test, training in 70% of Data!
    train, test, train_labels, test_labels = train_test_split(trainData, labels, 
                                                              stratify = labels,
                                                              test_size = 0.3)
    model = RandomForestClassifier(n_estimators=50, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1,
                               oob_score=True)

    # Fit on training data
    model.fit(train, train_labels)
    
    if acc_test:
        acc = test_acc(test,test_labels,model,modelName=nameModel)
    
    if saveModel:
        save_Model(model, nameModel+'.pkl')
    
    return model, test_acc(test,test_labels,model,plot=False)

def _cleanDataFrame(df):
    df = df.drop(columns = ['Unnamed: 0', 'X1', 'id', 'SAMPLE', 'GROUP', 'ROCK','X1_1'])
    df = df.drop(columns = ["H20"])
    
    return df

def createMineralModel(data,namegroup):
    #get Dataframe to create multiple mineral models
    
    labels = labels = np.array(data.pop('MINERAL'))
    trainData = _cleanDataFrame(data)    
    mineralModel, acc = randomForestBuilder(trainData,labels,acc_test=False,
                                     saveModel=False, nameModel=namegroup)
    return mineralModel,acc
    
    

def modelCreate():
    models = {}
    
    trainData, allData = getTrainData('../data_train/')
    trainFeatures = trainData.columns
    labels = np.array(allData.pop('GROUP'))
    allData['GROUP'] = labels

    #Group Model Classify
    groupModel, acc = randomForestBuilder(trainData,labels,acc_test=False,
                                     saveModel=True, nameModel='RF_GROUP')
    models['GROUP'] = groupModel
    
    groups = allData['GROUP'].unique() # Get all Gruops for mineral classification
    
    # Mineral Model Classify
    accs = []
    for group in groups:
        print('Training .... '+group)
        mineralData = allData[allData['GROUP']== group]
        models[group], acc = createMineralModel(mineralData,group)
        accs.append([group,acc])
    
    print(accs)
    return models,trainFeatures


models, trainFeatures = modelCreate()
models['Train Features'] = trainFeatures.values.tolist()
#models['Train Features'] = ['SIO2','TIO2','AL2O3','CR2O3','FEOT','CAO','MGO','MNO','K2O',\
# 'NA2O','P2O5','F','CL','NIO','CUO','COO','ZNO','PBO','S','ZRO2', 'AS']
path = '../model_py/'
model_name = 'allmodels'

with open(path+model_name+'.pkl', 'wb') as f:
    pickle.dump(models, f)
# pickle.dump(models, open(path+model_name+'.pkl', 'wb'))
print("Model Saved ...\n"+path+model_name)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


Model Saved ...
../model_py/RF_GROUP.pkl
Average number of nodes 339
Average maximum depth 15
Variable: SIO2                 Importance: 0.15
Variable: AL2O3                Importance: 0.1
Variable: S                    Importance: 0.1
Variable: ZRO2                 Importance: 0.07
Variable: CAO                  Importance: 0.06
Variable: F                    Importance: 0.06
Variable: COO                  Importance: 0.06
Variable: MGO                  Importance: 0.05
Variable: K2O                  Importance: 0.05
Variable: NA2O                 Importance: 0.05
Variable: CL                   Importance: 0.05
Variable: TIO2                 Importance: 0.04
Variable: FEOT                 Importance: 0.04
Variable: NIO                  Importance: 0.03
Variable: ZNO                  Importance: 0.03
Variable: MNO                  Importance: 0.02
Variable: CUO                  Importance: 0.02
Variable: CR2O3                Importance: 0.01
Variable: P2O5                 Importance: 0

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s


0.991725768321513
Training .... AMPHIBOLES
Average number of nodes 303
Average maximum depth 15


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


Variable: TIO2                 Importance: 0.1
Variable: NA2O                 Importance: 0.1
Variable: SIO2                 Importance: 0.09
Variable: AL2O3                Importance: 0.09
Variable: CAO                  Importance: 0.09
Variable: FEOT                 Importance: 0.08
Variable: K2O                  Importance: 0.08
Variable: MGO                  Importance: 0.07
Variable: MNO                  Importance: 0.07
Variable: F                    Importance: 0.05
Variable: CR2O3                Importance: 0.04
Variable: CL                   Importance: 0.04
Variable: P2O5                 Importance: 0.03
Variable: NIO                  Importance: 0.03
Variable: ZRO2                 Importance: 0.03
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: AS                   Importance: 0.0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


Average number of nodes 291
Average maximum depth 19
Variable: CAO                  Importance: 0.15
Variable: NA2O                 Importance: 0.1
Variable: MNO                  Importance: 0.09
Variable: F                    Importance: 0.08
Variable: SIO2                 Importance: 0.07
Variable: FEOT                 Importance: 0.07
Variable: MGO                  Importance: 0.07
Variable: K2O                  Importance: 0.07
Variable: P2O5                 Importance: 0.05
Variable: CL                   Importance: 0.05
Variable: TIO2                 Importance: 0.04
Variable: AL2O3                Importance: 0.04
Variable: CR2O3                Importance: 0.04
Variable: NIO                  Importance: 0.03
Variable: ZNO                  Importance: 0.03
Variable: S                    Importance: 0.02
Variable: ZRO2                 Importance: 0.02
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: PBO                  Importa

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


0.8960573476702509
Training .... CLAY
Average number of nodes 81
Average maximum depth 10
Variable: K2O                  Importance: 0.13
Variable: AL2O3                Importance: 0.12
Variable: MGO                  Importance: 0.12
Variable: SIO2                 Importance: 0.11
Variable: TIO2                 Importance: 0.1
Variable: CAO                  Importance: 0.1
Variable: FEOT                 Importance: 0.07
Variable: MNO                  Importance: 0.06
Variable: NA2O                 Importance: 0.05
Variable: P2O5                 Importance: 0.05
Variable: CR2O3                Importance: 0.03
Variable: NIO                  Importance: 0.03
Variable: CL                   Importance: 0.02
Variable: F                    Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variab

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


Average number of nodes 77
Average maximum depth 9
Variable: CAO                  Importance: 0.17
Variable: NA2O                 Importance: 0.17
Variable: AL2O3                Importance: 0.15
Variable: K2O                  Importance: 0.15
Variable: SIO2                 Importance: 0.11
Variable: FEOT                 Importance: 0.06
Variable: TIO2                 Importance: 0.04
Variable: MGO                  Importance: 0.03
Variable: CR2O3                Importance: 0.02
Variable: MNO                  Importance: 0.02
Variable: P2O5                 Importance: 0.02
Variable: F                    Importance: 0.02
Variable: CL                   Importance: 0.02
Variable: NIO                  Importance: 0.02
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance:

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


0.8888888888888888
Training .... FELDSPATHOID
Average number of nodes 82
Average maximum depth 10
Variable: SIO2                 Importance: 0.16
Variable: K2O                  Importance: 0.15
Variable: NA2O                 Importance: 0.14
Variable: AL2O3                Importance: 0.09
Variable: CL                   Importance: 0.09
Variable: CAO                  Importance: 0.08
Variable: FEOT                 Importance: 0.07
Variable: MGO                  Importance: 0.04
Variable: P2O5                 Importance: 0.04
Variable: TIO2                 Importance: 0.03
Variable: MNO                  Importance: 0.03
Variable: F                    Importance: 0.03
Variable: CR2O3                Importance: 0.02
Variable: NIO                  Importance: 0.02
Variable: S                    Importance: 0.02
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


Average number of nodes 53
Average maximum depth 8
Variable: SIO2                 Importance: 0.14
Variable: FEOT                 Importance: 0.12
Variable: MGO                  Importance: 0.1
Variable: TIO2                 Importance: 0.09
Variable: MNO                  Importance: 0.08
Variable: AL2O3                Importance: 0.07
Variable: CAO                  Importance: 0.07
Variable: F                    Importance: 0.05
Variable: ZRO2                 Importance: 0.05
Variable: NA2O                 Importance: 0.04
Variable: CL                   Importance: 0.04
Variable: NIO                  Importance: 0.04
Variable: CR2O3                Importance: 0.03
Variable: K2O                  Importance: 0.03
Variable: P2O5                 Importance: 0.03
Variable: ZNO                  Importance: 0.03
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


0.8472222222222222
Training .... MICA
Average number of nodes 124
Average maximum depth 12
Variable: AL2O3                Importance: 0.12
Variable: MGO                  Importance: 0.12
Variable: F                    Importance: 0.1
Variable: SIO2                 Importance: 0.09
Variable: FEOT                 Importance: 0.08
Variable: TIO2                 Importance: 0.07
Variable: K2O                  Importance: 0.07
Variable: NA2O                 Importance: 0.07
Variable: CAO                  Importance: 0.06
Variable: MNO                  Importance: 0.06
Variable: CL                   Importance: 0.06
Variable: CR2O3                Importance: 0.04
Variable: NIO                  Importance: 0.04
Variable: P2O5                 Importance: 0.03
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Var

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


Average number of nodes 23
Average maximum depth 6
Variable: SIO2                 Importance: 0.16
Variable: MNO                  Importance: 0.16
Variable: MGO                  Importance: 0.14
Variable: FEOT                 Importance: 0.12
Variable: K2O                  Importance: 0.12
Variable: NA2O                 Importance: 0.09
Variable: TIO2                 Importance: 0.05
Variable: NIO                  Importance: 0.05
Variable: AL2O3                Importance: 0.04
Variable: CR2O3                Importance: 0.03
Variable: P2O5                 Importance: 0.03
Variable: CAO                  Importance: 0.01
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s


0.9333333333333333
Training .... PYROXENE
Average number of nodes 166
Average maximum depth 13


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


Variable: CAO                  Importance: 0.13
Variable: NA2O                 Importance: 0.12
Variable: SIO2                 Importance: 0.1
Variable: FEOT                 Importance: 0.1
Variable: MGO                  Importance: 0.1
Variable: AL2O3                Importance: 0.08
Variable: MNO                  Importance: 0.07
Variable: TIO2                 Importance: 0.05
Variable: ZRO2                 Importance: 0.05
Variable: CR2O3                Importance: 0.04
Variable: K2O                  Importance: 0.04
Variable: P2O5                 Importance: 0.04
Variable: CL                   Importance: 0.03
Variable: NIO                  Importance: 0.02
Variable: COO                  Importance: 0.02
Variable: F                    Importance: 0.01
Variable: ZNO                  Importance: 0.01
Variable: CUO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: AS                   Importance: 0.0

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


Average number of nodes 113
Average maximum depth 10
Variable: AL2O3                Importance: 0.13
Variable: CR2O3                Importance: 0.13
Variable: FEOT                 Importance: 0.11
Variable: MGO                  Importance: 0.11
Variable: MNO                  Importance: 0.1
Variable: TIO2                 Importance: 0.08
Variable: NIO                  Importance: 0.08
Variable: ZNO                  Importance: 0.08
Variable: CAO                  Importance: 0.05
Variable: SIO2                 Importance: 0.04
Variable: K2O                  Importance: 0.03
Variable: NA2O                 Importance: 0.03
Variable: P2O5                 Importance: 0.03
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance:

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


0.7474747474747475
Training .... SULFIDE
Average number of nodes 265
Average maximum depth 16
Variable: S                    Importance: 0.17
Variable: FEOT                 Importance: 0.16
Variable: NIO                  Importance: 0.14
Variable: CUO                  Importance: 0.12
Variable: COO                  Importance: 0.11
Variable: AS                   Importance: 0.1
Variable: PBO                  Importance: 0.07
Variable: ZNO                  Importance: 0.06
Variable: K2O                  Importance: 0.03
Variable: NA2O                 Importance: 0.03
Variable: CAO                  Importance: 0.02
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Var

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


Average number of nodes 1
Average maximum depth 0
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: FEOT                 Importance: 0.0
Variable: CAO                  Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: K2O                  Importance: 0.0
Variable: NA2O                 Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: NIO                  Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0.0
Variable: 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


Average number of nodes 1
Average maximum depth 0
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: FEOT                 Importance: 0.0
Variable: CAO                  Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: K2O                  Importance: 0.0
Variable: NA2O                 Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: NIO                  Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0.0
Variable: 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


Average number of nodes 1
Average maximum depth 0
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: FEOT                 Importance: 0.0
Variable: CAO                  Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: K2O                  Importance: 0.0
Variable: NA2O                 Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: NIO                  Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0.0
Variable: 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


Average number of nodes 1
Average maximum depth 0
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: FEOT                 Importance: 0.0
Variable: CAO                  Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: K2O                  Importance: 0.0
Variable: NA2O                 Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: NIO                  Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0.0
Variable: 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


Average number of nodes 1
Average maximum depth 0
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: FEOT                 Importance: 0.0
Variable: CAO                  Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: K2O                  Importance: 0.0
Variable: NA2O                 Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: NIO                  Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0.0
Variable: 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


Average number of nodes 1
Average maximum depth 0
Variable: SIO2                 Importance: 0.0
Variable: TIO2                 Importance: 0.0
Variable: AL2O3                Importance: 0.0
Variable: CR2O3                Importance: 0.0
Variable: FEOT                 Importance: 0.0
Variable: CAO                  Importance: 0.0
Variable: MGO                  Importance: 0.0
Variable: MNO                  Importance: 0.0
Variable: K2O                  Importance: 0.0
Variable: NA2O                 Importance: 0.0
Variable: P2O5                 Importance: 0.0
Variable: F                    Importance: 0.0
Variable: CL                   Importance: 0.0
Variable: NIO                  Importance: 0.0
Variable: CUO                  Importance: 0.0
Variable: COO                  Importance: 0.0
Variable: ZNO                  Importance: 0.0
Variable: PBO                  Importance: 0.0
Variable: S                    Importance: 0.0
Variable: ZRO2                 Importance: 0.0
Variable: 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


Model Saved ...
../model_py/allmodels


In [34]:
def test_cprm_datasets(filename):
    
    #Features used in training!
    Qmin_RF_features = trainFeatures.values.tolist()
    print(Qmin_RF_features)
    df = pd.read_csv(filename, encoding = "ISO-8859-1")
    df_w = df

    #Remove Columns not in Qmin_Group_RF (Oxides used in trainnig RF model)
    for i in df.columns:
        if i == 'FEO':
            df = df.rename(columns={'FEO':'FEOT'})
            continue
        if i not in Qmin_RF_features:
            print('Data not in trained RF Model: ',i)
            df = df.drop(columns=i)   
    
    # Add missing columns
    for i in Qmin_RF_features:
        if i not in df.columns:
            print('Missing... ', i)
            df[i] = 0.0
        

    df = df.astype('float64')
    df = df.reindex(columns=Qmin_RF_features)

    group_class = models['GROUP'].predict(df)
    mineral_class = models['SULFIDE'].predict(df)
    df_w['GROUP_ClASS'] = group_class
    df_w['MINERAL_CLASS'] = mineral_class
    df_w.to_csv(filename[:-4]+'_rf_python.csv')
    print(classification_report(df_w['MINERAL'],mineral_class))
    print(accuracy_score(df_w['MINERAL'],mineral_class))

#Teste Anderson
test_cprm_datasets('../data_test/anderson_output.csv')


['SIO2', 'TIO2', 'AL2O3', 'CR2O3', 'FEOT', 'CAO', 'MGO', 'MNO', 'K2O', 'NA2O', 'P2O5', 'F', 'CL', 'NIO', 'CUO', 'COO', 'ZNO', 'PBO', 'S', 'ZRO2', 'AS']
Data not in trained RF Model:  Unnamed: 0
Data not in trained RF Model:  id
Data not in trained RF Model:  SAMPLE
Data not in trained RF Model:  MINERAL
Data not in trained RF Model:  value
Data not in trained RF Model:  POINT
Data not in trained RF Model:  H2O
Data not in trained RF Model:  H20


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


                 precision    recall  f1-score   support

             34       0.00      0.00      0.00         1
             35       0.00      0.00      0.00         1
             36       0.00      0.00      0.00         1
             37       0.00      0.00      0.00         1
             38       0.00      0.00      0.00         1
             39       0.00      0.00      0.00         1
             40       0.00      0.00      0.00         1
             41       0.00      0.00      0.00         1
     ALABANDITE       0.00      0.00      0.00         0
   ARSENOPYRITE       0.00      0.00      0.00         4
        BORNITE       0.00      0.00      0.00         0
   CALCHOPYRITE       0.00      0.00      0.00        17
   CHALCOPYRITE       0.00      0.00      0.00         0
      COBALTITE       0.00      0.00      0.00         0
         GALENA       1.00      0.91      0.95        86
HOLLINGWORTHITE       0.00      0.00      0.00         0
       IRARSITE       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# Teste Evandro
test_cprm_datasets('../data_test/evandro_output.csv')


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


['SIO2', 'TIO2', 'AL2O3', 'CR2O3', 'FEOT', 'CAO', 'MGO', 'MNO', 'K2O', 'NA2O', 'P2O5', 'F', 'CL', 'NIO', 'CUO', 'COO', 'ZNO', 'PBO', 'S', 'ZRO2', 'AS']
Data not in trained RF Model:  Unnamed: 0
Data not in trained RF Model:  SAMPLE
Data not in trained RF Model:  MINERAL
Data not in trained RF Model:  value
Data not in trained RF Model:  GROUP
Data not in trained RF Model:  Total
Data not in trained RF Model:  Pt
Data not in trained RF Model:  Au
Data not in trained RF Model:  Ag
Data not in trained RF Model:  H20
              precision    recall  f1-score   support

    AIKINITE       0.00      0.00      0.00         1
ARSENOPYRITE       1.00      1.00      1.00        22
     BORNITE       0.00      0.00      0.00         0
CHALCOPYRITE       1.00      0.98      0.99        42
   COBALTITE       1.00      1.00      1.00         3
    CUBANITE       0.00      0.00      0.00         0
    DIGENITE       0.00      0.00      0.00         2
      GALENA       1.00      1.00      1.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
def organize(df):
    #TODO: Algumas tabelas vieram com , e outras com . checar
    df = df.stack().str.replace(',','.').unstack()


    dic = {}
    for i in range(len(df.columns)):
        dic[df.columns[i]] = df.columns[i].strip().upper()

    df = df.rename(columns=dic)

    #Remove Columns not in Qmin_Group_RF (Oxides used in trainnig RF model)
    for i in df.columns:
        if i == 'FEO':
            df = df.rename(columns={'FEO':'FEOT'})
            continue
        if i not in model['Train Features']:
            #print('Data not in trained RF Model: ',i)
            df = df.drop(columns=i)

    # Add missing columns
    for i in model['Train Features']:
        if i not in df.columns:
           # print('Missing... ', i)
            df[i] = 0

    df = df.reindex(columns=model['Train Features'])
    return df

modelsaved = '../model_py/allmodels.pkl'
#model = pickle.load(open(filename, 'rb'))
with open(modelsaved, "rb") as f:
    model = pickle.load(f)

# Dados Renato
filename = '../data_othersources/EXMPLO_RESULTADOS_EPMA_MINERAIS_VARIADOS.csv'
#filename = '../data_othersources/ALBITA_STANDARD_BAD_ANALYSIS.csv'
#filename = '../data_othersources/ALBITA_STANDARD_GOOD_ANALYSIS.csv'


df = pd.read_csv(filename, skipfooter=6,skiprows=3)
df_w = df

#TODO: Algumas tabelas vieram com , e outras com . checar
df = df.stack().str.replace(',','.').unstack()
print(df.columns)

dic = {}
for i in range(len(df.columns)):
    dic[df.columns[i]] = df.columns[i].strip().upper()

df = df.rename(columns=dic)

#Remove Columns not in Qmin_Group_RF (Oxides used in trainnig RF model)
for i in df.columns:
    if i == 'FEO':
        df = df.rename(columns={'FEO':'FEOT'})
        continue
    if i not in model['Train Features']:
        print('Data not in trained RF Model: ',i)
        df = df.drop(columns=i)
    if i == 'H2O':
        df = df.drop(columns=i)
    

# Add missing columns
for i in model['Train Features']:
    if i not in df.columns:
        print('Missing... ', i)
        df[i] = 0
        
df = df.reindex(columns=model['Train Features'])
#df.to_csv('Renato_input.csv')
#df = df.apply(pd.to_numeric)
#model.predict(df)
df_w['GROUP']= model['GROUP'].predict(df)
#print(df_w)

groups = df_w['GROUP'].unique()

df_partial = []

for group in groups:
    #df = pd.read_csv(filename, index_col=None, header=0, encoding = "ISO-8859-1")
    df = df_w[df_w['GROUP']== group]
    df = organize(df)
    
    predictions = model[group].predict(df)
    df = df_w[df_w['GROUP']== group]
    df['GROUP PREDICTED'] = group
    df['MINERAL PREDICTED'] = predictions
    
    df_partial.append(df)
    

df_all = pd.concat(df_partial, axis=0, ignore_index=True)
print(df_all)
#mineralData = df_w[df_w['GROUP']== groups[0]]


outfilename = filename[:-4]+'_group.csv'
print(outfilename)
df_all.to_csv(outfilename)



Index(['   No. ', '   Na2O  ', '   MgO   ', '   F     ', '   Al2O3 ',
       '   SiO2  ', '   CaO   ', '   K2O   ', '   Cl    ', '   TiO2  ',
       '   Cr2O3 ', '   MnO   ', '   NiO   ', '   FeO   ', '   V2O3  ',
       '  Total  ', 'Comment  '],
      dtype='object')
Data not in trained RF Model:  NO.
Data not in trained RF Model:  V2O3
Data not in trained RF Model:  TOTAL
Data not in trained RF Model:  COMMENT
Missing...  P2O5
Missing...  CUO
Missing...  COO
Missing...  ZNO
Missing...  PBO
Missing...  S
Missing...  ZRO2
Missing...  AS


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[Parallel(n_jobs=8)]: Using backend ThreadingBackend

       No.     Na2O      MgO       F         Al2O3     SiO2      CaO     \
0         1      0,12      7,52      0,15     19,52     33,97      0,04   
1         2      0,13      7,91      0,11     19,59     34,86      0,01   
2         3       0,2      5,75      0,04     19,41     43,63      0,03   
3         4       0,2      8,14      0,07     19,07     34,75         0   
4         5       0,2      8,46      0,09     19,63     34,06         0   
5         6      1,34      0,65         0     36,17      45,2      0,01   
6         7      1,42      0,41         0     36,37     45,27      0,03   
7         8      1,04      0,68         0     35,67     44,93      0,03   
8         9         1      0,61         0     35,83     44,91      0,03   
9        10      1,06       0,4         0     36,36     46,21         0   
10       13      0,46      7,35         0     33,21     48,15      0,06   
11       16      0,04      7,66      0,31     17,37     34,99      0,02   
12       17      0,54    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
accs = [['AMPHIBOLES', 0.8045977011494253], ['CARBONATE', 0.8888888888888888], ['CLAY', 0.8271604938271605], ['FELDSPAR', 0.8888888888888888], ['FELDSPATHOID', 0.8787878787878788], ['GARNET', 0.8611111111111112], ['MICA', 0.9281045751633987], ['OLIVINE', 0.9333333333333333], ['PYROXENE', 0.85], ['SPINEL', 0.7373737373737373], ['SULFIDE', 0.9354838709677419], ['APATITE', 1.0], ['ILMENITE', 1.0], ['PEROVSKITE', 1.0], ['QUARTZ', 1.0], ['TITANITE', 1.0], ['ZIRCON', 1.0]]
for i in accs:
    print('%s, %.3f'%(i[0],i[1]))

AMPHIBOLES, 0.805
CARBONATE, 0.889
CLAY, 0.827
FELDSPAR, 0.889
FELDSPATHOID, 0.879
GARNET, 0.861
MICA, 0.928
OLIVINE, 0.933
PYROXENE, 0.850
SPINEL, 0.737
SULFIDE, 0.935
APATITE, 1.000
ILMENITE, 1.000
PEROVSKITE, 1.000
QUARTZ, 1.000
TITANITE, 1.000
ZIRCON, 1.000


In [38]:
models['Train Features'] 

['SIO2',
 'TIO2',
 'AL2O3',
 'CR2O3',
 'FEOT',
 'CAO',
 'MGO',
 'MNO',
 'K2O',
 'NA2O',
 'P2O5',
 'F',
 'CL',
 'NIO',
 'CUO',
 'COO',
 'ZNO',
 'PBO',
 'S',
 'ZRO2',
 'AS']