In [1]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
def strip_names(value):
    return value.strip()

incorrect_float = re.compile("\d+\,\d+")

def apply_function(series):
    return series.apply(convert_to_float)

def convert_to_float(values):
    new_values = []
    for value in values:
        try:
            if incorrect_float.match(value):
                value = str(value).replace(',', '.')
            try:
                new_values.append(float(value))
            except:
                return np.nan
        except:
            new_values.append(float(value))
    return new_values

def convert_to_float_value(value):
    try:
        if incorrect_float.match(value):
            value = str(value).replace(',', '.')
        return float(value)
    except:
        return float(value)

# Dataset format
The dataset used in this research is not public available. If you need access to data, contact marina.danes@ufla.br

**The file has the following header:**

```Código Master, Experimento, Vaca, Produção de leite, Consumo de MS, d1, d2, ..., dn```

in which ```d1, d2, ..., dn``` are spectral data variables

# Experiments

In [7]:
labels = {'dataset.csv': 'Consumo de MS'}

df_results = {'file': [], 'label': [], 'LOU': [], 'whout': [], 'algorithm': [], 'MSE': [], 'test_size': [], 'R2': []}

for file, label in labels.items():
    print()
    print("LABEL", label, ' file', file)
    with open('../datasets/{}'.format(file), 'r') as file_:
        dataset = pd.read_csv(file_, delimiter=',')
        dataset.replace(".", np.nan, inplace=True)
        dataset['Vaca'] = dataset['Vaca'].apply(strip_names)
        
        print("- Shape before nan removal ", dataset.shape)
        columns = dataset.columns
        removables = ['Código Master', 'Produção de leite']
        if 'Consumo' in label:
            removables.remove('Produção de leite')
        split_columns = ['Experimento', 'Vaca']

        attributes = [a for a in columns if a not in labels and a not in removables and a not in split_columns]
        dataset.drop(labels=removables, axis=1, inplace=True, errors='raise')
        dataset.dropna(inplace=True)
        print("- Shape after nan removal ", dataset.shape)
        
        for split_column in split_columns:
            print("- - LOU {}".format(split_column))
            split_values = sorted(list(set(list(dataset[split_column]))))
            for li, leave_out in enumerate(split_values):
                print("- - - WHOUT {}      ".format(leave_out), li, 'of', len(split_values))
                
                #
                # SPLIT DATA into Train and Test following the Leave One (Vaca or Experiment) Out
                #
                train_control, test_control = [v for v in split_values if v != leave_out], [leave_out]
                X_train, Y_train, X_test, Y_test = [], [], [], []
                
                for index, row in dataset.iterrows():
                    if row[split_column] in train_control:
                        X_train.append([convert_to_float_value(v) for k, v in row.items() if k in attributes])
                        Y_train.append([v for k , v in row.items() if k == label])
                    elif row[split_column] in test_control:
                        X_test.append([convert_to_float_value(v) for k, v in row.items() if k in attributes])
                        Y_test.append([v for k , v in row.items() if k == label])
                Y_train = [convert_to_float(y) for y in Y_train]
                Y_test = [convert_to_float(y) for y in Y_test]
                print("- - - Train size: ", len(X_train), "Test size: ", len(X_test))
                df_train = pd.DataFrame(X_train, columns=attributes)
                df_train['Consumo de MS'] = Y_train
                df_test = pd.DataFrame(X_test, columns=attributes)
                df_test['Consumo de MS'] = Y_test
                
                df_train.to_csv(f'data_{label}_{split_column}_{leave_out}_train.csv')
                df_test.to_csv(f'data_{label}_{split_column}_{leave_out}_test.csv')

                #
                # TRAINING
                #
                regressors = {"PLSRegressor": PLSRegression(), "KNNRegressor": KNeighborsRegressor(), "SVR": SVR(),
                              "GradientBoostingRegressor": GradientBoostingRegressor()}

                try:
                    expected = [y[0] for y in Y_test]
                except:
                    print("Some error with: ", Y_test)

                for name, regressor in regressors.items():
                    clf = regressor.fit(X_train, Y_train)
                    pred = clf.predict(X_test)

                    if isinstance(pred, np.ndarray):
                        pred = pred.tolist()
                        if isinstance(pred[0], list):
                            pred = [p[0] for p in pred]
                    else:
                        pred = [p[0] for p in pred]
                                                            
                    df_results['file'].append(file)
                    df_results['label'].append(label)
                    df_results['LOU'].append(split_column)
                    df_results['whout'].append(leave_out)
                    df_results['algorithm'].append(name)
                    df_results['test_size'].append(len(X_test))
                    df_results['MSE'].append(mean_squared_error(expected, pred))
                    df_results['R2'].append(r2_score(expected, pred))
                    
                    # store a table with expected and predicted values, 
                    #  if you want to calculate the metrics by your own
                    df = pd.DataFrame({'expected': expected, 'predicted': pred})
                    df.to_csv(f'predictions_{label}_{split_column}_{leave_out}_{name}.csv')


LABEL Consumo de MS  file dataset.csv
- Shape before nan removal  (233, 2209)
- Shape after nan removal  (233, 2208)
- - LOU Experimento
- - - WHOUT 1       0 of 5
- - - Train size:  208 Test size:  25


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT 2       1 of 5
- - - Train size:  204 Test size:  29


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT 3       2 of 5
- - - Train size:  174 Test size:  59


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT 4       3 of 5
- - - Train size:  171 Test size:  62


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT 5       4 of 5
- - - Train size:  175 Test size:  58


  return f(**kwargs)
  return f(**kwargs)


- - LOU Vaca
- - - WHOUT ALECRIM       0 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT AMETISTA       1 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT ANETE       2 of 64
- - - Train size:  226 Test size:  7


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT ANNA       3 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT BAMBINA       4 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT CINDERELA       5 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT CLAIR       6 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT CLARA       7 of 64
- - - Train size:  227 Test size:  6


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT CLARIS       8 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT CLEO       9 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT CRETA       10 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DAMIETA       11 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DARYA       12 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DIADORIM       13 of 64
- - - Train size:  226 Test size:  7


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DIANA       14 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DIONE       15 of 64
- - - Train size:  227 Test size:  6


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DODONA       16 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT DORIS       17 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT ELOA       18 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT ESPANHA       19 of 64
- - - Train size:  224 Test size:  9


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT FARRA       20 of 64
- - - Train size:  226 Test size:  7


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT GAIA       21 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT GELINA       22 of 64
- - - Train size:  225 Test size:  8


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT GIOVANA       23 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT HEVEA       24 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT HOLLA       25 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT IARA       26 of 64
- - - Train size:  225 Test size:  8


  return f(**kwargs)


- - - WHOUT INDIA       27 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT INDRA       28 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT ISABEL       29 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT JANA       30 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT JARLA       31 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT JEANY       32 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT JOICE       33 of 64
- - - Train size:  226 Test size:  7


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT JUJU       34 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT JULIA       35 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT KAREN       36 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LAGOONA       37 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LAURA       38 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LEA       39 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LEDA       40 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LIRIA       41 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LOLITA       42 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LORA       43 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT LUCIA       44 of 64
- - - Train size:  229 Test size:  4


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT MANTRA       45 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT MARCIA       46 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT MARISOL       47 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT MAYRA       48 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT MISS       49 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT MOOREM       50 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT NANDI       51 of 64
- - - Train size:  226 Test size:  7


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT NATALINA       52 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT NEMEIA       53 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT NOLA       54 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT ODARINHA       55 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT OLIVIA       56 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT OPERETA       57 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT PAULA       58 of 64
- - - Train size:  228 Test size:  5


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT PERSIA       59 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT PORANGA       60 of 64
- - - Train size:  231 Test size:  2


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT SOLANGE       61 of 64
- - - Train size:  225 Test size:  8


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT TRINKET       62 of 64
- - - Train size:  230 Test size:  3


  return f(**kwargs)
  return f(**kwargs)


- - - WHOUT VAHALA       63 of 64
- - - Train size:  232 Test size:  1


  return f(**kwargs)
  return f(**kwargs)


In [5]:
df = pd.DataFrame(df_results)
df.head()

Unnamed: 0,file,label,LOU,whout,algorithm,MSE,test_size,R2
0,dataset.csv,Consumo de MS,Experimento,1,PLSRegressor,20.045904,25,-1.683969
1,dataset.csv,Consumo de MS,Experimento,1,KNNRegressor,13.267901,25,-0.776455
2,dataset.csv,Consumo de MS,Experimento,1,SVR,10.590045,25,-0.417913
3,dataset.csv,Consumo de MS,Experimento,1,GradientBoostingRegressor,0.008497,25,0.998862
4,dataset.csv,Consumo de MS,Experimento,2,PLSRegressor,12.624488,29,-0.565051


In [6]:
df.to_csv('results_CMS.csv')