In [15]:
import pickle

### Set up environment

# import packages
import pandas as pd
import numpy as np
import os
from glob import iglob
#import matplotlib.pyplot as plt
#import seaborn as sns
import random
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
#from matplotlib.pyplot import figure
from sklearn.neural_network import MLPRegressor
import glob

import json
import csv

# Set parameters
T = 298      # Kelvin


# define Jeanna's broadening formalism for use later.  Taken from paper ...
def broadening(m, T, ma, mp, b0):
    gamma = 1.7796e-5 * (m/(m-2)) * (1/np.sqrt(T)) * np.sqrt((ma+mp)/(ma*mp)) * b0**2
    return(gamma)





In [16]:

### Read in data

# import data

# absolute path to folder containing data
#rootdir_glob = '/Users/elizabeth/Desktop/line_broadening.nosync/line_broadening/hitran_data/**/*'
# be selective for data files
#file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f) if f[-10:] == "/1_iso.csv" if "readme" not in f]
#file_list = 
#path = '/Users/elizabeth/Desktop/line_broadening.nosync/line_broadening/model_search/*'
#file_list = glob.glob(path + 'raw_data/*.csv')

path = '/Users/elizabeth/Desktop/line_broadening.nosync/line_broadening/Other_broadeners/Files_with_new_data/'
file_list = glob.glob(path+'*.csv')

# read data files, taking the filename from absolute path
db = {}
for f in file_list:
    i = f[101:104].strip('_')
    j = f[106:109].strip('_')
    file = pd.read_csv(f, low_memory=False)
    if i+j in db:
        db[i+'_'+j] = pd.concat([db[i+'_'+j], file]).reset_index(drop=True)
    else:
        db[i+'_'+j] = file.reset_index(drop=True)
        
print(len(db))


13


In [17]:
    

# dictionary of molecules of data - condensed version
molecules = {}

# take only molecules for which there is full data
for key, datas in db.items():
    if key == 'HONO_prediction':
        continue
    # filter for rotational constant 3D - as that was the last thing added
    if 'B0a' in datas.columns:



        broadeners = ['air', 'self', 'CO2', 'H2O', 'H2', 'He']
        for broadener in broadeners:
            print(key, broadener)
            if broadener == 'He':
                continue
            elif broadener == 'air':
                continue
            elif broadener == 'self':
                broadener_data = db[key][['molecule_weight', 'm', 'd',
                     'molecule_dipole', 'polar', 'B0a', 'B0b', 'B0c']].loc[0]
                
            else:
                broadener_data = db[broadener][['molecule_weight', 'm', 'd',
                     'molecule_dipole', 'polar', 'B0a', 'B0b', 'B0c']].loc[0]
                
            data = datas[['J', 'Jpp', 'molecule_weight', 'm', 'd', 
                     'molecule_dipole', 'polar', 'B0a', 'B0b', 'B0c', 'Ka_aprox',
                     'Kapp_aprox', 'Kc_aprox', 'Kcpp_aprox', 'gamma_'+broadener, 'gamma_'+broadener+'-err']]
            data = data.rename(columns={"gamma_"+broadener: "gamma", "gamma_"+broadener+"-err": "gamma-err"})
            for item in broadener_data.index:
                data['broadener_'+item] = broadener_data.loc[item]
           

            data['m'] = data['m'].loc[0]-4
            data['broadener_m'] = data['broadener_m'].loc[0]-4
            data['broadener_weight'] = data['broadener_molecule_weight'].loc[0]
            data['broadener_dipole'] = data['broadener_molecule_dipole'].loc[0]
            data.drop(columns=['broadener_molecule_weight', 'broadener_molecule_dipole'])

            # some data missing J values, just get rid of it.
            data = data.dropna()
        
        
            #if key == 'C2H6':
            #    data = data.drop(data[(data['J'] > 20) & (data['gamma_'+broadener] > 0.08)].index)
        
        
            branch = data['Jpp'] - data['J']
            data = data.drop(branch[abs(branch)>2].index)
            branch = data['Jpp'] - data['J']
        
            data['P'] = -data['Jpp'][branch==1]
            data['Q'] = data['Jpp'][branch==0]
            data['R'] = data['Jpp'][branch==-1] +1
            data['O'] = -data['Jpp'][branch==2]
            data['S'] = data['Jpp'][branch==-2] +1

            data['P'] = data['P'].fillna(0)
            data['Q'] = data['Q'].fillna(0)
            data['R'] = data['R'].fillna(0)
            data['O'] = data['O'].fillna(0)
            data['S'] = data['S'].fillna(0)
            #data = data.fillna(0)

            data['M'] = data['P'] + data['Q'] + data['R'] + data['O'] + data['S']
            data = data.drop(columns=['P', 'Q', 'R', 'O', 'S'])
            #data['M'] = data['M']/data['B0c']
            # assign data to molecule
            molecules[key+'_'+broadener] = data


             
            if key == 'CH3CN':
                broadness_jeanna = broadening(data['m'][20]+data['broadener_m'][20], T, data['molecule_weight'][20], data['broadener_weight'][20], data['d'][20]/2+data['broadener_d'][20]/2)
            else:
                broadness_jeanna = broadening(data['m'][2]+data['broadener_m'][2], T, data['molecule_weight'][2], data['broadener_weight'][2], data['d'][2]/2+data['broadener_d'][2]/2)
            molecules[key+'_'+broadener]['broadness_jeanna'] = broadness_jeanna
            #print(molecules[key+'_'+broadener])
            #molecules[key] = molecules[key].drop(columns=['air_weight'])#symmetry
            #print(molecules[key])




In [18]:
print(molecules['C2H2_self'].columns)
print(molecules['C2H2_CO2'].columns)
print(len(molecules))

KeyError: 'C2H2_self'

In [115]:
    


###  Investigate data

# Work out the average error codes of each molecule.  Count how many of each error code, and how many points there are

# list of molecule names
molecule_names = []
# list of the proportion of data that is code 3 or above for each molecule
error_2s = []
# list of the number of points each molecule contains
datapoints = []


for key, data in molecules.items():
    molecule, broadener = key.split('_')
    molecule_names.append(key)
    x = data['gamma-err']
    # array of a 2s, b 3s, c 4s, etc...
    y = x.value_counts().sort_index()

    # if there is data classed as 2 or below, class it as 'bad'
    if 2 in y.index:
        error_2s.append(1-y.cumsum()[2]/x.count())
        datapoints.append(x.count())
    # otherwise all data is 'good'
    elif 1 in y.index:
        error_2s.append(1-y.cumsum()[1]/x.count())
        datapoints.append(x.count())

    elif 0 in y.index:
        error_2s.append(1-y.cumsum()[0]/x.count())
        datapoints.append(x.count())
    else:
        error_2s.append(1-0)
        datapoints.append(x.count())
        
        

# Print out how much data there is which is 'good' (out of every datapoint)
e2 = np.array(error_2s)
dat = np.array(datapoints)
good_data = e2*dat
print('percentage "good" data =')
print(sum(good_data)/sum(dat))






percentage "good" data =
0.22262681199767262


In [116]:

### Prepare data for use

'''
# reset molecules so that all molecules have the same number of points
for molecule, data in molecules.items():
    # normalise the amount of data compared to the molecule with the most data (SO2)
    points = 549424//len(data)
    #print(len(data))
    # repeat data n times, until each has roughly the same amount of data
    data = pd.concat([data]*points)
    # assign data back to dictionar
    molecules[molecule] = data
'''

# weight data by error code, currently error code = weighting
for key, data in molecules.items():
    molecule, broadener = key.split('_')

    # take weight as gamma-air-err
    #data=data.sample(frac=1)
    weight = data['gamma-err']
    # Give helpful weightings
    # reweight 0 to tiny, because 0 gives /0 error
    
    if molecule in ['COCl2', 'C2H2', 'HOBr', 'H2', 'CH3OH', 'HCOOH', 'HOCl', 'COF2', 'HC3N']:
        weight = (1/1000000000)**2
    elif molecule in ['HNO3', 'O2', 'CS', 'ClO', 'CH3CN', 'H2O2', 'C2H4', 'O3']:
        weight = weight.replace(0, (1/500000000)**2)    # 0  ~~~  unreported or unavailable
        weight = weight.replace(1, (1/20000000)**2)    # 1  ~~~  Default or constant
        weight = weight.replace(2, (1/1000000)**2)    # 2  ~~~  Average or estimate
        weight = weight.replace(3, (1/50)**2)     # 3  ~~~  err >= 20 %              50
        weight = weight.replace(4, (1/15)**2)     # 4  ~~~  20 >= err >= 10 %        15
        weight = weight.replace(5, (1/10)**2)    # 5  ~~~  10 >= err >= 5 %         7.5
        weight = weight.replace(6, (1/10)**2)    # 6  ~~~  5 >= err >= 2 %          3.5
        weight = weight.replace(7, (1/10)**2)    # 7  ~~~  2 >= err >= 1 %          1.5
        weight = weight.replace(8, (1/10)**2)    # 8  ~~~  err <= 1 %               0.5
    elif molecule in ['C2H6']:
        weight = weight.replace(0, (1/500000)**2)    # 0  ~~~  unreported or unavailable
        weight = weight.replace(1, (1/20000)**2)    # 1  ~~~  Default or constant
        weight = weight.replace(2, (1/1000)**2)    # 2  ~~~  Average or estimate
        weight = weight.replace(3, (1/50)**2)     # 3  ~~~  err >= 20 %              50
        weight = weight.replace(4, (1/15000000000000)**2)     # 4  ~~~  20 >= err >= 10 %        15
        weight = weight.replace(5, (1/10)**2)    # 5  ~~~  10 >= err >= 5 %         7.5
        weight = weight.replace(6, (1/10)**2)    # 6  ~~~  5 >= err >= 2 %          3.5
        weight = weight.replace(7, (1/10)**2)    # 7  ~~~  2 >= err >= 1 %          1.5
        weight = weight.replace(8, (1/10)**2)    # 8  ~~~  err <= 1 %               0.5
    elif molecule in ['SO2']:
        weight = weight.replace(0, (1/500000000)**2)    # 0  ~~~  unreported or unavailable
        weight = weight.replace(1, (1/20000000)**2)    # 1  ~~~  Default or constant
        weight = weight.replace(2, (1/1000000)**2)    # 2  ~~~  Average or estimate
        weight = weight.replace(3, (1/50000)**2)     # 3  ~~~  err >= 20 %              50
        weight = weight.replace(4, (1/15)**2)     # 4  ~~~  20 >= err >= 10 %        15
        weight = weight.replace(5, (1/10)**2)    # 5  ~~~  10 >= err >= 5 %         7.5
        weight = weight.replace(6, (1/10)**2)    # 6  ~~~  5 >= err >= 2 %          3.5
        weight = weight.replace(7, (1/10)**2)    # 7  ~~~  2 >= err >= 1 %          1.5
        weight = weight.replace(8, (1/10)**2)    # 8  ~~~  err <= 1 %               0.5
    else:
        weight = weight.replace(0, (1/500000)**2)    # 0  ~~~  unreported or unavailable
        weight = weight.replace(1, (1/20000)**2)    # 1  ~~~  Default or constant
        weight = weight.replace(2, (1/1000)**2)    # 2  ~~~  Average or estimate
        weight = weight.replace(3, (1/50)**2)     # 3  ~~~  err >= 20 %              50
        weight = weight.replace(4, (1/15)**2)     # 4  ~~~  20 >= err >= 10 %        15
        weight = weight.replace(5, (1/10)**2)    # 5  ~~~  10 >= err >= 5 %         7.5
        weight = weight.replace(6, (1/10)**2)    # 6  ~~~  5 >= err >= 2 %          3.5
        weight = weight.replace(7, (1/10)**2)    # 7  ~~~  2 >= err >= 1 %          1.5
        weight = weight.replace(8, (1/10)**2)    # 8  ~~~  err <= 1 %               0.5
    
    # reassign weight into dictionary
    molecules[key]['gamma-err'] = weight
    


    #print(molecule)
    #print(len(data))
    datas=[]
    if isinstance(weight, float):
        points=0
        data = data.sample(frac=points, replace=True)
        molecules[key] = data
    else:
        for weight_value in weight.unique():
            amount_data = len(weight[weight==weight_value])
            #print(weight_value)
            #print(amount_data)
            fraction = amount_data / len(data)
            points = 60000000*weight_value*fraction/len(data)
            #print(fraction)
            #print('fraction kept = '+str(points)) 
            #print()
            datas.append(data[weight==weight_value].sample(frac=points, replace=True))
    
        data = pd.concat(datas)
    # assign data back to dictionary
    molecules[key] = data
    
        
        
        
    '''average_weight = np.mean(weight)
    points = 600000000*average_weight/len(data)
    print(molecule)
    print('avg_weighting = '+str(average_weight))
    print('fraction kept = '+str(points)) 
    print()
    data = data.sample(frac=points, replace=True)
    #data = pd.concat([data]*points)
    # assign data back to dictionary
    molecules[molecule] = data
    '''


In [117]:
print(molecules.keys())

dict_keys(['CS2_self', 'CS2_CO2', 'CS2_H2O', 'CS2_H2', 'CS_self', 'CS_CO2', 'CS_H2O', 'CS_H2', 'COF2_self', 'COF2_CO2', 'COF2_H2O', 'COF2_H2', 'NO2_self', 'NO2_CO2', 'NO2_H2O', 'NO2_H2', 'O3_self', 'O3_CO2', 'O3_H2O', 'O3_H2', 'HBr_self', 'HBr_CO2', 'HBr_H2O', 'HBr_H2', 'O2_self', 'O2_CO2', 'O2_H2O', 'O2_H2', 'PH3_self', 'PH3_CO2', 'PH3_H2O', 'PH3_H2', 'HNO3_self', 'HNO3_CO2', 'HNO3_H2O', 'HNO3_H2', 'N2_self', 'N2_CO2', 'N2_H2O', 'N2_H2', 'HO2_self', 'HO2_CO2', 'HO2_H2O', 'HO2_H2', 'SO2_self', 'SO2_CO2', 'SO2_H2O', 'SO2_H2', 'COCl2_self', 'COCl2_CO2', 'COCl2_H2O', 'COCl2_H2', 'HC3N_self', 'HC3N_CO2', 'HC3N_H2O', 'HC3N_H2', 'GeH4_self', 'GeH4_CO2', 'GeH4_H2O', 'GeH4_H2', 'CH3OH_self', 'CH3OH_CO2', 'CH3OH_H2O', 'CH3OH_H2', 'OCS_self', 'OCS_CO2', 'OCS_H2O', 'OCS_H2', 'HOBr_self', 'HOBr_CO2', 'HOBr_H2O', 'HOBr_H2', 'CH3I_self', 'CH3I_CO2', 'CH3I_H2O', 'CH3I_H2', 'H2O2_self', 'H2O2_CO2', 'H2O2_H2O', 'H2O2_H2', 'H2S_self', 'H2S_CO2', 'H2S_H2O', 'H2S_H2', 'CH3Cl_self', 'CH3Cl_CO2', 'CH3Cl_H2O

In [118]:


'''
# reset molecules so that all molecules have the same number of points
for molecule, data in molecules.items():
    # normalise the amount of data compared to the molecule with the most data (SO2)
    print(len(data))
    weight = 950863/len(data)
    #weight = 549424/len(data)
    # assign data back to dictionary
    molecules[molecule]['gamma_air-err'] = data['gamma_air-err']*weight
'''

import random
keys = list(molecules.items())
random.seed(41)
random.shuffle(keys)
molecules = dict(keys)
    
# Dictionary of molecules, and test/training data
molecule_list = {}

# collect 'training data' from all other molecules, except the labelled one
i=0
# collect 'training data' from all other molecules, except the labelled one
for molecule in molecules:
    if not i%20:
        print(i)
        # molecule is being tested
        test_data = {k: molecules[k] for k in list(molecules)[i:i+20]}
        #data_test = molecules[molecule]
        # take test molecule out of dictionary
        train_data = set(molecules) - set(test_data)
        #print(test_data)
        #print(test_data.keys(), train_data)
        # dictionary of molecules in test data
        train_data = {k: molecules[k] for k in train_data}
        
        # All test data in one dataframe
        data_test = pd.concat([test_data[k] for k in test_data])

        # Take all train data into one dataframe
        data_train = pd.concat([train_data[k] for k in train_data])
        
        # add data into molecule_list dictionary
        moles = ','.join(list(test_data.keys()))
        molecule_list[moles] = [data_train, data_test]
        
    i+=1
    



#print(molecule_list)


0
20
40
60
80
100
120
140
160
180


In [119]:



'''
# concatenate all dataframes once
data_train_all = pd.concat(list(molecules.values()))


molecule_list = {}

a = []
for i, data_test in enumerate(list(molecules.values())):
    # slice the concatenated dataframe
    data_train = pd.concat([data_train_all.iloc[:i*len(molecule_list)], data_train_all.iloc[(i+1)*len(molecule_list):]])

    molecule_list[list(molecules.keys())[i]] = [data_train, data_test]


'''
'''
# Dictionary of molecules, and test/training data
molecule_list = {}

# collect 'training data' from all other molecules, except the labelled one
for molecule in molecules:
    # molecule is being tested
    data_test = molecules[molecule].to_numpy()
    # take test molecule out of dictionary
    train_data = {}
    for k in molecules:
        if k != molecule:
            train_data[k] = molecules[k].to_numpy()

    data_train = np.concatenate([train_data[k] for k in train_data])
    molecule_list[molecule] = [data_train, data_test]
'''



def what_is_error(weight):
    if weight == (1/500000)**2:
        weight = 'Unavailable'
    if weight == (1/20000)**2:
        weight = 'Constant'
    if weight == (1/1000)**2:
        weight = 'Estimate'
    if weight == (1/50)**2:
        weight = '>20%'
    if weight == (1/15)**2:
        weight = '20> >10%'
    if weight == (1/10)**2:
        weight = '<10%'
    return weight



plot_data_list = []

In [None]:

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Machine learning!
#print('Can you get to the start of ML?')

## start machine learning
#total_1oerr2 = 0
#total_1oerr = 0
#total_err = 0
#total_err2 = 0
#total_1orterr = 0
#total_nowt = 0
#total_MLP = 0
counter = 0
pipe_container = []

total = 0
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, lognorm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import export_graphviz
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import VotingRegressor
#>>> from sklearn.pipeline import Pipeline
#>>> import numpy as np
flag = False
#kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
# learn on each molecule
for key, data in molecule_list.items():
    print(len(data[1]))
    #print(key)
    #molecule, broadener = key.split('_')
    # print out which molecule we're looking at
    #counter += 1
    #if counter < 1:
    #    continue
    
    #distributions = dict(histgradientboostingregressor__loss=['least_squares', 'least_absolute_deviation', 'poisson'],# 'quantile'],
    #                      histgradientboostingregressor__learning_rate=lognorm(2),         histgradientboostingregressor__learning_rate=uniform(loc=0.2, scale=0.9))
    #                      #histgradientboostingregressor__max_iter=np.random.randint(10, 1000, 1000),       #lognorm(1.5, scale=300),
    #                      #histgradientboostingregressor__max_leaf_nodes=np.random.randint(2, 100, 1000),              #uniform(loc=0, scale=300),
    #                      #histgradientboostingregressor__l2_regularization=uniform(loc=0, scale=10)) 
    


    #distributions = dict(histgradientboostingregressor__min_samples_leaf=[1000, 100, 20, 5, 1])
    #distributions = dict(mlpregressor__hidden_layer_sizes=[(100, 100), (1000, 1000), (5000, 500), (500, 5000)], 
    #                     mlpregressor__activation=['tanh', 'relu'],
    #                     mlpregressor__alpha=[0.0001, 0.001, 0.01])
    
    #if key == 'HO2':
    #    flag=True
    #elif key == 'NO':
    #    flag=False
    #print(molecule)
    #if flag == False:
    #    continue
    
    
    # take out training and test data
    data_train = data[0]
    data_test = data[1]
    
    # Check if there are null values
    #if data_train.isnull().values.any():
    #    raise ValueError("We're getting null values here, might be good to cut them out")

    # shuffle data, randomised lines of each molecule for machine learning
    print(len(data_train))
    data_test = data_test.sample(frac=1)
    print("lets train!!!")
    data_train = data_train.sample(frac=0.01)
    

    #print('data splitted')



    # Training data - separate out all x values and y (broadening) values.  gamma-err is weighting
    X_train = data_train.drop(['gamma', 'gamma-err'], axis=1)
    y_train = data_train['gamma']
    weight_train = data_train['gamma-err']
    #weight_train_1ovsquare = data_train['gamma_air-err']
    #weight_train_1overr = np.sqrt(data_train['gamma_air-err'])
    #weight_train_err = 1/np.sqrt(data_train['gamma_air-err'])
    #weight_train_err_sq = 1/data_train['gamma_air-err']
    #weight_train_1ovsq_root = np.sqrt(np.sqrt(data_train['gamma_air-err']))
    #weight_train = np.sqrt(data_train['gamma_air-err'])
    #weight_train = np.sqrt(data_train['gamma_air-err'])
    # Separate out test data
    X_test = data_test.drop(['gamma', 'gamma-err'], axis=1)
    y_test = data_test['gamma']
    weight_test = data_test['gamma-err']
    #weight_test_1ovsquare = data_test['gamma_air-err']
    #weight_test_1overr = np.sqrt(data_test['gamma_air-err'])
    #weight_test_err = 1/np.sqrt(data_test['gamma_air-err'])
    #weight_test_err_sq = 1/data_test['gamma_air-err']
    #weight_test_1ovsq_root = np.sqrt(np.sqrt(data_test['gamma_air-err']))

    
    #pipe = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2))

    #FINALLLLL
    
    # Create pipeline of scaling, then ML method
    pipe = make_pipeline(StandardScaler(), VotingRegressor(
                                               estimators=[('hist', HistGradientBoostingRegressor()),
                                                           ('ada', AdaBoostRegressor()),
                                                           ('svr', SVR()),
                                                           ('forest', RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2)),
                                                           ('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]
                                                , n_jobs=-1, verbose=True
                                               ))

    
    '''
    pipe1 = make_pipeline(StandardScaler(), VotingRegressor(
                                               estimators=[#('hist', HistGradientBoostingRegressor(learning_rate=1, max_leaf_nodes=None, random_state=42)),
                                                           ('ada', AdaBoostRegressor()),
                                                           ('svr', SVR()),
                                                           ('forest', RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2)),
                                                           ('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]
                                                , n_jobs=-1, verbose=True
                                               ))
    pipe2 = make_pipeline(StandardScaler(), VotingRegressor(
                                               estimators=[('hist', HistGradientBoostingRegressor(learning_rate=1, max_leaf_nodes=None, random_state=42)),
                                                           #('ada', AdaBoostRegressor()),
                                                           ('svr', SVR()),
                                                           ('forest', RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2)),
                                                           ('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]
                                                , n_jobs=-1, verbose=True
                                               ))
    pipe3 = make_pipeline(StandardScaler(), VotingRegressor(
                                               estimators=[('hist', HistGradientBoostingRegressor(learning_rate=1, max_leaf_nodes=None, random_state=42)),
                                                           ('ada', AdaBoostRegressor()),
                                                           #('svr', SVR()),
                                                           ('forest', RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2)),
                                                           ('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]
                                                , n_jobs=-1, verbose=True
                                               ))
    pipe4 = make_pipeline(StandardScaler(), VotingRegressor(
                                               estimators=[('hist', HistGradientBoostingRegressor(learning_rate=1, max_leaf_nodes=None, random_state=42)),
                                                           ('ada', AdaBoostRegressor()),
                                                           ('svr', SVR()),
                                                           #('forest', RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2)),
                                                           ('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]
                                                , n_jobs=-1, verbose=True
                                               ))
    pipe5 = make_pipeline(StandardScaler(), VotingRegressor(
                                               estimators=[('hist', HistGradientBoostingRegressor(learning_rate=1, max_leaf_nodes=None, random_state=42)),
                                                           ('ada', AdaBoostRegressor()),
                                                           ('svr', SVR()),
                                                           ('forest', RandomForestRegressor(n_estimators=10, min_weight_fraction_leaf=0.001, verbose=2))]
                                                           #('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]
                                                , n_jobs=-1, verbose=True
                                               ))



    '''

    #('mlp', MLPRegressor(hidden_layer_sizes=(30, 30), alpha=0.01, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=1, tol=0.00001))]

    #pipe = make_pipeline(StandardScaler(), HistGradientBoostingRegressor(learning_rate=1, max_leaf_nodes=None, random_state=42, verbose=1))
    #pipe = make_pipeline(StandardScaler(), HistGradientBoostingRegressor(random_state=42, verbose=1))
    #pipe = RandomizedSearchCV(pipe, distributions, n_iter=5, cv=2, verbose=3)
    #pipe = make_pipeline(StandardScaler(), AdaBoostRegressor())
    #pipe = make_pipeline(StandardScaler(),  SGDRegressor(max_iter=1000, tol=1e-3))
    #pipe = make_pipeline(StandardScaler(), DummyRegressor(strategy='mean'))
    #pipe = make_pipeline(StandardScaler(), DecisionTreeRegressor(max_depth=9))
    #pipe = make_pipeline(StandardScaler(), RandomForestRegressor())
    #pipe = make_pipeline(StandardScaler(), MLPRegressor(hidden_layer_sizes=(100, 100), alpha=0.001, learning_rate='adaptive', random_state=42, verbose=1, n_iter_no_change=3, tol=0.00001))#, hidden_layer_sizes=(10, 100), alpha=0.001, learning_rate='adaptive', random_state=42, verbose=1))
    #pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3), LinearRegression(fit_intercept=False))
    #pipe = make_pipeline(StandardScaler(), SVR(C=5, epsilon=0.2))
    #pipe = make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=kernel, alpha=0.1))
    #pipe = RandomizedSearchCV(pipe, distributions, n_iter=24, cv=2, verbose=3)

    #print('Pipeline made')
    #print(pipe) 
    #pipe.fit(X_train, y_train, linearregression__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, histgradientboostingregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, sgdregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, dummyregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, adaboostregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, decisiontreeregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, randomforestregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, votingregressor__sample_weight=weight_train)
    pipe.fit(X_train, y_train)#, gaussianprocessregressor__sample_weight=weight_train)
    #pipe.fit(X_train, y_train, mlpregressor__sample_weight=weight_train)
    '''pipe1.fit(X_train, y_train)#, mlpregressor__sample_weight=weight_train)
    pipe2.fit(X_train, y_train)#, mlpregressor__sample_weight=weight_train)
    pipe3.fit(X_train, y_train)#, mlpregressor__sample_weight=weight_train)
    pipe4.fit(X_train, y_train)#, mlpregressor__sample_weight=weight_train)
    pipe5.fit(X_train, y_train)#, mlpregressor__sample_weight=weight_train)
    '''

    # Predict broadening values
    y_pred = pipe.predict(X_test)
    '''y_pred1 = pipe.predict(X_test)
    y_pred2 = pipe.predict(X_test)
    y_pred3 = pipe.predict(X_test)
    y_pred4 = pipe.predict(X_test)
    y_pred5 = pipe.predict(X_test)
    '''

    #print(pipe.best_params_)
    #y_pred2 = pipe2.predict(X_test)
    #y_pred3 = pipe3.predict(X_test)
    #y_pred4 = pipe4.predict(X_test)
    #y_pred5 = pipe5.predict(X_test)
    #y_pred6 = pipe6.predict(X_test)
    #y_pred7 = pipe7.predict(X_test)


    print('leaf nodes = '+str(pipe))

    #print('fitting done')



    # print out the scor
    score = pipe.score(X_test, y_test, sample_weight=weight_test)
    mse_score = mean_squared_error(y_pred, y_test, sample_weight=weight_test)
    
    '''score1 = pipe1.score(X_test, y_test, sample_weight=weight_test)
    score2 = pipe2.score(X_test, y_test, sample_weight=weight_test)
    score3 = pipe3.score(X_test, y_test, sample_weight=weight_test)
    score4 = pipe4.score(X_test, y_test, sample_weight=weight_test)
    score5 = pipe5.score(X_test, y_test, sample_weight=weight_test)
    '''
    #score2 = pipe2.score(X_test, y_test, weight_test_1ovsquare)
    #score3 = pipe3.score(X_test, y_test, weight_test_1ovsquare)
    #score4 = pipe4.score(X_test, y_test, weight_test_1ovsquare)
    #score5 = pipe5.score(X_test, y_test, weight_test_1ovsquare)
    #score6 = pipe6.score(X_test, y_test, weight_test_1ovsquare)
    #score7 = pipe7.score(X_test, y_test, weight_test_1ovsquare)
    
    print(key+' has score = '+str(score))
    print('mean square error = '+str(mse_score))

    print()
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    '''print('minor scores = ')
    print(score1, score2, score3, score4, score5)
    '''

    #print('score 1/err = '+str(score2))
    #print('score err = '+str(score3))
    #print('score err2 = '+str(score4))
    #print('score 1/rt(err) = '+str(score5))
    #print('score nowt = '+str(score6))
    #print('score MLP = '+str(score7))
    
    # Add up the total score - allows methods to be compared
    if score > -10:
        total += score
    #total_1oerr2 += score1
    #total_1oerr += score2
    #total_err += score3
    #total_err2 += score4
    #total_1orterr += score5
    #total_nowt += score6
    #total_MLP = score7
    
    
    # Get data into matplotlib friendly form
    y_test_plot = y_test.to_numpy()
    x_plot = X_test['M'].to_numpy()


    # prepare to plot different accuracy different colour
    #err_codes = data_test['gamma_air-err'].value_counts().sort_index()
    #data_by_vib_lev = {}
    
    '''
    cv_results = pd.DataFrame(pipe.cv_results_).sort_values("mean_test_score", ascending=False)
    column_results = [f"param_{name}" for name in distributions.keys()]
    column_results += ["mean_test_score", "std_test_score", "rank_test_score"]
    cv_results = cv_results[column_results]
    def shorten_param(param_name):
        if "__" in param_name:
            return param_name.rsplit("__", 1)[1]
        return param_name
    cv_results = cv_results.rename(shorten_param, axis=1)
    pivoted_cv_results = cv_results.pivot_table(values="mean_test_score", index=["hidden_layer_sizes"], columns=["alpha"])
    print('CV results:')
    print(cv_results)
    print('LR vs l2')
    print(pivoted_cv_results)
    '''
   
    #plot_data_list.append([key, x_plot[-100000:], y_test_plot[-100000:], y_pred[-100000:], [score, score1, score2, score3, score4, score5], mse_score, X_test['molecule_weight'][-100000:]])#, dot_data])#, pipe])
    plot_data_list.append([key, x_plot[-100000:], y_test_plot[-100000:], y_pred[-100000:], score, mse_score, X_test['molecule_weight'][-100000:]])#, dot_data])#, pipe])
    

    
    #print(pipe[1].train_score_)
    
    #pipe_container.append([pipe, pipe1, pipe2, pipe3, pipe4, pipe5])
    pipe_container.append(pipe)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#



print('total score = '+str(total))

import pickle 

with open('final_10-25_voter_data.pkl', 'wb') as f:
    pickle.dump(plot_data_list, f)

with open('final_10-25_voter_forest.pkl', 'wb') as f:
    pickle.dump(pipe_container, f)


#with open("baseline.csv", "w") as f:
#    wr = csv.writer(f)
#    wr.writerows(plot_data_list)
'''
with open('baseline_results.json', 'wb') as fp:
    json.dump(plot_data_list, fp)'''


1785734
11447098
lets train!!!
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   0.6s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   0.3s
[Voting] ...................... (3 of 5) Processing svr, total=   0.5s
[Voting] ...................... (3 of 5) Processing svr, total=   0.4s
[Voting] ...................... (2 of 5) Processing ada, total=   0.3s
[Voting] ..................... (1 of 5) Processing hist, total=   0.5s
Iteration 1, loss = 0.04263187
Iteration 2, loss = 0.00787375
Iteration 3, loss = 0.00511283
Iteratio

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


[Voting] ..................... (1 of 5) Processing hist, total=   0.4s
Iteration 1, loss = 0.04481496
Iteration 2, loss = 0.00769295
Iteration 3, loss = 0.00500346
Iteration 4, loss = 0.00399523
Iteration 5, loss = 0.00344658
Iteration 6, loss = 0.00306830
Iteration 7, loss = 0.00278538
Iteration 8, loss = 0.00259211
Iteration 9, loss = 0.00243434
Iteration 10, loss = 0.00233524
Iteration 11, loss = 0.00223212
Iteration 12, loss = 0.00216518
Iteration 13, loss = 0.00208514
Iteration 14, loss = 0.00203588
Iteration 15, loss = 0.00198505
Iteration 16, loss = 0.00194006
Iteration 17, loss = 0.00189820
Iteration 18, loss = 0.00187025
Iteration 19, loss = 0.00183887
Iteration 20, loss = 0.00181227
Iteration 21, loss = 0.00178027
Iteration 22, loss = 0.00175353
Iteration 23, loss = 0.00171243
Iteration 24, loss = 0.00169842
Iteration 25, loss = 0.00167511
Iteration 26, loss = 0.00164127
Iteration 27, loss = 0.00161744
Iteration 28, loss = 0.00160309
Iteration 29, loss = 0.00157997
Iteration 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


[Voting] ...................... (3 of 5) Processing svr, total=   0.7s
Iteration 1, loss = 0.04335570
Iteration 2, loss = 0.00713367
Iteration 3, loss = 0.00445959
Iteration 4, loss = 0.00351923
Iteration 5, loss = 0.00305078
Iteration 6, loss = 0.00279855
Iteration 7, loss = 0.00262803
Iteration 8, loss = 0.00249594
Iteration 9, loss = 0.00238249
Iteration 10, loss = 0.00229770
Iteration 11, loss = 0.00219991
Iteration 12, loss = 0.00211255
Iteration 13, loss = 0.00205265
Iteration 14, loss = 0.00200179
Iteration 15, loss = 0.00195837
Iteration 16, loss = 0.00191711
Iteration 17, loss = 0.00187672
Iteration 18, loss = 0.00183810
Iteration 19, loss = 0.00180175
Iteration 20, loss = 0.00176669
Iteration 21, loss = 0.00173442
Iteration 22, loss = 0.00171028
Iteration 23, loss = 0.00170894
Iteration 24, loss = 0.00165305
Iteration 25, loss = 0.00163181
Iteration 26, loss = 0.00162080
Iteration 27, loss = 0.00157957
Iteration 28, loss = 0.00157396
Iteration 29, loss = 0.00153247
Iteration 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


[Voting] ...................... (2 of 5) Processing ada, total=   0.7s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   0.3s
Iteration 1, loss = 0.04334129
Iteration 2, loss = 0.00697188
Iteration 3, loss = 0.00480630
Iteration 4, loss = 0.00393285
Iteration 5, loss = 0.00349347
Iteration 6, loss = 0.00317232
Iteration 7, loss = 0.00292959
Iteration 8, loss = 0.00273172
Iteration 9, loss = 0.00257306
Iteration 10, loss = 0.00244890
Iteration 11, loss = 0.00237913
Iteration 12, loss = 0.00231275
Iteration 13, loss = 0.00220230
Iteration 14, loss = 0.00215535
Iteration 15, loss = 0.00207743
Iteration 16, loss = 0.00203332
Iteration 17, loss = 0.00199199
Iteration 18, loss = 0.00196247
Iteration 19, loss = 0.00191015
Iteration 20, loss = 0.00187893
Iterat

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.2s finished


Iteration 1, loss = 0.04742902
Iteration 2, loss = 0.00784081
Iteration 3, loss = 0.00489347
Iteration 4, loss = 0.00385634
Iteration 5, loss = 0.00335650
Iteration 6, loss = 0.00304568
Iteration 7, loss = 0.00283363
Iteration 8, loss = 0.00267393
Iteration 9, loss = 0.00250503
Iteration 10, loss = 0.00239354
Iteration 11, loss = 0.00230254
Iteration 12, loss = 0.00220503
Iteration 13, loss = 0.00213623
Iteration 14, loss = 0.00206146
Iteration 15, loss = 0.00201902
Iteration 16, loss = 0.00196324
Iteration 17, loss = 0.00191958
Iteration 18, loss = 0.00187606
Iteration 19, loss = 0.00185202
Iteration 20, loss = 0.00180995
Iteration 21, loss = 0.00177683
Iteration 22, loss = 0.00174720
Iteration 23, loss = 0.00170266
Iteration 24, loss = 0.00167422
Iteration 25, loss = 0.00165641
Iteration 26, loss = 0.00162778
Iteration 27, loss = 0.00160432
Iteration 28, loss = 0.00160456
Iteration 29, loss = 0.00159061
Iteration 30, loss = 0.00153720
Iteration 31, loss = 0.00151192
Iteration 32, los

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[Voting] ...................... (3 of 5) Processing svr, total=   0.3s
[Voting] ...................... (2 of 5) Processing ada, total=   0.5s
Iteration 1, loss = 0.04381667
Iteration 2, loss = 0.00722059
Iteration 3, loss = 0.00443524
Iteration 4, loss = 0.00358052
Iteration 5, loss = 0.00315760
Iteration 6, loss = 0.00284009
Iteration 7, loss = 0.00262894
Iteration 8, loss = 0.00245318
Iteration 9, loss = 0.00235649
Iteration 10, loss = 0.00225604
Iteration 11, loss = 0.00219107
Iteration 12, loss = 0.00213292
Iteration 13, loss = 0.00207767
Iteration 14, loss = 0.00201936
Iteration 15, loss = 0.00195749
Iteration 16, loss = 0.00192706
Iteration 17, loss = 0.00187120
Iteration 18, loss = 0.00184487
Iteration 19, loss = 0.00183317
Iteration 20, loss = 0.00178834
Iteration 21, loss = 0.00174094
Iteration 22, loss = 0.00171481
Iteration 23, loss = 0.00168735
Iteration 24, loss = 0.00165992
Iteration 25, loss = 0.00164639
Iteration 26, loss = 0.00161117
Iteration 27, loss = 0.00158944
Ite

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


leaf nodes = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('votingregressor',
                 VotingRegressor(estimators=[('hist',
                                              HistGradientBoostingRegressor()),
                                             ('ada', AdaBoostRegressor()),
                                             ('svr', SVR()),
                                             ('forest',
                                              RandomForestRegressor(min_weight_fraction_leaf=0.001,
                                                                    n_estimators=10,
                                                                    verbose=2)),
                                             ('mlp',
                                              MLPRegressor(alpha=0.01,
                                                           hidden_layer_sizes=(30,
                                                                               30),
              

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.0s finished


SO_H2,COCl2_CO2,CH3I_H2O,CS2_self,CH3OH_CO2,N2O_self,C2H4_H2O,HOBr_H2,PH3_CO2,SO_H2O,HO2_self,HCN_H2O,C2N2_H2O,N2O_CO2,CO_H2,HBr_self,HI_CO2,CH3OH_H2O,H2O_self,SO2_self has score = 0.05250926635368125
mean square error = 0.01118048746785693

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1913506
11319326
lets train!!!
[Voting] ..................... (1 of 5) Processing hist, total=   2.4s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   2.9s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.9s finished


Iteration 1, loss = 0.00745637
Iteration 2, loss = 0.00206766
Iteration 3, loss = 0.00168987
Iteration 4, loss = 0.00146128
Iteration 5, loss = 0.00128251
Iteration 6, loss = 0.00112704
Iteration 7, loss = 0.00099915
Iteration 8, loss = 0.00087685
Iteration 9, loss = 0.00077836
Iteration 10, loss = 0.00069572
Iteration 11, loss = 0.00062443
Iteration 12, loss = 0.00057266
Iteration 13, loss = 0.00052517
Iteration 14, loss = 0.00049002
Iteration 15, loss = 0.00046035
Iteration 16, loss = 0.00044061
Iteration 17, loss = 0.00042298
Iteration 18, loss = 0.00040583
Iteration 19, loss = 0.00039668
Iteration 20, loss = 0.00039012
Training loss did not improve more than tol=0.000010 for 1 consecutive epochs. Stopping.
[Voting] ...................... (5 of 5) Processing mlp, total=   5.9s
[Voting] ...................... (2 of 5) Processing ada, total=   8.7s
[Voting] ...................... (3 of 5) Processing svr, total=  16.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


leaf nodes = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('votingregressor',
                 VotingRegressor(estimators=[('hist',
                                              HistGradientBoostingRegressor()),
                                             ('ada', AdaBoostRegressor()),
                                             ('svr', SVR()),
                                             ('forest',
                                              RandomForestRegressor(min_weight_fraction_leaf=0.001,
                                                                    n_estimators=10,
                                                                    verbose=2)),
                                             ('mlp',
                                              MLPRegressor(alpha=0.01,
                                                           hidden_layer_sizes=(30,
                                                                               30),
              

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


COCl2_H2O,CH3F_CO2,HOCl_CO2,NO_H2O,HOCl_self,SO_CO2,CH3CN_self,SO2_H2,HCN_H2,HCl_H2,HCl_H2O,O2_H2O,CH3I_CO2,HNO3_CO2,CO_self,ClO_H2O,C2H6_CO2,COF2_H2,CH3Br_self,CH3I_H2 has score = 0.2765093047144451
mean square error = 0.014015101588285372

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
733340
12499492
lets train!!!
[Voting] ..................... (1 of 5) Processing hist, total=   2.6s
[Voting] ...................... (2 of 5) Processing ada, total=   3.4s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   3.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.5s finished


Iteration 1, loss = 0.00782186
Iteration 2, loss = 0.00199396
Iteration 3, loss = 0.00166200
Iteration 4, loss = 0.00144528
Iteration 5, loss = 0.00124219
Iteration 6, loss = 0.00108574
Iteration 7, loss = 0.00095106
Iteration 8, loss = 0.00083933
Iteration 9, loss = 0.00074251
Iteration 10, loss = 0.00065824
Iteration 11, loss = 0.00059234
Iteration 12, loss = 0.00054201
Iteration 13, loss = 0.00049981
Iteration 14, loss = 0.00047060
Iteration 15, loss = 0.00044250
Iteration 16, loss = 0.00042307
Iteration 17, loss = 0.00040418
Iteration 18, loss = 0.00039743
Iteration 19, loss = 0.00038746
Training loss did not improve more than tol=0.000010 for 1 consecutive epochs. Stopping.
[Voting] ...................... (5 of 5) Processing mlp, total=   6.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


leaf nodes = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('votingregressor',
                 VotingRegressor(estimators=[('hist',
                                              HistGradientBoostingRegressor()),
                                             ('ada', AdaBoostRegressor()),
                                             ('svr', SVR()),
                                             ('forest',
                                              RandomForestRegressor(min_weight_fraction_leaf=0.001,
                                                                    n_estimators=10,
                                                                    verbose=2)),
                                             ('mlp',
                                              MLPRegressor(alpha=0.01,
                                                           hidden_layer_sizes=(30,
                                                                               30),
              

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


H2O_H2,N2O_H2,C2H2_self,C2H4_CO2,OCS_self,NO_H2,OH_self,C4H2_CO2,OCS_H2O,C2N2_H2,CH3Cl_H2O,HO2_H2,O2_self,C2H2_H2,CS_H2,H2S_self,H2O2_H2O,COF2_CO2,C4H2_H2O,HCOOH_H2 has score = -6.148608089362245
mean square error = 0.005095082666070238

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1648015
11584817
lets train!!!
[Voting] ...................... (3 of 5) Processing svr, total=16.4min
[Voting] ..................... (1 of 5) Processing hist, total=   2.4s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   3.0s
[Voting] ...................... (2 of 5) Processing ada, total=   3.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.0s finished


Iteration 1, loss = 0.00733907
Iteration 2, loss = 0.00199157
Iteration 3, loss = 0.00164603
Iteration 4, loss = 0.00141405
Iteration 5, loss = 0.00122996
Iteration 6, loss = 0.00108704
Iteration 7, loss = 0.00096323
Iteration 8, loss = 0.00084884
Iteration 9, loss = 0.00075344
Iteration 10, loss = 0.00066809
Iteration 11, loss = 0.00059802
Iteration 12, loss = 0.00054123
Iteration 13, loss = 0.00049872
Iteration 14, loss = 0.00046327
Iteration 15, loss = 0.00044017
Iteration 16, loss = 0.00041188
Iteration 17, loss = 0.00039530
Iteration 18, loss = 0.00038046
Iteration 19, loss = 0.00036947
Iteration 20, loss = 0.00036048
Iteration 21, loss = 0.00035310
Training loss did not improve more than tol=0.000010 for 1 consecutive epochs. Stopping.
[Voting] ...................... (5 of 5) Processing mlp, total=   6.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


leaf nodes = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('votingregressor',
                 VotingRegressor(estimators=[('hist',
                                              HistGradientBoostingRegressor()),
                                             ('ada', AdaBoostRegressor()),
                                             ('svr', SVR()),
                                             ('forest',
                                              RandomForestRegressor(min_weight_fraction_leaf=0.001,
                                                                    n_estimators=10,
                                                                    verbose=2)),
                                             ('mlp',
                                              MLPRegressor(alpha=0.01,
                                                           hidden_layer_sizes=(30,
                                                                               30),
              

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


NH3_H2O,OH_H2O,C2H2_H2O,HF_CO2,CH3OH_self,CO2_CO2,HF_self,CH3Br_H2O,C2H6_H2O,HOCl_H2O,H2O2_H2,H2S_H2O,GeH4_self,CO2_H2,HI_self,ClO_CO2,COCl2_self,C2H6_self,C2H2_CO2,CO2_H2O has score = 0.25525074559830063
mean square error = 0.004154588260896388

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1508498
11724334
lets train!!!
[Voting] ..................... (1 of 5) Processing hist, total=   2.4s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   3.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.1s finished


[Voting] ...................... (2 of 5) Processing ada, total=   4.8s
Iteration 1, loss = 0.00720703
Iteration 2, loss = 0.00205356
Iteration 3, loss = 0.00167835
Iteration 4, loss = 0.00144158
Iteration 5, loss = 0.00126359
Iteration 6, loss = 0.00111726
Iteration 7, loss = 0.00098199
Iteration 8, loss = 0.00087506
Iteration 9, loss = 0.00077677
Iteration 10, loss = 0.00068633
Iteration 11, loss = 0.00061738
Iteration 12, loss = 0.00056749
Iteration 13, loss = 0.00051808
Iteration 14, loss = 0.00048385
Iteration 15, loss = 0.00045642
Iteration 16, loss = 0.00043393
Iteration 17, loss = 0.00042166
Iteration 18, loss = 0.00040191
Iteration 19, loss = 0.00038855
Iteration 20, loss = 0.00037749
Iteration 21, loss = 0.00036965
Iteration 22, loss = 0.00036044
Training loss did not improve more than tol=0.000010 for 1 consecutive epochs. Stopping.
[Voting] ...................... (5 of 5) Processing mlp, total=   6.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s finished


[Voting] ...................... (3 of 5) Processing svr, total=  16.5s
leaf nodes = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('votingregressor',
                 VotingRegressor(estimators=[('hist',
                                              HistGradientBoostingRegressor()),
                                             ('ada', AdaBoostRegressor()),
                                             ('svr', SVR()),
                                             ('forest',
                                              RandomForestRegressor(min_weight_fraction_leaf=0.001,
                                                                    n_estimators=10,
                                                                    verbose=2)),
                                             ('mlp',
                                              MLPRegressor(alpha=0.01,
                                                           hidden_layer_sizes=(30,
                           

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s finished


CH3Br_H2,C2N2_self,CO2_self,CH3F_H2O,H2_H2,H2CO_self,CH3F_self,C2H4_self,PH3_self,OH_CO2,NO2_self,H2O2_self,HC3N_CO2,OCS_H2,C2H6_H2,COF2_H2O,H2CO_H2O,H2CO_CO2,ClO_H2,HC3N_self has score = 0.762019981635773
mean square error = 0.006644792386343759

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1213605
12019227
lets train!!!
[Voting] ..................... (1 of 5) Processing hist, total=   2.5s
building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Voting] ................... (4 of 5) Processing forest, total=   3.3s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.3s finished


[Voting] ...................... (2 of 5) Processing ada, total=   4.2s
Iteration 1, loss = 0.00780819
Iteration 2, loss = 0.00211886
Iteration 3, loss = 0.00176885
Iteration 4, loss = 0.00154204
Iteration 5, loss = 0.00133818
Iteration 6, loss = 0.00116561
Iteration 7, loss = 0.00102824
Iteration 8, loss = 0.00090443
Iteration 9, loss = 0.00080400
Iteration 10, loss = 0.00072061
Iteration 11, loss = 0.00064284
Iteration 12, loss = 0.00058656
Iteration 13, loss = 0.00053452
Iteration 14, loss = 0.00050168
Iteration 15, loss = 0.00047321
Iteration 16, loss = 0.00045214
Iteration 17, loss = 0.00042997
Iteration 18, loss = 0.00041396
Iteration 19, loss = 0.00040199
Iteration 20, loss = 0.00038670
Iteration 21, loss = 0.00037947
Iteration 22, loss = 0.00037241
Training loss did not improve more than tol=0.000010 for 1 consecutive epochs. Stopping.
[Voting] ...................... (5 of 5) Processing mlp, total=   6.7s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished


leaf nodes = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('votingregressor',
                 VotingRegressor(estimators=[('hist',
                                              HistGradientBoostingRegressor()),
                                             ('ada', AdaBoostRegressor()),
                                             ('svr', SVR()),
                                             ('forest',
                                              RandomForestRegressor(min_weight_fraction_leaf=0.001,
                                                                    n_estimators=10,
                                                                    verbose=2)),
                                             ('mlp',
                                              MLPRegressor(alpha=0.01,
                                                           hidden_layer_sizes=(30,
                                                                               30),
              