In [9]:
import os
from math import log10

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Baseline Models
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor

## Import Data

In [20]:
def import_data(training_file, attributes=None, test=True):
    """Load a dataset from CSV, integer-encode categorical columns, optionally split it.

    Parameters
    ----------
    training_file : str or file-like
        Passed straight to ``pd.read_csv``; the first CSV column becomes the index.
    attributes : sequence of str, optional
        Columns to keep (in this order). ``None`` or an empty sequence keeps
        every column.
    test : bool, default True
        If True, treat the LAST column as the target and every other column
        as a feature, then return an 80/20 train/test split.
        If False, return the whole (encoded) frame unsplit.

    Returns
    -------
    list or pandas.DataFrame
        ``[X_train, X_test, y_train, y_test]`` when ``test`` is True,
        otherwise the full DataFrame.

    Notes
    -----
    'functional_groups' and 'topology' are encoded with ``pd.factorize`` on
    the file being loaded, so the integer codes depend on the order of
    appearance in that file.
    NOTE(review): codes produced for two different files are therefore not
    guaranteed to agree — confirm this is acceptable for train vs. inference data.
    """
    df = pd.read_csv(training_file, index_col=0)

    # Explicit None/length check: a bare truth test is ambiguous for
    # array-like inputs (numpy arrays, pandas Index).
    if attributes is not None and len(attributes) > 0:
        # .copy() so the column assignments below write to an independent
        # frame rather than a selection of the frame read from disk.
        df = df[attributes].copy()

    # Replace string categories by integer codes (order of first appearance).
    for categorical in ('functional_groups', 'topology'):
        if categorical in df.columns:
            df[categorical] = pd.factorize(df[categorical])[0]

    if not test:
        return df

    # Features are every column except the last; the last column is the target.
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Load the training CSV; import_data defaults to test=True, so it returns
# the train/test partitions produced by train_test_split.
X_train, X_test, y_train, y_test = import_data('data/train_rm.csv')
# Show the training features as the cell output.
X_train

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
29090,3791.779389,2133.358400,1612.89,0.27012,0.2891,201,9,7,17,2,50.793877,6.625632
37393,1565.035655,929.669790,1609.70,0.27302,0.2768,238,2,14,18,1,17.721064,5.956794
53137,1138.774874,743.588840,1276.96,0.19782,0.1824,265,2,1,14,1,28.696494,6.868172
13652,1584.394923,813.756440,2213.31,0.25822,0.3028,33,2,13,24,1,12.145572,5.654744
59979,1629.437263,947.356000,1661.97,0.26501,0.2745,315,3,13,15,1,22.687045,6.079344
...,...,...,...,...,...,...,...,...,...,...,...,...
14346,2033.040809,1162.103640,1508.15,0.20282,0.2137,316,2,18,22,1,33.916261,6.791042
56577,1704.847773,941.346610,2006.45,0.28968,0.3159,112,3,16,22,1,13.766825,5.390857
48280,1723.586768,813.389600,2423.84,0.28908,0.3689,370,3,1,26,1,16.658075,5.502273
1090,2722.772757,2505.236730,1016.83,0.25378,0.1661,51,2,16,27,1,32.399418,6.323016


## Train

In [5]:
# Baseline regressors, all with library-default hyperparameters.
# The stochastic estimators are seeded (same seed as the train/test split)
# so that re-running the notebook reproduces the reported scores.
svr = SVR()
rf = RandomForestRegressor(random_state=42)
gpr = GaussianProcessRegressor()
mlp = MLPRegressor(random_state=42)

In [6]:
# Fit the support-vector regressor on the training partition.
# NOTE(review): the features are passed unscaled; SVR is usually scale-sensitive — confirm this is intended.
svr.fit(X_train, y_train)

SVR()

In [40]:
# Fit the random-forest regressor on the training partition.
rf.fit(X_train, y_train)

RandomForestRegressor()

In [None]:
# Fit the Gaussian-process regressor on the training partition.
# NOTE(review): this cell carries no execution count in the saved notebook, so gpr may never have been fitted.
gpr.fit(X_train, y_train)

In [None]:
# Fit the multi-layer-perceptron regressor on the training partition.
# NOTE(review): this cell carries no execution count in the saved notebook, so mlp may never have been fitted.
mlp.fit(X_train, y_train)

## Prediction Results

In [7]:
# SVR predictions for the held-out partition.
svr_pred = svr.predict(X_test)

In [41]:
# Random-forest predictions for the held-out partition.
rf_pred = rf.predict(X_test)

In [10]:
# Score the SVR: base-10 logarithm of the mean absolute error on the held-out partition.
svr_lmae = log10(mean_absolute_error(y_test, svr_pred))
svr_lmae

1.6629801794313404

In [43]:
# Score the random forest: base-10 logarithm of the mean absolute error on the held-out partition.
rf_lmae = log10(mean_absolute_error(y_test, rf_pred))
rf_lmae

1.2802461491231447

## Real Test Set

In [22]:
# Load the pretest CSV as one frame; test=False skips the train/test split.
# NOTE(review): import_data factorizes 'functional_groups' and 'topology' per file,
# so the integer codes here may not match the codes the models saw in training — verify.
X_pretest = import_data('data/pretest.csv', test=False)
X_pretest

Unnamed: 0_level_0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
MOFname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
mof_unit_pretest_1,6288.293858,2271.687140,4148.48,0.41225,0.6872,0,4,7,27,0,14.048791,5.604779
mof_unit_pretest_2,1790.506437,887.747320,2191.34,0.30231,0.3672,1,2,4,26,1,20.217222,6.148776
mof_unit_pretest_3,2348.969203,1239.765880,2030.88,0.28533,0.3256,2,3,18,22,1,33.108662,6.164397
mof_unit_pretest_4,2941.571525,1147.951400,3587.13,0.41963,0.6475,3,2,8,15,1,12.800562,5.164957
mof_unit_pretest_5,705.397601,643.270740,0.00,0.07060,0.0466,4,3,10,22,1,23.395617,7.090687
...,...,...,...,...,...,...,...,...,...,...,...,...
mof_unit_pretest_1996,5111.109714,1578.082220,3630.31,0.58981,1.1504,238,2,1,14,3,4.778698,3.675003
mof_unit_pretest_1997,911.269336,481.279680,2546.02,0.36132,0.4120,250,2,10,20,1,6.883387,3.732121
mof_unit_pretest_1998,4236.596494,1127.792600,4296.42,0.60298,1.3641,215,2,7,20,1,5.270734,3.354425
mof_unit_pretest_1999,22861.645381,3492.712720,6252.01,0.75732,2.9852,329,2,6,11,3,3.172914,2.643592


In [24]:
# SVR predictions for the pretest set (the submission frame built later uses rf_pred_pretest instead).
svr_pred_pretest = svr.predict(X_pretest)

In [46]:
# Random-forest predictions for the pretest set, with any negative
# prediction replaced by zero.
rf_pred_pretest = rf.predict(X_pretest)
rf_pred_pretest = np.where(rf_pred_pretest < 0, 0, rf_pred_pretest)

In [47]:
# Inspect the clipped random-forest predictions.
rf_pred_pretest

array([ 99.88007382, 151.00482055, 197.59369954, ...,   0.        ,
         0.        ,  84.53306563])

In [48]:
# Build the submission table from the random-forest predictions.
# Each index label of X_pretest looks like 'mof_unit_pretest_1'; the submission
# id drops the first 9 characters ('mof_unit_'), leaving 'pretest_1'.
ID_PREFIX_LEN = len('mof_unit_')
# Named pretest_ids rather than `id` so the builtin id() is not shadowed.
pretest_ids = [mof_name[ID_PREFIX_LEN:] for mof_name in X_pretest.index]
submit_pretest = pd.DataFrame(data={'id': pretest_ids, 'CO2_working_capacity [mL/g]': rf_pred_pretest})

In [49]:
# Inspect the submission table before writing it out.
submit_pretest

Unnamed: 0,id,CO2_working_capacity [mL/g]
0,pretest_1,99.880074
1,pretest_2,151.004821
2,pretest_3,197.593700
3,pretest_4,63.166817
4,pretest_5,147.524465
...,...,...
1995,pretest_1996,2.248851
1996,pretest_1997,3.238563
1997,pretest_1998,0.000000
1998,pretest_1999,0.000000


In [50]:
# to_csv does not create missing parent folders, so make sure 'results/' exists first.
os.makedirs('results', exist_ok=True)
# Write the submission without the positional index column.
submit_pretest.to_csv('results/submission.csv', sep=',', index=False)