## Feature Extraction
### Important Features:
- Functional group  **// Done**
- Adsorption energy **// Already Available**
- Metal type **// Done**
- Void volume & void fraction & density  **// Already Available**
- GSA **// Already Available**
- VSA **// Done**
- Pore diameter
- Selectivity **// Already Available**
- Metal + Topology --> Open active site of metal  //Optional


## Import Files

In [None]:
import numpy as np
import pandas as pd

# Load the training table; the feature-engineering cells below add columns to this frame.
TRAIN_CSV = '/work/tmlcc2021/data/train_rm_keepfg.csv'
file = pd.read_csv(TRAIN_CSV)
#file[file.functional_groups.isnull()]

### 1. Calculation of Volumetric Surface Area (VSA; m^2/cm^3)

In [None]:
## Calculation of VSA from GSA
# VSA [m^2/cm^3] = GSA [m^2/g] * cell weight [g] / cell volume [cm^3]

U_TO_GRAM = 1.6605300000013*(10**-24)   # factor applied to 'weight [u]' to express it in grams
A3_TO_CM3 = (10**-24)                   # factor applied to 'volume [A^3]' to express it in cm^3

sa_num = file['surface_area [m^2/g]'].to_numpy()
weight_num = U_TO_GRAM * file['weight [u]'].to_numpy()
vol_num = A3_TO_CM3 * file['volume [A^3]'].to_numpy()

# Surface area of one cell [m^2], then normalised by that cell's volume.
VSA = (sa_num * weight_num) / vol_num
file.insert(loc=4, column='volumetric_sa [m^2/cm^3]', value=VSA)

### 2. Assign Numbers to Functional Groups

In [None]:
## Convert functional_groups strings into numbers

# Ordered vocabulary of functional-group labels; a label's position in the list is its code.
_FUNCTIONAL_GROUP_LIST = ['None','F','Cl','Br','I','Me','Et','Pr','HCO','COOH','OH','OMe','OEt',
                          'OPr','NH2','CN','NHMe','NO2','Ph','SO3H','H']
# Built once here rather than inside the function, which is called once per table row.
_FUNCTIONAL_GROUP_DICT = {fg: i for i, fg in enumerate(_FUNCTIONAL_GROUP_LIST)}

def get_func_group_number(functional_group):
    """
    Convert a functional-group string into a pair of integer codes.

    The string is split on '-'. The first and second parts are looked up in the
    functional-group vocabulary; any parts after the second are ignored. When the
    string holds a single label, that label's code is returned for both positions.

    Parameters
    ----------
    functional_group : str
        One label (e.g. 'F') or two labels joined by '-' (e.g. 'F-Cl').

    Returns
    -------
    tuple of (int, int)
        Codes of the first and second functional group.

    Raises
    ------
    KeyError
        If a label is not in the vocabulary.
    """
    parts = functional_group.split('-')
    fg_1 = _FUNCTIONAL_GROUP_DICT[parts[0]]
    # A lone label is duplicated so that both output columns are always populated.
    fg_2 = fg_1 if len(parts) == 1 else _FUNCTIONAL_GROUP_DICT[parts[1]]
    return fg_1, fg_2

# Rows without a functional group get the literal label 'None' so they can be looked up.
# `.loc` is used because the row selector is a boolean mask; `.at` only addresses one scalar cell.
file.loc[file.functional_groups.isnull(), 'functional_groups'] = 'None'
functional_group = file['functional_groups']
fg_num = functional_group.to_numpy()

# One (first, second) code pair per row, stored as integers rather than Python objects.
# reshape(-1, 2) keeps the two-column shape even when the table is empty.
fg_codes = np.array([get_func_group_number(fg) for fg in fg_num], dtype=int).reshape(-1, 2)
fg_1 = fg_codes[:, 0]
fg_2 = fg_codes[:, 1]

file.insert(loc=9, column='functional_group_1', value=fg_1)
file.insert(loc=10, column='functional_group_2', value=fg_2)
#file.drop(['organic_linker1', 'organic_linker2', 'functional_groups'], axis=1)
#file
#file[file.functional_groups.isin(['None'])]

### 3. Assign Number to Metal Elements

In [None]:
## Extract element of metal from metal linker

# List of Metal from Metal Linker Number
# In the train set:
# Zn = 1,3  //Group 12
# Cu = 2    //Group 11
# V  = 9    //Group 5
# Ba = 10   //Group 2  - S-block
# Ni = 12   //Group 10
# Cr = 4    //Group 6
# Not in train set:
# Cd = 5    //Group 12
# Mn = 6    //Group 7
# Zr = 7    //Group 4
# Al = 8    //Group 13 - P-block
# In = 11   //Group 13 - P-block

# It seems like each metal element has unique number, except for Zn. --> Let's assign that:

# metal_linker code -> periodic-table group of the metal.
METAL_LINKER_TO_GROUP = {
    1: 12, 3: 12, 5: 12,   # Zn (two codes) and Cd
    8: 13, 11: 13,         # Al, In
    2: 11,                 # Cu
    9: 5,                  # V
    10: 2,                 # Ba
    12: 10,                # Ni
    4: 6,                  # Cr
    6: 7,                  # Mn
    7: 4,                  # Zr
}

metal_linker = file['metal_linker']
mt_num = metal_linker.to_numpy()

# Vectorised lookup; any code that is missing from the table comes back as NaN.
metal_group = metal_linker.map(METAL_LINKER_TO_GROUP)
if metal_group.isnull().any():
    raise Exception("Out-of-range number found.")
# Same dtype as the source codes, as the original zeros_like(mt_num) buffer had.
metal_group = metal_group.to_numpy().astype(mt_num.dtype)

# Zn appears under two codes (1 and 3); fold 3 into 1 so each element has a single number.
# `replace` returns a new Series, so nothing is written back through file['metal_linker']
# and pandas no longer emits SettingWithCopyWarning.
metal_element = metal_linker.replace(3, 1)

#file['metal_element'] = metal_element
file.insert(loc=9, column='metal_group', value=metal_group)
file.insert(loc=8, column='metal_element', value=metal_element)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 4. Metal And Topology 

In [None]:
# Ordered vocabulary of topology labels; a label's position in the list is its code.
_TOPOLOGY_GROUP_LIST = ['pcu','etb','sra','acs','fof','bcu','nbo','tbo','pts','the','rht']
# Built once, and indexed with enumerate so the codes always track the list length.
_TOPOLOGY_GROUP_DICT = {name: idx for idx, name in enumerate(_TOPOLOGY_GROUP_LIST)}

def get_topology_group_number(ar):
    """
    Convert a topology label into its integer code.

    Parameters
    ----------
    ar : str
        Topology label, e.g. 'pcu'.

    Returns
    -------
    int
        Position of the label in the topology vocabulary (0-based).

    Raises
    ------
    KeyError
        If the label is not in the vocabulary.
    """
    return _TOPOLOGY_GROUP_DICT[ar]
tp = file['topology']
tp_num = tp.to_numpy()
# Encode every label into an integer array; a zeros_like buffer taken from the
# string column would have object dtype and carry that into the new column.
tp_1 = np.array([get_topology_group_number(label) for label in tp_num], dtype=int)
file.insert(loc=7, column='topology_num', value=tp_1)

## Refine & Save File

In [None]:
# Remove the raw categorical columns; their numeric encodings were inserted by the cells above.
file = file.drop(columns=['functional_groups','metal_linker','topology'])
file

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],volumetric_sa [m^2/cm^3],void_fraction,void_volume [cm^3/g],topology_num,metal_element,metal_group,functional_group_1,functional_group_2,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_2,2769.503842,2211.697211,603.61,800.436438,0.13794,0.1040,1,10,2,1,11,44,57,33.616780,7.147286,101.224774
1,mof_unit_3,1089.818728,773.687960,788.50,929.522690,0.14874,0.1262,0,2,11,11,9,22,24,19.263726,6.347967,118.987011
2,mof_unit_4,2205.198301,1304.638720,1441.53,1416.162290,0.21814,0.2220,2,9,5,20,19,17,24,25.701377,6.190085,187.626004
3,mof_unit_6,3954.659761,1543.027680,2430.55,1574.765449,0.37094,0.5725,2,9,5,7,17,7,23,17.146541,5.398304,55.786959
4,mof_unit_7,3565.914939,1954.749656,1530.02,1392.720602,0.33337,0.3662,1,10,2,14,14,53,55,18.363791,6.303857,111.690462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54069,mof_unit_66520,1426.479810,1272.451540,1343.62,1990.209562,0.30190,0.2038,0,1,12,5,4,12,21,5.867674,4.485481,7.602105
54070,mof_unit_66521,23943.701366,5497.752320,4182.24,1594.591926,0.66340,1.7399,0,1,12,8,5,9,27,4.060772,3.605688,2.675231
54071,mof_unit_66522,14389.971556,4396.164320,4149.64,2105.092235,0.57051,1.1246,0,1,12,13,2,9,20,4.313411,3.361233,-1.686092
54072,mof_unit_66523,16997.806645,3932.703680,4326.62,1662.240347,0.66963,1.7430,6,2,11,15,15,2,5,3.447440,2.781566,-7.546805


In [None]:
# Write the engineered feature table to disk without the row index.
PREPROCESSED_CSV = 'train_preprocessed_021021_1.csv'
file.to_csv(PREPROCESSED_CSV, index=False)

## ---------------------------- Scratch Notes Below (Exploratory, Not Part of the Pipeline) ----------------------------

In [None]:
file2 = pd.read_csv('/work/tmlcc2021/data/train.csv')
#file2.info()

# Rows with no functional group recorded ...
no_fg_mask = file2['functional_groups'].isnull()
file3 = file2.loc[no_fg_mask]

# ... and, among those, the ones whose CO2 working capacity exceeds 400 mL/g.
file3.loc[file3['CO2_working_capacity [mL/g]'] > 400]
#wc_mean = np.mean(file2['CO2_working_capacity [mL/g]'])
#wc_mean
#vv_mean = np.mean(file2['void_volume [cm^3/g]'])
#vv_mean

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
3917,mof_unit_3918,1827.789599,903.3878,2739.69,0.29754,0.3625,,3,10,29,pcu,28.51285,6.605597,444.282729
6659,mof_unit_6660,1863.278679,1176.53272,0.0,0.15438,0.1472,,9,4,16,sra,95.122577,7.559968,443.604265
7094,mof_unit_7095,2551.539437,1524.9276,689.49,0.13021,0.1312,,9,18,18,sra,55.423279,7.739193,594.614805
9593,mof_unit_9594,2128.61463,1424.81024,0.0,0.09902,0.0891,,9,2,18,sra,294.165168,9.796693,502.141564
14823,mof_unit_14824,2260.320252,1378.50784,1311.38,0.21174,0.2091,,3,14,14,pcu,35.902951,6.958965,420.123446
24498,mof_unit_24499,2222.286822,1224.57552,1747.16,0.21143,0.2311,,9,1,18,sra,92.403646,7.208212,583.749632
24942,mof_unit_24943,2417.406369,1324.69288,2037.27,0.22062,0.2424,,9,16,16,sra,66.536681,6.588408,532.470418
25500,mof_unit_25501,4147.156231,3042.34936,922.77,0.1604,0.1317,,4,22,22,acs,52.556515,8.142269,424.921892
31233,mof_unit_31234,4082.97483,3030.25408,884.52,0.15989,0.1297,,4,22,24,acs,56.44063,7.756242,413.728711
32629,mof_unit_32630,2211.455594,1424.81024,0.0,0.09693,0.0906,,9,16,18,sra,110.448601,8.534895,453.355832


In [None]:
TRAIN_RM_CSV = '/work/tmlcc2021/data/train_rm.csv'
_file = pd.read_csv(TRAIN_RM_CSV)
# Drop the first column of the frame that was just read. The earlier version sliced
# `file` (the processed feature table) instead, which threw away the read above and
# overwrote train_rm.csv with the processed table.
# NOTE(review): presumably the first column is a saved row index -- confirm before running.
# NOTE: this rewrites its own input, so every re-run strips one more column.
_file = _file.iloc[:, 1:]
_file.to_csv(TRAIN_RM_CSV, index=False)

In [None]:
# functional_group = file['functional_groups'].unique()
# functional_group
# metal_linker = file['metal_linker'].unique()
# metal_linker
# vv_mean = np.mean(file['void_volume [cm^3/g]'])
# vv_mean

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=80c4b0c0-263f-4469-9860-e4bc52f80dd5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>