# Features Extraction
**Disclaimer:** This notebook was saved from DeepNote.

### Important Features:
- Functional group  **// Done**
- Adsorption energy **// Already Available**
- Metal type **// Done**
- Void volume & void fraction & density  **// Already Available**
- GSA **// Already Available**
- VSA **// Done**
- Selectivity **// Already Available**


## Import Files

In [None]:
import numpy as np
import pandas as pd

# Load the raw table for this run (the test split, per the path) into a DataFrame.
file = pd.read_csv('/work/tmlcc2021/data/test.csv')
file  # bare expression: the notebook renders the DataFrame as the cell output

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,pcu,36.639791,7.005640
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,acs,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,pcu,13.062850,5.045400
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,pcu,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,pcu,12.974954,5.287639
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,nbo,4.536626,3.146698
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,nbo,6.745508,3.658871
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,pcu,4.666206,3.593052
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,pcu,4.823305,3.454497


### 1. Calculation of Volumetric Surface Area (VSA; m^2/cm^3)

In [None]:
## Volumetric surface area (VSA) derived from the gravimetric one (GSA)

# Unit conversion factors: atomic mass unit -> gram, cubic angstrom -> cm^3.
gram_per_u = 1.6605300000013*(10**-24)
cm3_per_A3 = (10**-24)

gsa_m2_per_g = file['surface_area [m^2/g]'].to_numpy()
mass_g = gram_per_u*file['weight [u]'].to_numpy()
volume_cm3 = cm3_per_A3*file['volume [A^3]'].to_numpy()

# (m^2/g * g) / cm^3  ->  m^2/cm^3
VSA = (gsa_m2_per_g*mass_g)/volume_cm3

# Place the new column at position 4, i.e. right after the gravimetric surface area.
file.insert(loc=4, column='volumetric_sa [m^2/cm^3]', value=VSA)

### 2. Assign Numbers to Functional Groups & Functional Group Classification

In [None]:
## Convert functional_groups strings into numbers

def get_func_group_number(functional_group):
    """
    Translate a functional-group label into a pair of integer codes.

    The label is either a single group name or two names joined by '-'
    (for example 'CN-OH'). Each name is replaced by its position in the
    fixed list of known groups below. A single-name label yields the same
    code twice; only the first two names of a label are looked at.
    A name that is not in the list raises KeyError.
    """
    known_groups = (
        'None', 'F', 'Cl', 'Br', 'I', 'Me', 'Et', 'Pr', 'HCO', 'COOH', 'OH',
        'OMe', 'OEt', 'OPr', 'NH2', 'CN', 'NHMe', 'NO2', 'Ph', 'SO3H', 'H',
    )
    code_of = {name: code for code, name in enumerate(known_groups)}

    names = functional_group.split('-')
    first = code_of[names[0]]
    second = code_of[names[1]] if len(names) > 1 else first
    return first, second

def func_code(functional_group):
    '''
    Count, per chemical class, the functional groups named in a label.

    The label is either a single group name or two names joined by '-'
    (for example 'OH-NO2'); only the first two names are looked at.

    Returns a tuple of five plain ints ordered as
    (halo, hc, o, n, aro): the number of named groups that fall in the
    halogen, hydrocarbon, O-containing, N-containing and aromatic class
    respectively. 'None' and 'H' belong to no class and add nothing.
    A name that is not listed below raises KeyError.
    '''
    # Order of this tuple fixes the order of the returned counts.
    class_members = (
        ('F', 'Cl', 'Br', 'I'),                                          # halo
        ('Me', 'Et', 'Pr'),                                              # hc
        ('HCO', 'COOH', 'OH', 'OMe', 'OEt', 'OPr', 'NO2', 'SO3H'),       # o
        ('NH2', 'NHMe', 'CN'),                                           # n
        ('Ph',),                                                         # aro
    )
    unclassified = ('None', 'H')
    n_classes = len(class_members)

    # One-hot vector of the class each group name belongs to.
    one_hot = {}
    for position, members in enumerate(class_members):
        vector = tuple(int(k == position) for k in range(n_classes))
        for name in members:
            one_hot[name] = vector
    for name in unclassified:
        one_hot[name] = (0,) * n_classes

    names = functional_group.split('-')
    counts = one_hot[names[0]]
    if len(names) > 1:
        # Element-wise sum in pure Python so that the result stays plain int
        # whether the label names one group or two.
        counts = tuple(a + b for a, b in zip(counts, one_hot[names[1]]))
    return counts

# Rows whose functional_groups entry is missing (NaN) are labelled 'None' so
# that get_func_group_number / func_code find an entry for them.
# .loc is the indexer for boolean-mask assignment; .at is for single cells only.
file.loc[file['functional_groups'].isnull(), 'functional_groups'] = 'None'
functional_group = file['functional_groups']
fg_num = functional_group.to_numpy()

# One row per table entry: the two group codes and the five class counts.
# Explicit integer arrays keep the new columns numeric rather than
# object-typed; reshape keeps the shapes valid for an empty table as well.
fg_codes = np.array(
    [get_func_group_number(label) for label in fg_num], dtype=int
).reshape(-1, 2)
fg_classes = np.array(
    [func_code(label) for label in fg_num], dtype=int
).reshape(-1, 5)
fg_1, fg_2 = fg_codes.T
fg_3, fg_4, fg_5, fg_6, fg_7 = fg_classes.T

file.insert(loc=9, column='functional_group_1', value=fg_1)
file.insert(loc=10, column='functional_group_2', value=fg_2)
file.insert(loc=11, column='fg_class_halo', value=fg_3)
file.insert(loc=12, column='fg_class_hc', value=fg_4)
file.insert(loc=13, column='fg_class_o', value=fg_5)
file.insert(loc=14, column='fg_class_n', value=fg_6)
file.insert(loc=15, column='fg_class_aro', value=fg_7)
#file.drop(['organic_linker1', 'organic_linker2', 'functional_groups'], axis=1)
file
#file[file.functional_groups.isin(['None'])]

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],volumetric_sa [m^2/cm^3],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,functional_group_1,...,fg_class_halo,fg_class_hc,fg_class_o,fg_class_n,fg_class_aro,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,643.004178,0.11392,0.1039,OEt,2,12,...,0,0,1,0,0,5,26,pcu,36.639791,7.005640
1,mof_unit_68615,4126.414623,3733.65779,852.49,1280.848210,0.21367,0.1422,H-I,4,20,...,1,0,0,0,0,6,17,acs,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,2443.918878,0.33883,0.4375,CN-OH,3,15,...,0,0,1,1,0,11,17,pcu,13.062850,5.045400
3,mof_unit_68617,2436.629312,995.80232,3521.09,2389.502677,0.40464,0.5963,OMe,2,11,...,0,0,1,0,0,1,28,pcu,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.53800,2678.46,1904.618116,0.38959,0.5479,NO2-Pr,3,17,...,0,1,1,0,0,8,19,pcu,12.974954,5.287639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,1373.742636,0.77614,3.2318,OH-NO2,3,10,...,0,0,2,0,0,3,14,nbo,4.536626,3.146698
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,1972.083915,0.59192,1.2058,Me-OMe,3,5,...,0,1,1,0,0,1,11,nbo,6.745508,3.658871
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,2016.170710,0.54950,1.1689,Me-CN,2,5,...,0,1,0,1,0,7,23,pcu,4.666206,3.593052
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,1864.328366,0.60643,1.4091,OH-HCO,3,10,...,0,0,2,0,0,7,25,pcu,4.823305,3.454497


### 3. Assign Number to Metal Elements & Metal Group

In [None]:
## Extract element of metal from metal linker

# List of Metal from Metal Linker Number
# In the train set:
# Zn = 1,3  //Group 12
# Cu = 2    //Group 11
# V  = 9    //Group 5
# Ba = 10   //Group 2  - S-block
# Ni = 12   //Group 10
# Cr = 4    //Group 6
# Not in train set:
# Cd = 5    //Group 12
# Mn = 6    //Group 7
# Zr = 7    //Group 4
# Al = 8    //Group 13 - P-block
# In = 11   //Group 13 - P-block

# Each metal element appears to have a unique number, except Zn (1 and 3). --> Give both Zn codes the same element number:

# metal_linker code -> periodic-table group number of the metal it denotes.
metal_linker_to_group = {
    1: 12, 3: 12, 5: 12,
    8: 13, 11: 13,
    2: 11,
    9: 5,
    10: 2,
    12: 10,
    4: 6,
    6: 7,
    7: 4,
}

metal_linker = file['metal_linker']
mt_num = metal_linker.to_numpy()

# Codes missing from the table come back as NaN from map(); reject them
# before anything is written to the DataFrame.
mapped_group = metal_linker.map(metal_linker_to_group)
if mapped_group.isnull().any():
    raise ValueError("Out-of-range number found.")
metal_group = mapped_group.to_numpy().astype(mt_num.dtype)

# Assign same element number for Zn (3 -> 1). replace() returns a new Series,
# so the original 'metal_linker' column is not written to (the element-wise
# assignment on the column itself is what triggers SettingWithCopyWarning).
metal_element = metal_linker.replace(3, 1)

#file['metal_element'] = metal_element
file.insert(loc=9, column='metal_group', value=metal_group)
file.insert(loc=8, column='metal_element', value=metal_element)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 4. Metal And Topology 

In [None]:
##Assign number to each topology

def get_topology_group_number(ar):
    """
    Return the integer code of topology label `ar`.

    The code is the label's position in the fixed list below
    ('pcu' -> 0, ..., 'rht' -> 10). An unlisted label raises KeyError.
    """
    known_topologies = (
        'pcu', 'etb', 'sra', 'acs', 'fof', 'bcu',
        'nbo', 'tbo', 'pts', 'the', 'rht',
    )
    code_of = {label: code for code, label in enumerate(known_topologies)}
    return code_of[ar]
tp = file['topology']
tp_num = tp.to_numpy()
# Build the codes directly as an integer array: a zeros_like() buffer would
# inherit the dtype of the string-valued topology column instead.
tp_1 = np.array([get_topology_group_number(label) for label in tp_num], dtype=int)
file.insert(loc=7, column='topology_num', value=tp_1)

## Refine & Save File

In [None]:
#working_capacity = file[['MOFname','CO2_working_capacity [mL/g]']]
#working_capacity.to_csv('/work/Export/pretest_wc.csv',index=False)

In [None]:
# Raw columns for which numeric counterparts were added to the table
# (functional_group_1/2, metal_element, topology_num).
encoded_source_columns = ['functional_groups','metal_linker','topology']
# Per-class functional-group counts, left out of the exported table.
fg_class_columns = ['fg_class_halo','fg_class_hc','fg_class_o','fg_class_n','fg_class_aro']

file2 = file.drop(columns=encoded_source_columns)
file3 = file2.drop(columns=fg_class_columns)
#file = file.drop(['CO2_working_capacity [mL/g]'],axis=1)
file3

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],volumetric_sa [m^2/cm^3],void_fraction,void_volume [cm^3/g],topology_num,metal_element,metal_group,functional_group_1,functional_group_2,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,643.004178,0.11392,0.1039,0,2,11,12,12,5,26,36.639791,7.005640
1,mof_unit_68615,4126.414623,3733.65779,852.49,1280.848210,0.21367,0.1422,3,4,6,20,4,6,17,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,2443.918878,0.33883,0.4375,0,1,12,15,10,11,17,13.062850,5.045400
3,mof_unit_68617,2436.629312,995.80232,3521.09,2389.502677,0.40464,0.5963,0,2,11,11,11,1,28,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.53800,2678.46,1904.618116,0.38959,0.5479,0,1,12,17,7,8,19,12.974954,5.287639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,1373.742636,0.77614,3.2318,6,1,12,10,17,3,14,4.536626,3.146698
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,1972.083915,0.59192,1.2058,6,1,12,5,11,1,11,6.745508,3.658871
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,2016.170710,0.54950,1.1689,0,2,11,5,15,7,23,4.666206,3.593052
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,1864.328366,0.60643,1.4091,0,1,12,10,8,7,25,4.823305,3.454497


In [None]:
# Export the reduced feature table; index=False keeps the row index out of the CSV.
file3.to_csv('/work/Export/test_preprocessed_081021_1.csv',index=False)

## ---------------------------- Note Whatever Below ----------------------------

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=80c4b0c0-263f-4469-9860-e4bc52f80dd5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>