In [1]:
from rdkit import Chem
import pandas as pd
import os
# --- Notebook configuration ---------------------------------------------------
# Input locations.
datasetPath = './dataset'            # directory holding the Excel sheet and the Mol/ folder
excelName = 'data.xlsx'              # spreadsheet loaded by the next cell

# Selection options.
method = 2                           # keep rows whose 'Method' equals this; a value <= 0 keeps every row
tgMethod = 'ALL'                     # ALL or method's name
ifType = False                       # use TYPE as feature or not
solvent = ['DMAc', 'NMP', 'DMF']     # NOTE(review): not referenced by the cells visible here

# Output file name carries the selected method id (dataMethod2.csv for method = 2).
csvName = 'dataMethod%d.csv' % (method,)

In [2]:
# Load the raw spreadsheet and shuffle its rows.
# The shuffle is seeded so that the row order -- and therefore the row order of
# the CSV this notebook writes later -- is the same on every Restart & Run All.
dfRaw = pd.read_excel(os.path.join(datasetPath, excelName))
dfRaw = dfRaw.sample(frac=1, random_state=42).reset_index(drop=True)   # reproducible shuffle
dfRaw

Unnamed: 0,PID,Tg,Tg_Method,Type,Method,Solvent,Temperature1,avg(Temperature1),Time1,avg(Time1),Method2,min_temp,avg(min_temp),max_temp,avg(max_temp),Time2,avg(Time2)
0,P130189,247,DSC,Polycondensation,1,m-cresol,160-180,,3.5,,,,,,,,
1,P433104,205,DSC,Polycondensation,1,NMP,105,,3,,,,,,,,
2,P433620,134,DSC,Polycondensation,2,DMAc,0,0.0,24,24.0,3.0,100,100.0,100,100.0,4,4.0
3,P432353,278,DSC,Polycondensation,1,m-cresol,70-90,,4,,,,,,,,
4,P433254,232,DSC,Polycondensation,2,NMP,RT,20.0,24,24.0,2.0,80,80.0,300,300.0,6.5,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010,P432792,210,DSC,Polycondensation,1,THF,RT,,72,,,,,,,,
1011,P432512,175,DSC,Polycondensation,2,DMAc,RT,20.0,2,2.0,2.0,60,60.0,200,200.0,1,1.0
1012,P432100,265,TMA,Polycondensation,2,DMAc,RT,20.0,overnight,20.0,2.0,100,100.0,300,300.0,6,6.0
1013,P432502,142,DSC,Polycondensation,1,m-cresol,200,,6,,,,,,,,


In [3]:
# Polyimide synthesis method: keep only the rows made with the selected method.
# A non-positive `method` leaves the table unfiltered.
if method > 0:
    dfRaw = dfRaw.loc[dfRaw['Method'].eq(method)].copy()
print('num:', len(dfRaw))

num: 635


In [4]:
# PID to SMILES
def getSmiles(pid):
    """Return the SMILES string for the Mol file named after `pid`.

    Reads ``<datasetPath>/Mol/<pid>.mol`` with RDKit and converts the
    resulting molecule to SMILES.

    Args:
        pid: Identifier string used as the Mol file's base name.

    Returns:
        The string produced by ``Chem.MolToSmiles`` for that molecule.

    Raises:
        ValueError: If ``Chem.MolFromMolFile`` returns ``None``, i.e. RDKit
            could not build a molecule from the file.
    """
    molPath = os.path.join(datasetPath, 'Mol', pid + '.mol')
    mol = Chem.MolFromMolFile(molPath)
    if mol is None:
        # Fail here, naming the offending file, instead of letting
        # MolToSmiles(None) raise an error that does not say which PID broke.
        raise ValueError('RDKit could not parse Mol file for PID %r: %s' % (pid, molPath))
    return Chem.MolToSmiles(mol)
# Attach a SMILES column by converting every PID.
# Series.map visits only the PID column (no per-row Series is built, unlike
# DataFrame.apply(axis=1)) and also yields a valid empty column when no rows
# survived the earlier filter.
dfRaw['SMILES'] = dfRaw['PID'].map(getSmiles)
dfRaw

Unnamed: 0,PID,Tg,Tg_Method,Type,Method,Solvent,Temperature1,avg(Temperature1),Time1,avg(Time1),Method2,min_temp,avg(min_temp),max_temp,avg(max_temp),Time2,avg(Time2),SMILES
2,P433620,134,DSC,Polycondensation,2,DMAc,0,0.0,24,24.0,3.0,100,100.0,100,100.0,4,4.00,*Cc1ccc(C(C)(CCC)c2ccc(Cn3c(=O)c4cc5c(=O)n(*)c...
4,P433254,232,DSC,Polycondensation,2,NMP,RT,20.0,24,24.0,2.0,80,80.0,300,300.0,6.5,6.50,*c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5ccc(Oc6ccc7c(c6)...
7,P130370,316,DSC,Polyaddition&Polycondensation,2,DMAc,RT,20.0,2,2.0,2.0,110,110.0,250,250.0,1.25,1.25,*c1ccc(Oc2cc3ccccc3cc2Oc2ccc(-n3c(=O)c4cc5c(=O...
9,P130002,388,DMA,Polycondensation,2,DMAc,RT,20.0,24,24.0,2.0,100,100.0,300,300.0,4,4.00,*c1ccc(Oc2ccc(-n3c(=O)c4cc5c(=O)n(*)c(=O)c5cc4...
10,P433175,252,DSC,Polycondensation,2,DMAc,RT,20.0,24,24.0,3.0,120,120.0,120,120.0,3,3.00,*c1ccc(N(c2ccc(C#N)cc2)c2ccc(-n3c(=O)c4cc5c(=O...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,P432320,269,DSC,Polycondensation,2,DMAc,RT,20.0,12,12.0,2.0,60,60.0,250,250.0,5.5,5.50,*C(=O)c1cc(C(=O)c2ccc3c(c2)C(=O)N(c2cccc(N4C(=...
1005,P074270,253,DSC,Polyaddition&Polymer reaction,2,NMP,RT,20.0,24,24.0,3.0,RT,20.0,RT,20.0,6,6.00,*c1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(N4C(=O)c5ccc(...
1006,P130119,296,DSC,Polycondensation,2,DMAc,100,100.0,5,5.0,1.0,300,300.0,300,300.0,1.5,1.50,*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C...
1011,P432512,175,DSC,Polycondensation,2,DMAc,RT,20.0,2,2.0,2.0,60,60.0,200,200.0,1,1.00,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5ccc6c(c5)...


In [5]:
# Keep only rows whose Tg is strictly below 500.
dfRaw = dfRaw.loc[dfRaw['Tg'].lt(500)]

In [6]:
# Working table: dfRaw without the identifier/category columns and without the
# raw Temperature/Time fields; the avg(...) columns are kept in their place.
# drop() returns a new frame, so dfRaw itself is left untouched.
df = dfRaw.drop(
    columns=[
        'PID', 'Tg_Method', 'Type', 'Method',
        'Temperature1', 'Time1', 'min_temp', 'max_temp', 'Time2',
    ]
)
df

Unnamed: 0,Tg,Solvent,avg(Temperature1),avg(Time1),Method2,avg(min_temp),avg(max_temp),avg(Time2),SMILES
2,134,DMAc,0.0,24.0,3.0,100.0,100.0,4.00,*Cc1ccc(C(C)(CCC)c2ccc(Cn3c(=O)c4cc5c(=O)n(*)c...
4,232,NMP,20.0,24.0,2.0,80.0,300.0,6.50,*c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5ccc(Oc6ccc7c(c6)...
7,316,DMAc,20.0,2.0,2.0,110.0,250.0,1.25,*c1ccc(Oc2cc3ccccc3cc2Oc2ccc(-n3c(=O)c4cc5c(=O...
9,388,DMAc,20.0,24.0,2.0,100.0,300.0,4.00,*c1ccc(Oc2ccc(-n3c(=O)c4cc5c(=O)n(*)c(=O)c5cc4...
10,252,DMAc,20.0,24.0,3.0,120.0,120.0,3.00,*c1ccc(N(c2ccc(C#N)cc2)c2ccc(-n3c(=O)c4cc5c(=O...
...,...,...,...,...,...,...,...,...,...
1003,269,DMAc,20.0,12.0,2.0,60.0,250.0,5.50,*C(=O)c1cc(C(=O)c2ccc3c(c2)C(=O)N(c2cccc(N4C(=...
1005,253,NMP,20.0,24.0,3.0,20.0,20.0,6.00,*c1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(N4C(=O)c5ccc(...
1006,296,DMAc,100.0,5.0,1.0,300.0,300.0,1.50,*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C...
1011,175,DMAc,20.0,2.0,2.0,60.0,200.0,1.00,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5ccc6c(c5)...


In [7]:
# Shorten the avg(...) column names, then put the columns in a fixed order
# with SMILES first and Tg last.
df = df.rename(
    columns={
        'avg(Temperature1)': 'temperature1',
        'avg(Time1)': 'time1',
        'Method2': 'method2',
        'avg(min_temp)': 'minTemp',
        'avg(max_temp)': 'maxTemp',
        'avg(Time2)': 'time2',
    }
)
order = [
    'SMILES', 'Solvent',
    'temperature1', 'time1', 'method2',
    'minTemp', 'maxTemp', 'time2',
    'Tg',
]
df = df.loc[:, order]
df

Unnamed: 0,SMILES,Solvent,temperature1,time1,method2,minTemp,maxTemp,time2,Tg
2,*Cc1ccc(C(C)(CCC)c2ccc(Cn3c(=O)c4cc5c(=O)n(*)c...,DMAc,0.0,24.0,3.0,100.0,100.0,4.00,134
4,*c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5ccc(Oc6ccc7c(c6)...,NMP,20.0,24.0,2.0,80.0,300.0,6.50,232
7,*c1ccc(Oc2cc3ccccc3cc2Oc2ccc(-n3c(=O)c4cc5c(=O...,DMAc,20.0,2.0,2.0,110.0,250.0,1.25,316
9,*c1ccc(Oc2ccc(-n3c(=O)c4cc5c(=O)n(*)c(=O)c5cc4...,DMAc,20.0,24.0,2.0,100.0,300.0,4.00,388
10,*c1ccc(N(c2ccc(C#N)cc2)c2ccc(-n3c(=O)c4cc5c(=O...,DMAc,20.0,24.0,3.0,120.0,120.0,3.00,252
...,...,...,...,...,...,...,...,...,...
1003,*C(=O)c1cc(C(=O)c2ccc3c(c2)C(=O)N(c2cccc(N4C(=...,DMAc,20.0,12.0,2.0,60.0,250.0,5.50,269
1005,*c1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(N4C(=O)c5ccc(...,NMP,20.0,24.0,3.0,20.0,20.0,6.00,253
1006,*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C...,DMAc,100.0,5.0,1.0,300.0,300.0,1.50,296
1011,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5ccc6c(c5)...,DMAc,20.0,2.0,2.0,60.0,200.0,1.00,175


In [8]:
# Save the cleaned table under the dataset directory; the row index is not written.
df.to_csv(
    os.path.join(datasetPath, csvName),
    index=False,
    encoding='utf8',
)

In [2]:
import pandas as pd
import os

# Load the table that the de-duplication step below works on.
# NOTE(review): 'dataMethod2Deleted.csv' is not written by any cell visible
# here -- presumably a hand-edited copy of dataMethod2.csv; confirm its provenance.
df = pd.read_csv(
    os.path.join('./dataset', 'dataMethod2Deleted.csv')
)
df

Unnamed: 0,SMILES,Solvent,temperature1,time1,method2,minTemp,maxTemp,time2,Tg
0,*Cc1ccc(C(C)(CCC)c2ccc(Cn3c(=O)c4cc5c(=O)n(*)c...,DMAc,0.0,24.0,3.0,100.0,100.0,4.00,134
1,*c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5ccc(Oc6ccc7c(c6)...,NMP,20.0,24.0,2.0,80.0,300.0,6.50,232
2,*c1ccc(Oc2cc3ccccc3cc2Oc2ccc(-n3c(=O)c4cc5c(=O...,DMAc,20.0,2.0,2.0,110.0,250.0,1.25,316
3,*c1ccc(Oc2ccc(-n3c(=O)c4cc5c(=O)n(*)c(=O)c5cc4...,DMAc,20.0,24.0,2.0,100.0,300.0,4.00,388
4,*c1ccc(N(c2ccc(C#N)cc2)c2ccc(-n3c(=O)c4cc5c(=O...,DMAc,20.0,24.0,3.0,120.0,120.0,3.00,252
...,...,...,...,...,...,...,...,...,...
619,*C(=O)c1cc(C(=O)c2ccc3c(c2)C(=O)N(c2cccc(N4C(=...,DMAc,20.0,12.0,2.0,60.0,250.0,5.50,269
620,*c1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(N4C(=O)c5ccc(...,NMP,20.0,24.0,3.0,20.0,20.0,6.00,253
621,*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(C...,DMAc,100.0,5.0,1.0,300.0,300.0,1.50,296
622,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5ccc6c(c5)...,DMAc,20.0,2.0,2.0,60.0,200.0,1.00,175


In [3]:
# Keep one row per distinct SMILES string: the first occurrence in file order.
# Later rows with the same SMILES are removed even if their other columns differ.
df = df.drop_duplicates(subset='SMILES', keep='first')
df

Unnamed: 0,SMILES,Solvent,temperature1,time1,method2,minTemp,maxTemp,time2,Tg
0,*Cc1ccc(C(C)(CCC)c2ccc(Cn3c(=O)c4cc5c(=O)n(*)c...,DMAc,0.0,24.0,3.0,100.0,100.0,4.00,134
1,*c1ccc(Oc2ccc(Oc3ccc(N4C(=O)c5ccc(Oc6ccc7c(c6)...,NMP,20.0,24.0,2.0,80.0,300.0,6.50,232
2,*c1ccc(Oc2cc3ccccc3cc2Oc2ccc(-n3c(=O)c4cc5c(=O...,DMAc,20.0,2.0,2.0,110.0,250.0,1.25,316
3,*c1ccc(Oc2ccc(-n3c(=O)c4cc5c(=O)n(*)c(=O)c5cc4...,DMAc,20.0,24.0,2.0,100.0,300.0,4.00,388
4,*c1ccc(N(c2ccc(C#N)cc2)c2ccc(-n3c(=O)c4cc5c(=O...,DMAc,20.0,24.0,3.0,120.0,120.0,3.00,252
...,...,...,...,...,...,...,...,...,...
617,*c1ccc(Oc2ccc(N3C(=O)c4cccc(Oc5c(Oc6cccc7c6C(=...,NMP,20.0,20.0,3.0,20.0,20.0,20.00,238
618,*c1ccc(Cc2cccc(-n3c(=O)c4cc5c(=O)n(*)c(=O)c5cc...,DMAc,20.0,20.0,2.0,100.0,300.0,3.00,335
620,*c1ccc(C(C)(C)c2ccc(C(C)(C)c3ccc(N4C(=O)c5ccc(...,NMP,20.0,24.0,3.0,20.0,20.0,6.00,253
622,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5ccc6c(c5)...,DMAc,20.0,2.0,2.0,60.0,200.0,1.00,175


In [4]:
# Write the de-duplicated table.
# index=False matches how dataMethod<N>.csv is saved earlier in this file;
# without it the leftover, gappy row labels (..., 618, 620, 622) are stored as
# an extra unnamed first column.
# NOTE(review): if a downstream reader relies on that leading index column
# (e.g. read_csv(..., index_col=0)), update it together with this change.
df.to_csv(os.path.join('./dataset', 'dataMethod2DeletedDuplicates.csv'), index=False)