In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import math
import gc
import copy
import os

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor

In [2]:
# Input data directory and output directory for files written by this notebook
DATA_PATH = '../input'
SUBMISSIONS_PATH = './'
# Element symbol -> atomic number, used as a numeric stand-in for atom names
ATOMIC_NUMBERS = dict(H=1, C=6, N=7, O=8, F=9)

In [3]:
# Display options: never truncate cell text and show up to 120 rows / columns.
# None is the supported "no limit" value for max_colwidth (pandas >= 1.0); the
# legacy -1 sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)

In [4]:
# List the entries found directly under the input data directory (DATA_PATH)
os.listdir(DATA_PATH)

['read-giba', 'champs-scalar-coupling', 'quantum-machine-9-qm9']

In [5]:
# Helper that shrinks a DataFrame's memory footprint by downcasting numeric columns
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Integer columns (int16/int32/int64) are cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive, so a column whose max is exactly 127 still fits int8).
    Float columns (float16/float32/float64) are cast to float32 when their
    min and max lie inside the float32 range, otherwise to float64; the
    float32 cast rounds values to single precision. Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default False
        When True, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate integer dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. an empty column) every comparison is
            # False and the column keeps its dtype.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            # NaN or infinite extremes fail this test and fall back to float64.
            if c_min >= f32.min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the percentage against a zero-sized baseline (0/0).
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [6]:
# Load the train/test feature tables from the read-giba dataset.
# Paths are built from DATA_PATH so the input location is configured in one place.
train_csv = pd.read_csv(os.path.join(DATA_PATH, 'read-giba', 'giba_train.csv'))
test_csv = pd.read_csv(os.path.join(DATA_PATH, 'read-giba', 'giba_test.csv'))
train_csv.head(10)

Unnamed: 0,molecule_name,atom_index_1,atom_index_0,id,scalar_coupling_constant,linkM0,linkM1,inv_dist0R,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_H.x,coulomb_C.y,coulomb_H.y,coulomb_O.y,yukawa_C.y,yukawa_H.y,vander_C.y,distC0,distC1,adH1,adC2,adC3,adC4,type
0,dsgdb9nsd_000001,0,1,0,84.8076,0.0,0.680487,0.003337,1.610344,1.783157,0.345594,1.09195,1.091953,3e-06,0.91579,1.682424,0.0,0.0,0.282831,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091953,0.272988,1.78312,,,,1JHC
1,dsgdb9nsd_000001,2,1,1,-11.257,0.0,-4.886227,0.003337,1.610344,1.783157,0.345594,1.78312,1.78312,,0.91579,1.682424,0.0,0.0,0.282831,0.915792,1.682424,0.0,0.307304,0.282831,0.589902,,,1.783147,,,,2JHH
2,dsgdb9nsd_000001,3,1,2,-11.2548,0.0,-2.006931,0.003337,1.610344,1.783157,0.345594,1.783153,1.783158,7e-06,0.91579,1.682424,0.0,0.0,0.282831,0.915796,1.682415,0.0,0.307307,0.282827,0.589919,,,1.78312,,,,2JHH
3,dsgdb9nsd_000001,4,1,3,-11.2543,0.0,-0.9826,0.003337,1.610344,1.783157,0.345594,1.783151,1.783157,5e-06,0.91579,1.682424,0.0,0.0,0.282831,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,1.78312,,,,2JHH
4,dsgdb9nsd_000001,0,2,4,84.8074,0.0,0.680487,0.003348,1.552753,1.783158,0.399065,1.09195,1.091953,3e-06,0.915792,1.682424,0.0,0.0,0.282831,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091952,0.272988,1.783148,,,,1JHC
5,dsgdb9nsd_000001,3,2,5,-11.2541,0.0,-2.006931,0.003348,1.552753,1.783158,0.399065,1.783153,1.783158,7e-06,0.915792,1.682424,0.0,0.0,0.282831,0.915796,1.682415,0.0,0.307307,0.282827,0.589919,,,1.783148,,,,2JHH
6,dsgdb9nsd_000001,4,2,6,-11.2548,0.0,-0.9826,0.003348,1.552753,1.783158,0.399065,1.783151,1.783157,5e-06,0.915792,1.682424,0.0,0.0,0.282831,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,1.783158,,,,2JHH
7,dsgdb9nsd_000001,0,3,7,84.8093,0.0,0.680487,0.003359,1.437547,1.783148,0.488753,1.09195,1.091953,3e-06,0.915796,1.682415,0.0,0.0,0.282827,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091946,0.272987,1.783148,,,,1JHC
8,dsgdb9nsd_000001,4,3,8,-11.2543,0.0,-0.9826,0.003359,1.437547,1.783148,0.488753,1.783151,1.783157,5e-06,0.915796,1.682415,0.0,0.0,0.282827,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,,,,,2JHH
9,dsgdb9nsd_000001,0,4,9,84.8095,0.0,0.680487,0.00337,1.091948,1.091948,,1.09195,1.091953,3e-06,0.915795,1.682415,0.0,0.0,0.282827,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091948,0.272987,,,,,1JHC


In [7]:
# Preview the first 10 rows of the test table
test_csv.head(10)

Unnamed: 0,molecule_name,atom_index_1,atom_index_0,id,linkM0,linkM1,inv_dist0R,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_H.x,coulomb_C.y,coulomb_H.y,coulomb_O.y,yukawa_C.y,yukawa_H.y,vander_C.y,distC0,distC1,adH1,adC2,adC3,adC4,type
0,dsgdb9nsd_000004,0,2,4658147,0.029943,-0.142512,0.00767,2.215518,3.323277,1.13128,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,3.323277,,,,2JHC
1,dsgdb9nsd_000004,1,2,4658148,0.029943,-0.140758,0.00767,2.215518,3.323277,1.13128,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,3.323277,,,,1JHC
2,dsgdb9nsd_000004,3,2,4658149,0.029943,-35.72059,0.00767,2.215518,3.323277,1.13128,3.323277,3.323277,,1.383779,0.300908,0.0,0.0,0.010843,1.383779,0.300908,0.0,0.37161,0.010843,0.704124,,,,2.261178,,,3JHH
3,dsgdb9nsd_000004,0,3,4658150,0.069869,-0.142512,0.007679,1.661639,2.261178,0.847877,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,,,,,1JHC
4,dsgdb9nsd_000004,1,3,4658151,0.069869,-0.140758,0.007679,1.661639,2.261178,0.847877,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,,,,,2JHC
5,dsgdb9nsd_000015,0,3,4658152,-0.001494,0.514615,0.002261,1.827658,2.640324,0.629906,1.973384,3.262567,0.984263,1.285913,2.132056,0.0,0.479667,0.246021,0.42778,3.79343,0.711195,0.041304,0.975102,0.006128,1.871326,0.623775,1.782964,,,,1JHC
6,dsgdb9nsd_000015,2,3,4658153,-0.001494,0.472307,0.002261,1.827658,2.640324,0.629906,1.973384,3.262566,0.984263,1.285913,2.132056,0.0,0.479667,0.246021,0.42778,3.793429,0.711195,0.041304,0.975102,0.006128,1.871326,0.623775,1.782964,,,,3JHC
7,dsgdb9nsd_000015,4,3,4658154,-0.001494,-4.884743,0.002261,1.827658,2.640324,0.629906,1.782964,1.782964,,1.285913,2.132056,0.0,0.479667,0.246021,1.285972,2.132212,0.479666,0.328303,0.246054,0.560316,,,1.785017,2.640324,,,2JHH
8,dsgdb9nsd_000015,5,3,4658155,-0.001494,-2.015793,0.002261,1.827658,2.640324,0.629906,1.785009,1.785017,1.1e-05,1.285913,2.132056,0.0,0.479667,0.246021,1.221544,1.91958,0.494163,0.318511,0.207056,0.587821,,,1.782964,2.640324,,,2JHH
9,dsgdb9nsd_000015,0,4,4658156,-0.003841,0.514615,0.002265,1.842413,2.63991,0.770397,1.973384,3.262567,0.984263,1.285972,2.132212,0.0,0.479666,0.246054,0.42778,3.79343,0.711195,0.041304,0.975102,0.006128,1.871118,0.623706,1.785001,,,,1JHC


In [8]:
# Downcast numeric columns to cut memory use and avoid running out of memory during later processing
train_csv = reduce_mem_usage(train_csv,verbose = True)
test_csv = reduce_mem_usage(test_csv,verbose = True)

Mem. usage decreased to 577.51 Mb (49.2% reduction)
Mem. usage decreased to 301.07 Mb (49.2% reduction)


In [9]:
# Run a full garbage-collection pass; the displayed value is the number of unreachable objects found
gc.collect()

11

In [10]:
# Derive a numeric molecule_index feature by stripping the 'dsgdb9nsd_' prefix from molecule_name
train_csv['molecule_index'] = train_csv['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
test_csv['molecule_index'] = test_csv['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
# The string column is redundant once molecule_index exists, so drop it
train_csv = train_csv.drop(columns=['molecule_name'])
test_csv = test_csv.drop(columns=['molecule_name'])
train_csv.head(10)

Unnamed: 0,atom_index_1,atom_index_0,id,scalar_coupling_constant,linkM0,linkM1,inv_dist0R,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_H.x,coulomb_C.y,coulomb_H.y,coulomb_O.y,yukawa_C.y,yukawa_H.y,vander_C.y,distC0,distC1,adH1,adC2,adC3,adC4,type,molecule_index
0,0,1,0,84.807602,0.0,0.680487,0.003337,1.610344,1.783157,0.345594,1.09195,1.091953,3e-06,0.91579,1.682424,0.0,0.0,0.282831,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091953,0.272988,1.78312,,,,1JHC,1
1,2,1,1,-11.257,0.0,-4.886227,0.003337,1.610344,1.783157,0.345594,1.78312,1.78312,,0.91579,1.682424,0.0,0.0,0.282831,0.915792,1.682424,0.0,0.307304,0.282831,0.589902,,,1.783147,,,,2JHH,1
2,3,1,2,-11.2548,0.0,-2.006931,0.003337,1.610344,1.783157,0.345594,1.783153,1.783158,7e-06,0.91579,1.682424,0.0,0.0,0.282831,0.915796,1.682415,0.0,0.307307,0.282827,0.589919,,,1.78312,,,,2JHH,1
3,4,1,3,-11.2543,0.0,-0.9826,0.003337,1.610344,1.783157,0.345594,1.783151,1.783157,5e-06,0.91579,1.682424,0.0,0.0,0.282831,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,1.78312,,,,2JHH,1
4,0,2,4,84.807404,0.0,0.680487,0.003348,1.552753,1.783158,0.399065,1.09195,1.091953,3e-06,0.915792,1.682424,0.0,0.0,0.282831,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091952,0.272988,1.783148,,,,1JHC,1
5,3,2,5,-11.2541,0.0,-2.006931,0.003348,1.552753,1.783158,0.399065,1.783153,1.783158,7e-06,0.915792,1.682424,0.0,0.0,0.282831,0.915796,1.682415,0.0,0.307307,0.282827,0.589919,,,1.783148,,,,2JHH,1
6,4,2,6,-11.2548,0.0,-0.9826,0.003348,1.552753,1.783158,0.399065,1.783151,1.783157,5e-06,0.915792,1.682424,0.0,0.0,0.282831,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,1.783158,,,,2JHH,1
7,0,3,7,84.809303,0.0,0.680487,0.003359,1.437547,1.783148,0.488753,1.09195,1.091953,3e-06,0.915796,1.682415,0.0,0.0,0.282827,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091946,0.272987,1.783148,,,,1JHC,1
8,4,3,8,-11.2543,0.0,-0.9826,0.003359,1.437547,1.783148,0.488753,1.783151,1.783157,5e-06,0.915796,1.682415,0.0,0.0,0.282827,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,,,,,2JHH,1
9,0,4,9,84.809502,0.0,0.680487,0.00337,1.091948,1.091948,,1.09195,1.091953,3e-06,0.915795,1.682415,0.0,0.0,0.282827,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091948,0.272987,,,,,1JHC,1


In [11]:
# Preview the test table after molecule_name was replaced by molecule_index
test_csv.head(10)

Unnamed: 0,atom_index_1,atom_index_0,id,linkM0,linkM1,inv_dist0R,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_H.x,coulomb_C.y,coulomb_H.y,coulomb_O.y,yukawa_C.y,yukawa_H.y,vander_C.y,distC0,distC1,adH1,adC2,adC3,adC4,type,molecule_index
0,0,2,4658147,0.029943,-0.142512,0.00767,2.215518,3.323277,1.13128,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,3.323277,,,,2JHC,4
1,1,2,4658148,0.029943,-0.140758,0.00767,2.215518,3.323277,1.13128,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,3.323277,,,,1JHC,4
2,3,2,4658149,0.029943,-35.720589,0.00767,2.215518,3.323277,1.13128,3.323277,3.323277,,1.383779,0.300908,0.0,0.0,0.010843,1.383779,0.300908,0.0,0.37161,0.010843,0.704124,,,,2.261178,,,3JHH,4
3,0,3,4658150,0.069869,-0.142512,0.007679,1.661639,2.261178,0.847877,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,,,,,1JHC,4
4,1,3,4658151,0.069869,-0.140758,0.007679,1.661639,2.261178,0.847877,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,,,,,2JHC,4
5,0,3,4658152,-0.001494,0.514615,0.002261,1.827658,2.640324,0.629906,1.973384,3.262567,0.984263,1.285913,2.132056,0.0,0.479667,0.246021,0.427779,3.79343,0.711195,0.041304,0.975102,0.006128,1.871326,0.623775,1.782964,,,,1JHC,15
6,2,3,4658153,-0.001494,0.472307,0.002261,1.827658,2.640324,0.629906,1.973384,3.262566,0.984263,1.285913,2.132056,0.0,0.479667,0.246021,0.427779,3.793429,0.711195,0.041304,0.975102,0.006128,1.871326,0.623775,1.782964,,,,3JHC,15
7,4,3,4658154,-0.001494,-4.884743,0.002261,1.827658,2.640324,0.629906,1.782964,1.782964,,1.285913,2.132056,0.0,0.479667,0.246021,1.285972,2.132212,0.479666,0.328303,0.246054,0.560316,,,1.785017,2.640324,,,2JHH,15
8,5,3,4658155,-0.001494,-2.015793,0.002261,1.827658,2.640324,0.629906,1.785009,1.785017,1.1e-05,1.285913,2.132056,0.0,0.479667,0.246021,1.221544,1.91958,0.494163,0.318511,0.207056,0.587821,,,1.782964,2.640324,,,2JHH,15
9,0,4,4658156,-0.003841,0.514615,0.002265,1.842412,2.63991,0.770397,1.973384,3.262567,0.984263,1.285972,2.132212,0.0,0.479666,0.246054,0.427779,3.79343,0.711195,0.041304,0.975102,0.006128,1.871118,0.623706,1.785001,,,,1JHC,15


In [12]:
# Load the sample submission file, using its id column as the index
# NOTE(review): submission_csv is not referenced by any other cell visible here — confirm it is still needed
submission_csv = pd.read_csv(f'{DATA_PATH}/champs-scalar-coupling/sample_submission.csv', index_col='id')

  mask |= (ar1 == a)


In [13]:
# Features taken from the QM9 dataset
qm9_columns = [
    'mulliken_min',
    'mulliken_max',
    'mulliken_atom_0',
    'mulliken_atom_1',
]

In [14]:
print("Load QM9 features...")
# NOTE(security): unpickling can execute arbitrary code — only load this file
# from a trusted source.
# The path is built from DATA_PATH so the input location is configured in one place.
data_qm9 = pd.read_pickle(os.path.join(DATA_PATH, 'quantum-machine-9-qm9', 'data.covs.pickle'))
data_qm9.head(10)

Load QM9 features...


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,rc_A,rc_B,rc_C,mu,alpha,homo,lumo,gap,r2,zpve,U0,U,H,G,Cv,freqs_min,freqs_max,freqs_mean,linear,mulliken_min,mulliken_max,mulliken_mean,mulliken_atom_0,mulliken_atom_1
0,5174511,dsgdb9nsd_033805,11,7,2JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.088328
1,5174510,dsgdb9nsd_033805,11,6,3JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.101114
2,5174509,dsgdb9nsd_033805,11,5,2JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.088328
3,5174508,dsgdb9nsd_033805,11,4,1JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.139609
4,5174507,dsgdb9nsd_033805,11,3,2JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.221362
5,5174506,dsgdb9nsd_033805,11,2,3JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,0.138626
6,5174512,dsgdb9nsd_033805,11,8,3JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.10112
7,5174496,dsgdb9nsd_033805,9,11,3JHH,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.116017,0.075457
8,5174503,dsgdb9nsd_033805,10,7,3JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.116019,-0.088328
9,5174502,dsgdb9nsd_033805,10,6,3JHC,,3.54257,1.50643,1.34544,4.4029,76.7,-0.2457,-0.0269,0.2188,964.8507,0.126725,-363.615135,-363.608497,-363.607553,-363.645885,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.116019,-0.101114


In [15]:
# Keep only the merge keys (id, molecule_name) plus the selected QM9 features.
# The dropped set is derived from qm9_columns so the feature list is maintained
# in one place instead of as a hand-written list of every column to discard.
qm9_keep = ['id', 'molecule_name'] + qm9_columns
assert set(qm9_keep) <= set(data_qm9.columns), 'QM9 data is missing expected columns'
data_qm9 = data_qm9.drop(columns=data_qm9.columns.difference(qm9_keep))
data_qm9 = reduce_mem_usage(data_qm9,verbose=True)

Mem. usage decreased to 191.29 Mb (41.7% reduction)


In [16]:
# Preview the QM9 table after the unused columns were removed
data_qm9.head(10)

Unnamed: 0,id,molecule_name,mulliken_min,mulliken_max,mulliken_atom_0,mulliken_atom_1
0,5174511,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,-0.088328
1,5174510,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,-0.101114
2,5174509,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,-0.088328
3,5174508,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,-0.139609
4,5174507,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,-0.221362
5,5174506,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,0.138626
6,5174512,dsgdb9nsd_033805,-0.342191,0.15667,0.075457,-0.10112
7,5174496,dsgdb9nsd_033805,-0.342191,0.15667,0.116017,0.075457
8,5174503,dsgdb9nsd_033805,-0.342191,0.15667,0.116019,-0.088328
9,5174502,dsgdb9nsd_033805,-0.342191,0.15667,0.116019,-0.101114


In [17]:
# Derive a numeric molecule_index feature by stripping the 'dsgdb9nsd_' prefix from molecule_name
data_qm9['molecule_index'] = data_qm9['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
# Drop the string column now that molecule_index replaces it
data_qm9 = data_qm9.drop(columns=['molecule_name'])

In [18]:
# Preview the QM9 table after molecule_name was replaced by molecule_index
data_qm9.head(10)

Unnamed: 0,id,mulliken_min,mulliken_max,mulliken_atom_0,mulliken_atom_1,molecule_index
0,5174511,-0.342191,0.15667,0.075457,-0.088328,33805
1,5174510,-0.342191,0.15667,0.075457,-0.101114,33805
2,5174509,-0.342191,0.15667,0.075457,-0.088328,33805
3,5174508,-0.342191,0.15667,0.075457,-0.139609,33805
4,5174507,-0.342191,0.15667,0.075457,-0.221362,33805
5,5174506,-0.342191,0.15667,0.075457,0.138626,33805
6,5174512,-0.342191,0.15667,0.075457,-0.10112,33805
7,5174496,-0.342191,0.15667,0.116017,0.075457,33805
8,5174503,-0.342191,0.15667,0.116019,-0.088328,33805
9,5174502,-0.342191,0.15667,0.116019,-0.101114,33805


In [19]:
# Attach the QM9 features to every train/test row, matching on molecule_index and id
train_csv = train_csv.merge(data_qm9, how='left', on=['molecule_index', 'id'])
test_csv = test_csv.merge(data_qm9, how='left', on=['molecule_index', 'id'])
# Release the QM9 frame to keep memory usage down
del data_qm9
gc.collect()

70

In [20]:
# Use the id column as the row index of the training table
train_csv = train_csv.set_index('id')
train_csv.head(10)

Unnamed: 0_level_0,atom_index_1,atom_index_0,scalar_coupling_constant,linkM0,linkM1,inv_dist0R,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_H.x,coulomb_C.y,coulomb_H.y,coulomb_O.y,yukawa_C.y,yukawa_H.y,vander_C.y,distC0,distC1,adH1,adC2,adC3,adC4,type,molecule_index,mulliken_min,mulliken_max,mulliken_atom_0,mulliken_atom_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,0,1,84.807602,0.0,0.680487,0.003337,1.610344,1.783157,0.345594,1.09195,1.091953,3e-06,0.91579,1.682424,0.0,0.0,0.282831,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091953,0.272988,1.78312,,,,1JHC,1,-0.535689,0.133923,0.133921,-0.535689
1,2,1,-11.257,0.0,-4.886227,0.003337,1.610344,1.783157,0.345594,1.78312,1.78312,,0.91579,1.682424,0.0,0.0,0.282831,0.915792,1.682424,0.0,0.307304,0.282831,0.589902,,,1.783147,,,,2JHH,1,-0.535689,0.133923,0.133921,0.133922
2,3,1,-11.2548,0.0,-2.006931,0.003337,1.610344,1.783157,0.345594,1.783153,1.783158,7e-06,0.91579,1.682424,0.0,0.0,0.282831,0.915796,1.682415,0.0,0.307307,0.282827,0.589919,,,1.78312,,,,2JHH,1,-0.535689,0.133923,0.133921,0.133923
3,4,1,-11.2543,0.0,-0.9826,0.003337,1.610344,1.783157,0.345594,1.783151,1.783157,5e-06,0.91579,1.682424,0.0,0.0,0.282831,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,1.78312,,,,2JHH,1,-0.535689,0.133923,0.133921,0.133923
4,0,2,84.807404,0.0,0.680487,0.003348,1.552753,1.783158,0.399065,1.09195,1.091953,3e-06,0.915792,1.682424,0.0,0.0,0.282831,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091952,0.272988,1.783148,,,,1JHC,1,-0.535689,0.133923,0.133922,-0.535689
5,3,2,-11.2541,0.0,-2.006931,0.003348,1.552753,1.783158,0.399065,1.783153,1.783158,7e-06,0.915792,1.682424,0.0,0.0,0.282831,0.915796,1.682415,0.0,0.307307,0.282827,0.589919,,,1.783148,,,,2JHH,1,-0.535689,0.133923,0.133922,0.133923
6,4,2,-11.2548,0.0,-0.9826,0.003348,1.552753,1.783158,0.399065,1.783151,1.783157,5e-06,0.915792,1.682424,0.0,0.0,0.282831,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,1.783158,,,,2JHH,1,-0.535689,0.133923,0.133922,0.133923
7,0,3,84.809303,0.0,0.680487,0.003359,1.437547,1.783148,0.488753,1.09195,1.091953,3e-06,0.915796,1.682415,0.0,0.0,0.282827,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091946,0.272987,1.783148,,,,1JHC,1,-0.535689,0.133923,0.133923,-0.535689
8,4,3,-11.2543,0.0,-0.9826,0.003359,1.437547,1.783148,0.488753,1.783151,1.783157,5e-06,0.915796,1.682415,0.0,0.0,0.282827,0.915795,1.682415,0.0,0.307306,0.282827,0.589915,,,,,,,2JHH,1,-0.535689,0.133923,0.133923,0.133923
9,0,4,84.809502,0.0,0.680487,0.00337,1.091948,1.091948,,1.09195,1.091953,3e-06,0.915795,1.682415,0.0,0.0,0.282827,0.0,3.663172,0.0,0.0,1.22922,0.0,1.091948,0.272987,,,,,1JHC,1,-0.535689,0.133923,0.133923,-0.535689


In [21]:
# Use the id column as the row index of the test table
test_csv = test_csv.set_index('id')
test_csv.head(10)

Unnamed: 0_level_0,atom_index_1,atom_index_0,linkM0,linkM1,inv_dist0R,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_H.x,coulomb_C.y,coulomb_H.y,coulomb_O.y,yukawa_C.y,yukawa_H.y,vander_C.y,distC0,distC1,adH1,adC2,adC3,adC4,type,molecule_index,mulliken_min,mulliken_max,mulliken_atom_0,mulliken_atom_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
4658147,0,2,0.029943,-0.142512,0.00767,2.215518,3.323277,1.13128,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,3.323277,,,,2JHC,4,-0.207019,0.207019,0.207019,-0.207019
4658148,1,2,0.029943,-0.140758,0.00767,2.215518,3.323277,1.13128,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,3.323277,,,,1JHC,4,-0.207019,0.207019,0.207019,-0.207019
4658149,3,2,0.029943,-35.720589,0.00767,2.215518,3.323277,1.13128,3.323277,3.323277,,1.383779,0.300908,0.0,0.0,0.010843,1.383779,0.300908,0.0,0.37161,0.010843,0.704124,,,,2.261178,,,3JHH,4,-0.207019,0.207019,0.207019,0.207019
4658150,0,3,0.069869,-0.142512,0.007679,1.661639,2.261178,0.847877,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,,,,,1JHC,4,-0.207019,0.207019,0.207019,-0.207019
4658151,1,3,0.069869,-0.140758,0.007679,1.661639,2.261178,0.847877,1.661639,2.261178,0.847877,1.383779,0.300908,0.0,0.0,0.010843,0.833973,1.383779,0.0,0.251419,0.37161,0.336444,1.661639,1.661639,,,,,2JHC,4,-0.207019,0.207019,0.207019,-0.207019
4658152,0,3,-0.001494,0.514615,0.002261,1.827658,2.640324,0.629906,1.973384,3.262567,0.984263,1.285913,2.132056,0.0,0.479667,0.246021,0.427779,3.79343,0.711195,0.041304,0.975102,0.006128,1.871326,0.623775,1.782964,,,,1JHC,15,-0.223408,0.126612,0.101236,-0.223408
4658153,2,3,-0.001494,0.472307,0.002261,1.827658,2.640324,0.629906,1.973384,3.262566,0.984263,1.285913,2.132056,0.0,0.479667,0.246021,0.427779,3.793429,0.711195,0.041304,0.975102,0.006128,1.871326,0.623775,1.782964,,,,3JHC,15,-0.223408,0.126612,0.101236,-0.223408
4658154,4,3,-0.001494,-4.884743,0.002261,1.827658,2.640324,0.629906,1.782964,1.782964,,1.285913,2.132056,0.0,0.479667,0.246021,1.285972,2.132212,0.479666,0.328303,0.246054,0.560316,,,1.785017,2.640324,,,2JHH,15,-0.223408,0.126612,0.101236,0.101229
4658155,5,3,-0.001494,-2.015793,0.002261,1.827658,2.640324,0.629906,1.785009,1.785017,1.1e-05,1.285913,2.132056,0.0,0.479667,0.246021,1.221544,1.91958,0.494163,0.318511,0.207056,0.587821,,,1.782964,2.640324,,,2JHH,15,-0.223408,0.126612,0.101236,0.126612
4658156,0,4,-0.003841,0.514615,0.002265,1.842412,2.63991,0.770397,1.973384,3.262567,0.984263,1.285972,2.132212,0.0,0.479666,0.246054,0.427779,3.79343,0.711195,0.041304,0.975102,0.006128,1.871118,0.623706,1.785001,,,,1JHC,15,-0.223408,0.126612,0.101229,-0.223408


In [22]:
# Show the column names of the merged training table
train_csv.columns

Index(['atom_index_1', 'atom_index_0', 'scalar_coupling_constant', 'linkM0',
       'linkM1', 'inv_dist0R', 'mean_molecule_atom_0_dist_xyz',
       'max_molecule_atom_0_dist_xyz', 'sd_molecule_atom_0_dist_xyz',
       'mean_molecule_atom_1_dist_xyz', 'max_molecule_atom_1_dist_xyz',
       'sd_molecule_atom_1_dist_xyz', 'coulomb_C.x', 'coulomb_H.x',
       'coulomb_N.x', 'coulomb_O.x', 'yukawa_H.x', 'coulomb_C.y',
       'coulomb_H.y', 'coulomb_O.y', 'yukawa_C.y', 'yukawa_H.y', 'vander_C.y',
       'distC0', 'distC1', 'adH1', 'adC2', 'adC3', 'adC4', 'type',
       'molecule_index', 'mulliken_min', 'mulliken_max', 'mulliken_atom_0',
       'mulliken_atom_1'],
      dtype='object')

In [23]:
# Replace every missing value with 0
train_csv = train_csv.fillna(0)
test_csv = test_csv.fillna(0)

In [24]:
# Write the final train/test tables to the output directory.
# os.path.join avoids the doubled separator ('.//fin_train.csv') that plain
# string interpolation produces because SUBMISSIONS_PATH already ends with '/'.
train_csv.to_csv(os.path.join(SUBMISSIONS_PATH, 'fin_train.csv'))
test_csv.to_csv(os.path.join(SUBMISSIONS_PATH, 'fin_test.csv'))