In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from multiprocessing import Pool
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
root = '../input/champs-scalar-coupling/'
Qm = '../input/quantum-machine-9-aka-qm9/dsgdb9nsd.xyz/'
# Any results you write to the current directory are saved as output.

In [2]:
train = pd.read_csv(root+'train.csv')
structures = pd.read_csv(root+'structures.csv')
type_list = train['type'].unique().tolist()

In [3]:
def get_atoms(name):
    path = root + 'structures/' + name + '.xyz'
    qmp = Qm + name + '.xyz'
    mm = pd.read_csv(qmp,sep='\t',engine='python', skiprows=2, skipfooter=3, names=range(5))[4]
    if mm.dtype == 'O':
        mm = mm.str.replace('*^','e',regex=False).astype(float)
    lis = []
    file = open(path,'r')
    number = int(file.readline())
    file.readline()
    for i in range(number): 
        line = file.readline().split()
        lis.append([line[0],np.array(line[1:]).astype(float),mm[i]])
    file.close()
    return lis
print(*get_atoms('dsgdb9nsd_000007'),sep='  \n')

['C', array([-0.018704  ,  1.52558201,  0.01043281]), -0.345672]  
['C', array([ 0.00210374, -0.00388191,  0.00199882]), -0.345672]  
['H', array([0.99487275, 1.93974324, 0.0029412 ]), 0.115222]  
['H', array([-0.54207611,  1.92361063, -0.86511735]), 0.115225]  
['H', array([-0.52524112,  1.91417308,  0.90002399]), 0.115225]  
['H', array([ 0.52548654, -0.40190784,  0.87754395]), 0.115225]  
['H', array([-1.01147651, -0.4180338 ,  0.00950849]), 0.115222]  
['H', array([ 0.50862619, -0.3924704 , -0.88760117]), 0.115225]


In [4]:
def get_distance(coor1, coor2):
    return np.linalg.norm(coor2 - coor1)
def unit_vector(vector):
    return vector / np.linalg.norm(vector)
def angle_between(v1, v2):
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    return np.degrees(np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)))
def angle_bw_points(p1,p2,p3):
    return angle_between(p2-p1,p3-p1)

In [5]:
a = np.array([0,0,0])
b = np.array([2,0,0])
c = np.array([0,0,4])
print(angle_bw_points(a,b,c))

90.0


In [6]:
def get_next_points(name,index0,index1,k=5):
    lis,dis = get_atoms(name), []
    if index1 < index0:
        at0 = lis.pop(index0)
        at1 = lis.pop(index1)
    else:
        at1 = lis.pop(index1)
        at0 = lis.pop(index0)
    for i in lis: dis.append(min(get_distance(at0[1], i[1]),get_distance(at1[1], i[1])))
    a = np.argsort(np.array(dis))[:k]
    #a = np.argwhere(np.array(dis) < get_distance(at0[1], at1[1])).T[0]
    return [lis[i] for i in a.tolist()]

In [7]:
def get_point(name,index):
    lis = get_atoms(name)
    return lis[index]

In [8]:
get_next_points('dsgdb9nsd_000007',0,2)

[['H', array([-0.54207611,  1.92361063, -0.86511735]), 0.115225],
 ['H', array([-0.52524112,  1.91417308,  0.90002399]), 0.115225],
 ['C', array([ 0.00210374, -0.00388191,  0.00199882]), -0.345672],
 ['H', array([-1.01147651, -0.4180338 ,  0.00950849]), 0.115222],
 ['H', array([ 0.52548654, -0.40190784,  0.87754395]), 0.115225]]

In [9]:
def get_from_type(lis):
    ret = []
    for i in lis:
        if i == 'H': ret.append(1.0)
        elif i == 'C': ret.append(2.0)
        elif i == 'O': ret.append(3.0)
        elif i == 'N': ret.append(4.0)
        elif i == 'F': ret.append(5.0)
        else: ret.append(0.0)
    return ret

In [10]:
def get_feature_list(df,j):
    name = df['molecule_name'][j]
    index0 = df['atom_index_0'][j]
    index1 = df['atom_index_1'][j]
    lis = get_next_points(name,index0,index1,8)
    p0 = get_point(name,index0)
    p1 = get_point(name,index1)
    fea = get_from_type(p0[0])
    fea.append(p0[2])
    fea.extend(get_from_type(p1[0]))
    fea.append(p1[2])
    fea.append(get_distance(p0[1], p1[1]))
    for i in lis: 
        fea.extend(get_from_type(i[0]))
        fea.append(i[2])
        fea.extend([get_distance(p0[1], i[1]),get_distance(p1[1], i[1]),
                    angle_bw_points(p0[1],p1[1],i[1]), angle_bw_points(p1[1],p0[1],i[1])])
    for i in range(8 - len(lis)):
        fea.extend(get_from_type('I'))
        fea.append(0)
        fea.extend([50.0, 50.0, 45.0, 45.0])
    return [np.float32(k) for k in fea]

In [11]:
df_ini = train

In [12]:
featr = ['type0','charge0','type1','charge1','distance',
'type_0','charge_0','dist0_0','dist1_0','angle0_0','angle1_0',
'type_1','charge_1','dist0_1','dist1_1','angle0_1','angle1_1',
'type_2','charge_2','dist0_2','dist1_2','angle0_2','angle1_2',
'type_3','charge_3','dist0_3','dist1_3','angle0_3','angle1_3',
'type_4','charge_4','dist0_4','dist1_4','angle0_4','angle1_4',
'type_5','charge_5','dist0_5','dist1_5','angle0_5','angle1_5',
'type_6','charge_6','dist0_6','dist1_6','angle0_6','angle1_6',
'type_7','charge_7','dist0_7','dist1_7','angle0_7','angle1_7']

In [13]:
from tqdm import trange
def get_dataframe(df_ini):
    arr = np.zeros((df_ini.shape[0], 53))
    for j in range(df_ini.shape[0]):
        fea = get_feature_list(df_ini,j)
        arr[j,:] = np.array(fea)
        #print(fea)
        break
    return pd.DataFrame(arr)
v = get_dataframe(df_ini)
v.columns = featr
v.head()

Unnamed: 0,type0,charge0,type1,charge1,distance,type_0,charge_0,dist0_0,dist1_0,angle0_0,angle1_0,type_1,charge_1,dist0_1,dist1_1,angle0_1,angle1_1,type_2,charge_2,dist0_2,dist1_2,angle0_2,angle1_2,type_3,charge_3,dist0_3,dist1_3,angle0_3,angle1_3,type_4,charge_4,dist0_4,dist1_4,angle0_4,angle1_4,type_5,charge_5,dist0_5,dist1_5,angle0_5,angle1_5,type_6,charge_6,dist0_6,dist1_6,angle0_6,angle1_6,type_7,charge_7,dist0_7,dist1_7,angle0_7,angle1_7
0,1.0,0.133921,2.0,-0.535689,1.091953,1.0,0.133923,1.783147,1.091946,35.264217,109.471321,1.0,0.133923,1.783157,1.091948,35.263863,109.472069,1.0,0.133922,1.78312,1.091952,35.26577,109.468407,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
def tt(j):
        return get_feature_list(df_ini,j)
def get_dataframe(df_ini):
    with Pool(8) as p:
        a = p.map(tt,[i for i in range(df_ini.shape[0])])
    return pd.DataFrame(np.array(a))

In [15]:
%%time
df_ini = train
df_t = get_dataframe(train)

CPU times: user 4min 51s, sys: 46.5 s, total: 5min 38s
Wall time: 6h 38s


In [16]:
df_t.columns = featr
dd = pd.concat([train,df_t],axis=1)
dd.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,type0,charge0,type1,charge1,distance,type_0,charge_0,dist0_0,dist1_0,angle0_0,angle1_0,type_1,charge_1,dist0_1,dist1_1,angle0_1,angle1_1,type_2,charge_2,dist0_2,dist1_2,angle0_2,angle1_2,type_3,charge_3,dist0_3,dist1_3,angle0_3,angle1_3,type_4,charge_4,dist0_4,dist1_4,angle0_4,angle1_4,type_5,charge_5,dist0_5,dist1_5,angle0_5,angle1_5,type_6,charge_6,dist0_6,dist1_6,angle0_6,angle1_6,type_7,charge_7,dist0_7,dist1_7,angle0_7,angle1_7
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,1.0,0.133921,2.0,-0.535689,1.091953,1.0,0.133923,1.783147,1.091946,35.264217,109.471321,1.0,0.133923,1.783157,1.091948,35.263863,109.472069,1.0,0.133922,1.78312,1.091952,35.26577,109.468407,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,1.0,0.133921,1.0,0.133922,1.78312,2.0,-0.535689,1.091953,1.091952,35.26577,35.265823,1.0,0.133923,1.783147,1.783158,60.000893,60.000324,1.0,0.133923,1.783157,1.783148,60.000378,60.000839,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,1.0,0.133921,1.0,0.133923,1.783147,2.0,-0.535689,1.091953,1.091946,35.264217,35.264465,1.0,0.133922,1.78312,1.783158,60.000893,59.998783,1.0,0.133923,1.783157,1.783148,59.999844,60.000336,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,1.0,0.133921,1.0,0.133923,1.783157,2.0,-0.535689,1.091953,1.091948,35.263863,35.264069,1.0,0.133922,1.78312,1.783148,60.000378,59.998783,1.0,0.133923,1.783147,1.783148,59.999844,59.999821,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1.0,0.133922,2.0,-0.535689,1.091952,1.0,0.133923,1.783158,1.091946,35.263729,109.472351,1.0,0.133923,1.783148,1.091948,35.264214,109.47142,1.0,0.133921,1.78312,1.091953,35.265823,109.468407,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0,0.0,0.0,50.0,50.0,45.0,45.0


In [17]:
print(dd.shape)
print(train.shape)
dd.to_csv('train_.csv')

(4658147, 59)
(4658147, 6)


In [18]:
# %%time
# test = pd.read_csv(root+'test.csv')
# df_ini = test
# df_t = get_dataframe(test)

In [19]:
# df_t.columns = featr
# dd = pd.concat([test,df_t],axis=1)
# dd.head()

In [20]:
# print(dd.shape)
# print(test.shape)
# dd.to_csv('test_.csv')