In [1]:
import numpy as np
import pandas as pd
import os,pickle, gc
from tqdm import tqdm
import matplotlib.pyplot as plt


In [46]:
# Widen the notebook display limits so wide/long frames are not truncated.
# Use the fully qualified option keys: the short patterns "max_columns" /
# "max_rows" are matched as regexes and become ambiguous (OptionError) on
# pandas versions that also define styler.render.max_columns / max_rows.
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

In [2]:
# Load the pickled training table from ../pickle/train.pkl.
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# produced by a trusted source.
train = pd.read_pickle("../pickle/train.pkl")

In [3]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [4]:
# Load the bond table for the training set from ../external/train_bonds.csv
# (one row per bond: molecule_name, the two atom indices, nbond, L2dist,
# error, bond_type, plus a leftover "Unnamed: 0" index column).
train_bond = pd.read_csv("../external/train_bonds.csv")

In [5]:
# Load the bond table for the test set from ../external/test_bonds.csv.
# Presumably the same schema as train_bonds.csv — TODO confirm.
test_bond = pd.read_csv("../external/test_bonds.csv")

In [6]:
# Remove the leftover CSV index column from both bond tables.
# `errors="ignore"` keeps this cell idempotent: the original `del` raised
# KeyError when the cell was executed a second time on the same kernel.
train_bond = train_bond.drop(columns=["Unnamed: 0"], errors="ignore")
test_bond = test_bond.drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
# Preview the first 20 rows of the training bond table.
train_bond.head(20)

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type
0,dsgdb9nsd_000001,0,1,1,1.091953,0,1CH
1,dsgdb9nsd_000001,0,2,1,1.091952,0,1CH
2,dsgdb9nsd_000001,0,3,1,1.091946,0,1CH
3,dsgdb9nsd_000001,0,4,1,1.091948,0,1CH
4,dsgdb9nsd_000002,0,1,1,1.01719,0,1HN
5,dsgdb9nsd_000002,0,2,1,1.017187,0,1HN
6,dsgdb9nsd_000002,0,3,1,1.017208,0,1HN
7,dsgdb9nsd_000003,0,1,1,0.962107,0,1HO
8,dsgdb9nsd_000003,0,2,1,0.962107,0,1HO
9,dsgdb9nsd_000005,0,2,1,1.066598,0,1CH


In [8]:
# Count how many bond rows carry each bond_type label.
train_bond["bond_type"].value_counts()

1CH    709416
1CC    439391
1CO    130279
1CN    107955
1HN     43363
2CO     34719
1HO     33058
2CC     26007
2CN     25263
3CC     12527
3CN     11327
1NN      5295
1NO      4337
1CF      1970
2NN      1269
2NO        18
Name: bond_type, dtype: int64

## Next bond: pair each bond with an adjacent C–C bond

In [10]:
def replace_concat(df):
    """Return `df` stacked on top of a copy with the two atom-index columns swapped.

    Every row therefore appears twice, once in each direction
    (atom_index_0 -> atom_index_1 and atom_index_1 -> atom_index_0).
    Columns are sorted by the concat (`sort=True`) and the result gets a
    fresh 0..n-1 index.
    """
    swap_map = {"atom_index_0": "atom_index_1", "atom_index_1": "atom_index_0"}
    reversed_df = df.rename(columns=swap_map)
    stacked = pd.concat([df, reversed_df], axis=0, sort=True)
    return stacked.reset_index(drop=True)

In [73]:
# Build two-hop paths idx_0 - idx_1 - idx_2:
#   left side  = every bond in both directions      (idx_0 - idx_1)
#   right side = every "CC" bond in both directions (idx_1 - idx_2)
use_cols = ["molecule_name","atom_index_0","atom_index_1","nbond","bond_type","L2dist"]
# NOTE: despite the "_1CC" name, str.contains("CC") keeps every bond type
# containing "CC" (1CC, 2CC and 3CC).
train_bond_1CC = train_bond[train_bond.bond_type.str.contains("CC")]

next_bond = (
    replace_concat(train_bond)[use_cols]
    .merge(
        replace_concat(train_bond_1CC)[use_cols],
        how="inner",
        left_on=["molecule_name","atom_index_1"],
        right_on=["molecule_name","atom_index_0"],
        suffixes=("_l","_r"),
    )
    # atom_index_0_r duplicates atom_index_1_l (it is the join key).
    .drop(columns=["atom_index_0_r","nbond_l"])
    # Naming is asymmetric: bond_type_next / dist describe the FIRST bond
    # (idx_0 - idx_1), while nbond_next / bond_type_next2 / dist2 describe
    # the SECOND bond (idx_1 - idx_2).
    .rename(
        columns={
            "atom_index_0_l":"idx_0",
            "atom_index_1_l":"idx_1",
            "atom_index_1_r":"idx_2",
            "nbond_r":"nbond_next",
            "bond_type_l":"bond_type_next",
            "bond_type_r":"bond_type_next2",
            "L2dist_l":"dist",
            "L2dist_r":"dist2",
        }
    )
)
# Discard paths that walk straight back to the starting atom.
next_bond = next_bond[next_bond["idx_0"] != next_bond["idx_2"]]

In [81]:
# Mean length of the first (idx_0 - idx_1) bond over all two-hop rows.
next_bond["dist"].mean()

1.3255517442915496

In [74]:
# Preview the first rows of the two-hop table.
next_bond.head()

Unnamed: 0,molecule_name,idx_0,idx_1,bond_type_next,dist,idx_2,nbond_next,bond_type_next2,dist2
1,dsgdb9nsd_000007,5,1,1CH,1.094958,0,1,1CC,1.529629
2,dsgdb9nsd_000007,6,1,1CH,1.094953,0,1,1CC,1.529629
3,dsgdb9nsd_000007,7,1,1CH,1.094968,0,1,1CC,1.529629
4,dsgdb9nsd_000009,0,1,1CC,1.455009,2,3,3CC,1.201441
7,dsgdb9nsd_000009,2,1,3CC,1.201441,0,1,1CC,1.455009


In [12]:
# One-hot encode the type of the first (idx_0 - idx_1) bond and append the
# indicator columns to next_bond.
# - dtype="uint8" pins 0/1 integer indicators (newer pandas defaults to bool,
#   which would turn the later "max" aggregates into True/False).
# - Dropping any indicator columns left by a previous run keeps the cell
#   idempotent; the original appended duplicate columns on every re-run.
bond_type_dummies = pd.get_dummies(next_bond["bond_type_next"], dtype="uint8")
next_bond = pd.concat(
    [
        next_bond.drop(columns=bond_type_dummies.columns, errors="ignore"),
        bond_type_dummies,
    ],
    axis=1,
)

In [20]:
# Aggregation spec for the per-atom groupby: summary statistics of
# nbond_next, plus max/sum of every one-hot bond-type indicator column.
agg_recipe = {"nbond_next": ["min", "max", "nunique", "count"]}
agg_recipe.update(
    {col: ["max", "sum"] for col in next_bond["bond_type_next"].unique()}
)

In [21]:
# Display the aggregation spec to check which indicator columns it covers.
agg_recipe

{'nbond_next': ['min', 'max', 'nunique', 'count'],
 '1CH': ['max', 'sum'],
 '1CC': ['max', 'sum'],
 '3CC': ['max', 'sum'],
 '3CN': ['max', 'sum'],
 '2CO': ['max', 'sum'],
 '1CO': ['max', 'sum'],
 '1CN': ['max', 'sum'],
 '2CN': ['max', 'sum'],
 '2CC': ['max', 'sum'],
 '1CF': ['max', 'sum']}

In [22]:
# Aggregate the two-hop rows per starting atom (molecule_name, idx_0).
# NOTE: next_bond has one row per (idx_0, idx_1, idx_2) path, so "sum" and
# "count" count a first bond once for every onward CC bond of idx_1, not
# once per distinct neighbour idx_1.
next_bond_gr = next_bond.groupby(["molecule_name","idx_0"]).agg(agg_recipe)

In [23]:
# Flatten the (column, aggfunc) MultiIndex into "column_aggfunc" names and
# turn the group keys back into ordinary columns.
# The MultiIndex guard makes the cell safe to re-run: on already-flattened
# columns the original would have joined the first two *characters* of each
# name (e.g. "molecule_name" -> "m_o") and added a spurious "index" column.
if isinstance(next_bond_gr.columns, pd.MultiIndex):
    next_bond_gr.columns = pd.Index(
        ["_".join(pair) for pair in next_bond_gr.columns.tolist()]
    )
    next_bond_gr = next_bond_gr.reset_index()

In [24]:
# Preview the per-atom aggregated features.
next_bond_gr.head()

Unnamed: 0,molecule_name,idx_0,nbond_next_min,nbond_next_max,nbond_next_nunique,nbond_next_count,1CH_max,1CH_sum,1CC_max,1CC_sum,...,1CO_max,1CO_sum,1CN_max,1CN_sum,2CN_max,2CN_sum,2CC_max,2CC_sum,1CF_max,1CF_sum
0,dsgdb9nsd_000007,2,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,dsgdb9nsd_000007,3,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,dsgdb9nsd_000007,4,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,dsgdb9nsd_000007,5,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dsgdb9nsd_000007,6,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# Inspect the two-hop rows of a single molecule, ordered by the atom indices
# along the path, to check the join by eye.
next_bond[next_bond.molecule_name=="dsgdb9nsd_110064"].sort_values(["idx_0","idx_1","idx_2"])

Unnamed: 0,molecule_name,idx_0,idx_1,bond_type_next,dist,idx_2,nbond_next,bond_type_next2,dist2
2880553,dsgdb9nsd_110064,0,1,1CO,1.421575,2,1,1CC,1.536803
2880557,dsgdb9nsd_110064,1,2,1CC,1.536803,3,1,1CC,1.537807
2880558,dsgdb9nsd_110064,1,2,1CC,1.536803,7,1,1CC,1.548445
2880569,dsgdb9nsd_110064,2,3,1CC,1.537807,5,1,1CC,1.545106
2880578,dsgdb9nsd_110064,2,7,1CC,1.548445,6,1,1CC,1.54535
2880565,dsgdb9nsd_110064,3,2,1CC,1.537807,1,1,1CC,1.536803
2880564,dsgdb9nsd_110064,3,2,1CC,1.537807,7,1,1CC,1.548445
2880585,dsgdb9nsd_110064,3,5,1CC,1.545106,6,1,1CC,1.555746
2880574,dsgdb9nsd_110064,4,3,1CO,1.445811,2,1,1CC,1.537807
2880573,dsgdb9nsd_110064,4,3,1CO,1.445811,5,1,1CC,1.545106


In [56]:
# Single-molecule check: rebuild the two-hop table for one molecule, this
# time joining against ALL of its bonds (not only the "CC" ones).
use_cols = ["molecule_name","atom_index_0","atom_index_1","nbond","bond_type","L2dist"]
# Filter and symmetrise the molecule's bonds once; the original computed
# this identical frame twice (once for each side of the self-merge).
mol_bonds = replace_concat(train_bond[train_bond.molecule_name=="dsgdb9nsd_110064"])[use_cols]
tmp = (
    mol_bonds.merge(
        mol_bonds,
        left_on=["molecule_name","atom_index_1"],
        right_on=["molecule_name","atom_index_0"],
        how="inner",
        suffixes=("_l","_r"),
    )
    .drop(["atom_index_0_r","nbond_l"], axis=1)
    .sort_values("atom_index_0_l")
)
# Discard paths that walk straight back to the starting atom, then rename.
# NOTE: "bond_type_r" is intentionally left unrenamed here, as in the
# original cell (it is called bond_type_next2 in next_bond).
tmp = tmp.loc[tmp.atom_index_0_l!=tmp.atom_index_1_r].rename(
        columns={
                "atom_index_0_l":"idx_0",
                "atom_index_1_l":"idx_1",
                "atom_index_1_r":"idx_2",
                "nbond_r":"nbond_next",
                "bond_type_l":"bond_type_next",
                "L2dist_l":"next_dist",
                "L2dist_r":"next2_dist"
        })
tmp.sort_values("idx_0")

Unnamed: 0,molecule_name,idx_0,idx_1,bond_type_next,next_dist,idx_2,nbond_next,bond_type_r,next2_dist
14,dsgdb9nsd_110064,0,1,1CO,1.421575,2,1,1CC,1.536803
13,dsgdb9nsd_110064,0,1,1CO,1.421575,11,1,1CH,1.094682
12,dsgdb9nsd_110064,0,1,1CO,1.421575,10,1,1CH,1.095731
118,dsgdb9nsd_110064,1,0,1CO,1.421575,9,1,1HO,0.963895
38,dsgdb9nsd_110064,1,2,1CC,1.536803,7,1,1CC,1.548445
37,dsgdb9nsd_110064,1,2,1CC,1.536803,3,1,1CC,1.537807
36,dsgdb9nsd_110064,1,2,1CC,1.536803,12,1,1CH,1.097794
71,dsgdb9nsd_110064,2,7,1CC,1.548445,6,1,1CC,1.54535
69,dsgdb9nsd_110064,2,7,1CC,1.548445,8,1,1CO,1.42169
68,dsgdb9nsd_110064,2,7,1CC,1.548445,19,1,1CH,1.094032


In [57]:
# One-hot encode the type of the first (idx_0 - idx_1) bond and append the
# indicator columns to tmp.
# dtype="uint8" pins 0/1 integer indicators (newer pandas defaults to bool);
# dropping indicator columns left by a previous run keeps the cell
# idempotent instead of appending duplicate columns on every re-run.
tmp_dummies = pd.get_dummies(tmp["bond_type_next"], dtype="uint8")
tmp = pd.concat(
    [tmp.drop(columns=tmp_dummies.columns, errors="ignore"), tmp_dummies],
    axis=1,
)

In [63]:
# Display the single-molecule two-hop table with its one-hot columns.
tmp

Unnamed: 0,molecule_name,idx_0,idx_1,bond_type_next,next_dist,idx_2,nbond_next,bond_type_r,next2_dist,1CC,1CH,1CO,1HO
14,dsgdb9nsd_110064,0,1,1CO,1.421575,2,1,1CC,1.536803,0,0,1,0
13,dsgdb9nsd_110064,0,1,1CO,1.421575,11,1,1CH,1.094682,0,0,1,0
12,dsgdb9nsd_110064,0,1,1CO,1.421575,10,1,1CH,1.095731,0,0,1,0
118,dsgdb9nsd_110064,1,0,1CO,1.421575,9,1,1HO,0.963895,0,0,1,0
38,dsgdb9nsd_110064,1,2,1CC,1.536803,7,1,1CC,1.548445,1,0,0,0
37,dsgdb9nsd_110064,1,2,1CC,1.536803,3,1,1CC,1.537807,1,0,0,0
36,dsgdb9nsd_110064,1,2,1CC,1.536803,12,1,1CH,1.097794,1,0,0,0
27,dsgdb9nsd_110064,2,1,1CC,1.536803,0,1,1CO,1.421575,1,0,0,0
53,dsgdb9nsd_110064,2,3,1CC,1.537807,4,1,1CO,1.445811,1,0,0,0
52,dsgdb9nsd_110064,2,3,1CC,1.537807,13,1,1CH,1.095088,1,0,0,0


In [61]:
# Naive per-atom sum of the one-hot indicators.
# NOTE: tmp has one row per (idx_0, idx_1, idx_2) path, so each first bond
# is counted once per onward bond of idx_1 — the sums over-count relative to
# the number of distinct neighbours (compare with the two-stage max-then-sum
# aggregation on tmp2).
tmp.groupby(["molecule_name","idx_0"]).agg({c:["sum"] for c in tmp["bond_type_next"].unique()}).reset_index()

Unnamed: 0_level_0,molecule_name,idx_0,1CO,1CC,1HO,1CH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum,sum,sum
0,dsgdb9nsd_110064,0,3,0,0,0
1,dsgdb9nsd_110064,1,1,3,0,0
2,dsgdb9nsd_110064,2,0,9,0,0
3,dsgdb9nsd_110064,3,1,6,0,0
4,dsgdb9nsd_110064,4,3,0,0,0
5,dsgdb9nsd_110064,5,0,6,0,0
6,dsgdb9nsd_110064,6,0,6,0,0
7,dsgdb9nsd_110064,7,1,6,0,0
8,dsgdb9nsd_110064,8,3,0,0,0
9,dsgdb9nsd_110064,9,0,0,1,0


In [65]:
# Two-stage aggregation that counts each neighbour bond only once:
#   1) collapse to one row per (idx_0, idx_1) bond by taking the max of
#      every one-hot indicator,
#   2) sum those per-bond indicators for each starting atom idx_0.
tmp2 = (
    tmp.groupby(["molecule_name","idx_0","idx_1"])
    .agg({c: ["max"] for c in tmp["bond_type_next"].unique()})
)
# Flatten the (column, "max") MultiIndex into "<column>_max" names.
tmp2.columns = pd.Index(["_".join(pair) for pair in tmp2.columns.tolist()])
tmp2 = tmp2.reset_index()
(
    tmp2.groupby(["molecule_name","idx_0"])
    .agg({f"{c}_max": ["sum"] for c in tmp["bond_type_next"].unique()})
    .reset_index()
)

Unnamed: 0_level_0,molecule_name,idx_0,1CO_max,1CC_max,1HO_max,1CH_max
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum,sum,sum
0,dsgdb9nsd_110064,0,1,0,0,0
1,dsgdb9nsd_110064,1,1,1,0,0
2,dsgdb9nsd_110064,2,0,3,0,0
3,dsgdb9nsd_110064,3,1,2,0,0
4,dsgdb9nsd_110064,4,1,0,0,0
5,dsgdb9nsd_110064,5,0,2,0,0
6,dsgdb9nsd_110064,6,0,2,0,0
7,dsgdb9nsd_110064,7,1,2,0,0
8,dsgdb9nsd_110064,8,1,0,0,0
9,dsgdb9nsd_110064,9,0,0,1,0


In [53]:
# Number of distinct first-bond types (bond_type_next) per starting atom.
tmp.groupby(["molecule_name","idx_0"])["bond_type_next"].nunique()

molecule_name     idx_0
dsgdb9nsd_110064  0        1
                  1        2
                  2        1
                  3        2
                  4        1
                  5        1
                  6        1
                  7        2
                  8        1
                  9        1
                  10       1
                  11       1
                  12       1
                  13       1
                  14       1
                  15       1
                  16       1
                  17       1
                  18       1
                  19       1
                  20       1
Name: bond_type_next, dtype: int64