#### Translation
```
np.uniform(u, low=-b, high=b)
```

#### Rotation
https://pytorch3d.readthedocs.io/en/latest/modules/transforms.html  
matrice de rotation: pytorch3d.transforms.random_rotation

https://math.stackexchange.com/questions/180418/calculate-rotation-matrix-to-align-vector-a-to-vector-b-in-3d  

#### Permutation
```
np.random.permutation
```

nombre d'atomes par molécule : entre 4 et 23  
atomes présents : {1, 6, 7, 8, 16, 17}

In [75]:
import os
import pandas as pd
import numpy as np
from ase import io
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

%load_ext autoreload
%autoreload 2

# from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
def get_id_from_filename(filename: str) -> int:
    return int(filename.split("_")[1].split(".")[0])

get_id_from_filename("id_2337.xyz")

2337

In [77]:
atom = io.read("train/atoms/train/id_1.xyz")
r = atom.get_positions()
r

array([[ 0.857135,  1.766441,  1.943662],
       [ 0.099524,  0.525085,  1.758627],
       [-0.233978,  0.294832,  0.290564],
       [ 0.740909, -0.375661, -0.628106],
       [-0.519742, -1.108375, -0.204441],
       [-1.656612, -1.365652, -1.170557],
       [ 0.916179,  0.131485, -1.980716],
       [ 1.022234,  1.944131,  2.930746],
       [ 0.353962,  2.557052,  1.549804],
       [-0.841044,  0.528336,  2.354028],
       [ 0.698684, -0.314844,  2.141217],
       [-0.841061,  1.081979, -0.162159],
       [ 1.675159, -0.731132, -0.180879],
       [-0.377184, -1.900361,  0.534083],
       [-1.740849, -0.565663, -1.903374],
       [-2.610844, -1.439292, -0.64818 ],
       [-1.507099, -2.299673, -1.712636],
       [ 0.103102,  0.661861, -2.28071 ],
       [ 1.071134, -0.629252, -2.639089]])

In [78]:
from ase.visualize import view
print(atom.get_chemical_symbols())
view(atom, viewer="x3d")

['N', 'C', 'C', 'C', 'C', 'C', 'N', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']


In [79]:
print(atom.get_atomic_numbers())
print(atom.get_center_of_mass())
print(atom.get_moments_of_inertia())

[7 6 6 6 6 6 7 1 1 1 1 1 1 1 1 1 1 1 1]
[0.02960112 0.01087163 0.00017221]
[121.81724473 305.64416583 383.56495588]


In [80]:
df_energy = pd.read_csv("train/energies/train.csv", index_col="id")
df_energy

Unnamed: 0_level_0,energy
id,Unnamed: 1_level_1
1,-83.490555
2,-90.173970
3,-84.345419
4,-62.286505
5,-90.351426
...,...
6766,-70.422283
6767,-90.129364
6768,-84.196158
6769,-83.860432


In [81]:
def permute_atom(atom):
    index_perm = np.argsort(atom.get_positions()[:,0])
    atom.get_positions()[index_perm]
    atom_perm = atom.copy()
    
    atom_perm.set_positions(atom.get_positions()[index_perm])
    atom_perm.set_atomic_numbers(atom.get_atomic_numbers()[index_perm])
    return atom_perm

def remove_translation(atom):
    atom_centered = atom.copy()
    atom_centered.set_positions(atom_centered.get_positions() - atom_centered.get_center_of_mass())
    return atom_centered

def rotate_molecule(atom):
    """ atom is already centered """
    atom_rotated = atom.copy()
    
    X = atom_rotated.get_positions()
    pca = PCA(n_components=3)
    pca.fit(X)
    atom_rotated.rotate(pca.components_[0,:], (1, 0, 0))
    
    X = atom_rotated.get_positions()
    pca = PCA(n_components=3)
    pca.fit(X)
    atom_rotated.rotate(pca.components_[1,:], (0, 1, 0))
    
    return atom_rotated

def preprocess(atom):
    atom_copy = atom.copy()
    atom_copy = remove_translation(atom_copy)
    atom_copy = rotate_molecule(atom_copy)
    atom_copy = permute_atom(atom_copy)
    return atom_copy

atom_rotated = rotate_molecule(remove_translation(atom))

In [82]:
atom_rotated.get_center_of_mass()

array([ 1.33007301e-16, -2.43846718e-17,  1.16381388e-17])

In [83]:
print(atom_rotated.get_atomic_numbers())
print(atom_rotated.get_center_of_mass())
print(atom_rotated.get_moments_of_inertia())

[7 6 6 6 6 6 7 1 1 1 1 1 1 1 1 1 1 1 1]
[ 1.33007301e-16 -2.43846718e-17  1.16381388e-17]
[121.81724473 305.64416583 383.56495588]


In [84]:
X = atom_rotated.get_positions()
pca = PCA(n_components=3)
pca.fit(X)
pca.components_

array([[ 1.00000000e+00, -0.00000000e+00, -0.00000000e+00],
       [-0.00000000e+00,  1.00000000e+00,  5.41288960e-16],
       [-0.00000000e+00,  1.60070371e-16, -1.00000000e+00]])

In [92]:
%%time
df = pd.DataFrame(columns=["positions", "atomic_number", "atomic_number_encoded"])#, "energy"])
encoder = {1: 0, 6: 2, 7: 3, 8: 4, 16: 5, 17: 6}

for filename in os.listdir("train/atoms/test/")[:]:
    ids_file = get_id_from_filename(filename)
    if ids_file not in [6771, 6772, 6773]:
        atom = io.read(f"train/atoms/test/id_{ids_file}.xyz")

        atom = preprocess(atom)
        atomic_number_encoded = list(map(lambda x: encoder[x], atom.get_atomic_numbers()))
        #energy = df_energy.loc[ids_file][0]

        cols = [atom.get_positions(), atom.get_atomic_numbers(), atomic_number_encoded]#, energy]
        df.loc[ids_file] = cols

df

CPU times: user 5.42 s, sys: 55.8 ms, total: 5.48 s
Wall time: 5.49 s


Unnamed: 0,positions,atomic_number,atomic_number_encoded
7903,"[[-1.2046062687482697, 1.4936802750009526, -0....","[1, 8, 6, 1, 1, 6, 6, 1, 1, 1, 6, 1, 8, 1]","[0, 4, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 4, 0]"
7332,"[[-1.511916232112587, 0.27212276240209693, -2....","[1, 1, 1, 1, 6, 1, 6, 7, 6, 1, 1, 6, 6, 7, 1, ...","[0, 0, 0, 0, 2, 0, 2, 3, 2, 0, 0, 2, 2, 3, 0, ..."
8088,"[[-0.8942968257481967, 1.8111772386209282, -0....","[1, 1, 6, 1, 6, 6, 6, 1, 6, 1, 8, 1]","[0, 0, 2, 0, 2, 2, 2, 0, 2, 0, 4, 0]"
7462,"[[-1.396253897133661, -1.4285636929288799, -1....","[1, 1, 1, 1, 6, 6, 6, 6, 1, 1, 1, 6, 1, 6, 1, ...","[0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, ..."
7446,"[[-1.4567906708914387, 0.916996666236952, -0.5...","[1, 1, 1, 6, 6, 1, 1, 7, 8, 6, 1, 6, 1, 1, 1]","[0, 0, 0, 2, 2, 0, 0, 3, 4, 2, 0, 2, 0, 0, 0]"
...,...,...,...
7117,"[[-1.4302863396994177, 1.3498543985089055, 1.6...","[1, 1, 6, 1, 1, 6, 1, 1, 6, 6, 1, 1, 6, 1, 1]","[0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0]"
7986,"[[-1.398883173843184, -1.292695304638306, -1.0...","[1, 8, 6, 1, 1, 6, 6, 1, 1, 6, 8, 1]","[0, 4, 2, 0, 0, 2, 2, 0, 0, 2, 4, 0]"
7907,"[[-1.4380747333176824, -0.5399652625432195, -0...","[1, 1, 1, 1, 1, 1, 6, 1, 6, 6, 6, 1, 6, 6, 1, ...","[0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0, ..."
8336,"[[-1.5754987237039872, 0.8707649230242314, 1.4...","[1, 1, 1, 1, 1, 6, 1, 6, 6, 6, 1, 6, 1, 7, 7, ...","[0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 2, 0, 3, 3, ..."


In [93]:
df[["energy"]].mean()[0]

KeyError: "None of [Index(['energy'], dtype='object')] are in the [columns]"

In [88]:
df[["energy"]] = df[["energy"]].mean()[0]

In [96]:
df[["positions"]] = -78.07771600753753
df[["positions"]]

Unnamed: 0,positions
7903,-78.077716
7332,-78.077716
8088,-78.077716
7462,-78.077716
7446,-78.077716
...,...
7117,-78.077716
7986,-78.077716
7907,-78.077716
8336,-78.077716


In [101]:
df_test.sort_index()

Unnamed: 0,positions
6770,-78.077716
6774,-78.077716
6775,-78.077716
6776,-78.077716
6777,-78.077716
...,...
8462,-78.077716
8463,-78.077716
8464,-78.077716
8465,-78.077716


In [99]:
# df_test = df[["energy"]].assign("energy"=df[["energy"]].mean()[0])
df_test = df[["positions"]]
df_test.sort_index().to_csv("test.csv")

In [None]:
df_energy = pd.read_csv("train/energies/train.csv", index_col="id")


In [63]:
import pickle
pickle.dump(df, open("train_preprocess.pkl", "wb"))

In [None]:
# pickle.load(open("train_preprocess.pkl", "rb"))