# NONCOVToolbox: Step 5
## Neural Network Prediction

## Test for generation of a dataset for Deep Learning predictions.

For one-hot encodings, I need to flag each entry of the dataset so that the shifts can be clustered according to the type of noncovalent interaction.

- Known drawbacks:
    - Not all nuclei (shifts) are actually involved in the noncovalent interaction, so I expect the predictions to be biased toward shielded values.
 
One way to restructure the database is to write a small function that parses the 'Molecule' column and creates a new column called 'NCI', filled with the type of noncovalent interaction (NCI) inferred from the number found in the molecule title.



In [7]:
# Get the NONCOVToolbox library and print header
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
from sklearn.cluster import KMeans
import pathlib as Path

# Resolve the 'src' folder one level above the working directory to an absolute path
# and register it on the import path, but only if it is not listed there already.
path_noncov = os.path.join('..', 'src')
path_noncov = os.path.abspath(path_noncov)

if sys.path.count(path_noncov) == 0:
    sys.path.append(path_noncov)

from noncov import NONCOVToolbox, NONCOVHeader

noncov = NONCOVToolbox()

#NONCOVHeader.print_header()

# Pre work on molecular geometries
from noncov import StructureModifier

# OrcaAnalysis module for postprocessing of DFT calculations
from noncov import OrcaAnalysis

# Graph molecular representations
from noncov import MolecularGraph

# Functions to store data in dataframes
from noncov import MachineLearning

# Show performance and features of various NMR functions in module
from noncov import NMRFunctions

# Display the molecule while its displaced, not yet interactive in Jupyter but interactive in VS Code
from noncov import MolView

# Stream that was active before blockPrint() silenced output (None while printing is enabled)
_saved_stdout = None
# Handle on os.devnull opened by blockPrint(), kept so enablePrint() can close it
_devnull_stream = None


# Disable printing
def blockPrint():
    """Silence everything written to sys.stdout until enablePrint() is called.

    The stream that is current at call time is remembered so that it can be
    restored later. Calling this function again while output is already
    blocked is a no-op, so the remembered stream is never overwritten and no
    extra file handle is opened.
    """
    global _saved_stdout, _devnull_stream
    if _saved_stdout is not None:
        return
    _saved_stdout = sys.stdout
    _devnull_stream = open(os.devnull, 'w')
    sys.stdout = _devnull_stream


# Restore printing
def enablePrint():
    """Undo blockPrint(): restore the stream that was active before blocking.

    Restoring the remembered stream (rather than sys.__stdout__) matters when
    sys.stdout had been replaced before blocking, e.g. by a notebook kernel or
    a test harness: sys.__stdout__ is the interpreter's original stream, not
    necessarily the one output was going to. Does nothing if output is not
    currently blocked. The os.devnull handle opened by blockPrint() is closed.
    """
    global _saved_stdout, _devnull_stream
    if _saved_stdout is None:
        return
    sys.stdout = _saved_stdout
    _saved_stdout = None
    if _devnull_stream is not None:
        _devnull_stream.close()
        _devnull_stream = None

In [5]:
import pandas as pd
#import tensorflow as tf
#from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [15]:
# Report the working directory the notebook is running from
current_dir = os.getcwd()
print(f'Current work directory is: {current_dir}')

# Scratch folder for output data: '../scratch' relative to the working directory,
# made absolute and then passed through the toolbox path converter
scratch_dir = os.path.join('..', 'scratch')
scratch_dir = os.path.abspath(scratch_dir)
print(f'Current scratch directory is: {scratch_dir}')
scratch_dir = OrcaAnalysis().convert_path(scratch_dir)

# Folder holding the machine-learning datasets, converted the same way
datasets_dir = os.path.join(scratch_dir, 'GenerateMLDataset/data/')
print(f'Dataset directory is: {datasets_dir}')
datasets_dir = OrcaAnalysis().convert_path(datasets_dir)

# Read the nuclear-property table into a dataframe
dataset_name = 'fragments_hopt_nmr.csv'
nucprop = os.path.join(datasets_dir, dataset_name)
nucprop_df = pd.read_csv(nucprop)

Current work directory is: /Users/ettorebartalucci/Desktop/NONCOV/results
Current scratch directory is: /Users/ettorebartalucci/Desktop/NONCOV/scratch
Normalized path using os.path: /Users/ettorebartalucci/Desktop/NONCOV/scratch
Dataset directory is: /Users/ettorebartalucci/Desktop/NONCOV/scratch/GenerateMLDataset/data/
Normalized path using os.path: /Users/ettorebartalucci/Desktop/NONCOV/scratch/GenerateMLDataset/data/


In [17]:
# Preview the first ten rows of the loaded table (plain-text rendering)
print(nucprop_df.head(10))

              Molecule           Atom   x_coord   y_coord   z_coord  \
0  df_cut_4_n1_opt.xyz   Nucleus 7H :  1.004732  0.618235  0.399603   
1  df_cut_4_n1_opt.xyz   Nucleus 8H : -0.479139  0.823720  0.426466   
2  df_cut_4_n1_opt.xyz   Nucleus 9H : -1.139424  1.293590 -0.715278   
3  df_cut_4_n1_opt.xyz  Nucleus 10H : -2.529680  1.410024 -0.731269   
4  df_cut_4_n1_opt.xyz  Nucleus 11H : -3.272787  1.055261  0.391918   
5  df_cut_4_n1_opt.xyz  Nucleus 12H : -2.626124  0.579825  1.530644   
6  df_cut_4_n1_opt.xyz  Nucleus 13H : -1.235220  0.460846  1.547455   
7  df_cut_4_n1_opt.xyz  Nucleus 14H :  1.553520  1.498202  0.765691   
8  df_cut_4_n1_opt.xyz  Nucleus 19H :  1.284892 -0.237829  1.022092   
9  df_cut_4_n1_opt.xyz  Nucleus 20H : -0.561879  1.500165 -1.613817   

   sigma_iso  sigma_xx  sigma_yy  sigma_zz  dia_sigma_xx  ...  dia_sigma_zz  \
0      28.71     34.44     26.20     25.50         45.09  ...         20.37   
1      28.66     33.20     27.90     24.88         24.68  ..

In [19]:
# Encode the two string columns ('Molecule' and 'Atom') as integer codes.
# Each column gets its own encoder: refitting a single encoder on the second column
# would discard the mapping learned for the first one, so the Molecule codes could
# no longer be translated back to file names.
molecule_encoder = preprocessing.LabelEncoder()
atom_encoder = preprocessing.LabelEncoder()

nucprop_df['Molecule'] = molecule_encoder.fit_transform(nucprop_df['Molecule'])
nucprop_df['Atom'] = atom_encoder.fit_transform(nucprop_df['Atom'])

# Backward-compatible alias: later cells refer to `label_encoder`, which previously
# ended up fitted on the 'Atom' column (the last column it was fitted on).
label_encoder = atom_encoder

print(nucprop_df.head(10))

   Molecule  Atom   x_coord   y_coord   z_coord  sigma_iso  sigma_xx  \
0         0    83  1.004732  0.618235  0.399603      28.71     34.44   
1         0    85 -0.479139  0.823720  0.426466      28.66     33.20   
2         0    88 -1.139424  1.293590 -0.715278      25.80     29.45   
3         0     2 -2.529680  1.410024 -0.731269      25.71     21.87   
4         0     4 -3.272787  1.055261  0.391918      23.23     22.20   
5         0     7 -2.626124  0.579825  1.530644      25.80     21.81   
6         0     9 -1.235220  0.460846  1.547455      25.57     29.22   
7         0    12  1.553520  1.498202  0.765691      28.84     33.28   
8         0    26  1.284892 -0.237829  1.022092      30.04     34.52   
9         0    32 -0.561879  1.500165 -1.613817      30.01     34.81   

   sigma_yy  sigma_zz  dia_sigma_xx  ...  dia_sigma_zz  para_sigma_xx  \
0     26.20     25.50         45.09  ...         20.37         -12.17   
1     27.90     24.88         24.68  ...         34.50       

In [25]:
# Convert the encoded table to a NumPy array; later cells slice it by column index
np_nucprop_df = nucprop_df.to_numpy()
print(np_nucprop_df)

[[ 0.000000e+00  8.300000e+01  1.004732e+00 ...  0.000000e+00
   8.940000e+00 -8.400000e-01]
 [ 0.000000e+00  8.500000e+01 -4.791390e-01 ...  0.000000e+00
   8.320000e+00 -2.700000e-01]
 [ 0.000000e+00  8.800000e+01 -1.139424e+00 ...  0.000000e+00
   7.640000e+00  1.400000e-01]
 ...
 [ 5.130000e+02  8.700000e+01 -4.337932e+00 ...  0.000000e+00
   1.770700e+02 -6.000000e-02]
 [ 5.130000e+02  3.000000e+00 -4.687532e+00 ...  0.000000e+00
   2.147300e+02 -2.000000e-02]
 [ 5.130000e+02  1.000000e+01  1.425015e+00 ...  0.000000e+00
   9.723000e+01  2.000000e-01]]


In [39]:
# Column layout follows the dataframe preview printed earlier:
# 0 = Molecule, 1 = Atom, 2-4 = x/y/z coordinates, 5 = sigma_iso, then the tensor components.
TARGET_COLUMN = 5           # sigma_iso, the quantity used as output
LAST_FEATURE_COLUMN = 21    # exclusive upper bound of the feature slice

# The input data will contain all rows of columns 0-20, except the target column.
# Leaving sigma_iso among the inputs would hand the model the very value it has to predict.
X_data = np.delete(np_nucprop_df[:, 0:LAST_FEATURE_COLUMN], TARGET_COLUMN, axis=1)

# The output data will contain all rows of the sigma_iso column
Y_data = np_nucprop_df[:, TARGET_COLUMN]

In [43]:
# Inspect the target vector
Y_data

array([ 28.71,  28.66,  25.8 , ...,  64.56,  46.06, 210.16])

In [None]:
# Build a StandardScaler and fit it on the feature matrix
# (fit() hands back the fitted scaler, so it can be chained onto the constructor)
scaler = StandardScaler().fit(X_data)

# Replace the features with their scaled counterpart
X_data = scaler.transform(X_data)

In [None]:
# Convert the target into a 3-class categorical (one-hot) representation.
# NOTE(review): Y_data is taken from column 5 of the array, which the output above shows as
# continuous floats (28.71, 28.66, ...), not class indices 0-2 — confirm that a categorical
# target is really intended here.
# NOTE(review): `tf` relies on the tensorflow import that is commented out in the import cell.
Y_data = tf.keras.utils.to_categorical(Y_data,3)


In [None]:
# Number of output classes; must match the class count used when one-hot encoding Y_data
NB_CLASSES = 3
# Take the input width from the data instead of hard-coding it: X_data is sliced from many
# more columns than the 4 assumed before, and a mismatch makes the first layer reject the inputs.
NB_FEATURES = X_data.shape[1]

# Create a sequential model, a simple linear stack of layers for Keras.
# Everything is accessed through `tf.keras` so the cell depends on a single import name.
model = tf.keras.models.Sequential()

# Add the first hidden layer
model.add(tf.keras.layers.Dense(128,                            # Number of nodes in the layer
                                input_shape=(NB_FEATURES,),     # Number of input variables
                                name='hidden_layer_1',          # Logical name
                                activation='relu'))             # Activation function

# Add the second hidden layer
model.add(tf.keras.layers.Dense(128,
                                name='hidden_layer_2',
                                activation='relu'))

# Add the output layer with softmax activation
model.add(tf.keras.layers.Dense(NB_CLASSES,
                                name='output_layer',
                                activation='softmax'))
  

In [None]:
# Configure training: categorical cross-entropy loss, accuracy as the reported metric.
# NOTE(review): no optimizer is passed, so the Keras default is used — confirm this is intended.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Print the model meta-data (summary of the layers added above)
model.summary()

In [None]:
# Hold out 10% of the samples for the final evaluation.
# A fixed random_state makes the split, and therefore the reported metrics, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.10, random_state=42)


In [None]:
# Training settings
VERBOSE = 1
BATCH_SIZE = 16
EPOCHS = 10
VALIDATION_SPLIT = 0.2

# Collect the settings once and forward them to model.fit as keyword arguments
fit_options = {
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'verbose': VERBOSE,
    'validation_split': VALIDATION_SPLIT,
}
history = model.fit(X_train, Y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt

# Draw the "accuracy" series recorded in the training history, one point per epoch.
history_df = pd.DataFrame(history.history)
accuracy_curve = history_df["accuracy"]
accuracy_curve.plot(figsize=(5, 5))
plt.title("Accuracy improvements with Epoch")
plt.show()

In [None]:
# Evaluate the trained model on the held-out test split
model.evaluate(X_test,Y_test)


In [None]:
# Persist the trained model to disk.
# NOTE(review): "iris_save" looks like a leftover name from an Iris example — consider a
# project-specific name (the cell that reloads the model must use the same path).
model.save("iris_save")


In [None]:
# Reload the model stored under "iris_save".
# NOTE(review): `loaded_model` is not used by the prediction cell below, which calls `model`
# directly; `keras` also needs the tensorflow import that is commented out in the import cell.
loaded_model = keras.models.load_model("iris_save")


In [None]:
# New input data
# NOTE(review): this sample has 4 values, whereas the scaler was fitted on X_data sliced from
# columns 0:21 of the dataset — the values look like a leftover Iris example; replace with a
# row that has the same number of features as X_data.
prediction_input = [[6.6, 3. , 4.4, 1.4]]

# Scale the input data with the same scaling model
scaled_input = scaler.transform(prediction_input)

# Get raw prediction probabilities
raw_prediction = model.predict(scaled_input)
print("Raw Prediction Output (Probabilities) :" , raw_prediction)

# Interpret the model output: index of the largest value in the prediction array
# NOTE(review): `label_encoder` was last fitted on the 'Atom' column, so inverse_transform maps
# the class index to an atom label rather than to a target class — confirm what is intended.
prediction = np.argmax(raw_prediction)
print("Prediction is ", label_encoder.inverse_transform([prediction]))