In [31]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import r2_score

# Define the file path of the dataset in your Google Drive
file_path = '/content/DataSetFinal.xlsx'

# Load the dataset
data = pd.read_excel(file_path)

# Select the relevant features for prediction, excluding non-numeric or irrelevant columns
X = data.drop(['No','CN_literature', 'Molecular mass (g/mol)','SMILES', 'Name', 'Class', 'Formula','References','Set'], axis=1)
y = data['CN_literature']

# Standardize the feature set for ANN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create the ANN model
model = Sequential()

# Add input layer (number of features as input nodes) and hidden layers
model.add(Dense(units=21, activation='relu', input_shape=(X_train.shape[1],)))
# model.add(Dense(units=11, activation='relu'))
# model.add(Dense(units=16, activation='relu'))

# Add output layer (1 output node for regression)
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=10, validation_data=(X_test, y_test), verbose=1)

# Make predictions
y_pred_ann = model.predict(X_test)

# Evaluate the model performance
mse_ann = mean_squared_error(y_test, y_pred_ann)
rmse_ann = np.sqrt(mse_ann)
print(f'ANN RMSE: {rmse_ann}')

y_pred_ann = model.predict(X_test, batch_size=32)

# Reshape the predictions to 1D array if needed
y_pred_ann = y_pred_ann.flatten()

# Now calculate MAPE

def calculate_mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_ann = calculate_mape(y_test, y_pred_ann)
print(f'ANN MAPE: {mape_ann}')

r2 = r2_score(y_test, y_pred_ann)
print(f'R-squared: {r2}')

model.save('cn_model.keras')

new_model =  tf.keras.models.load_model("/content/cn_model.keras")
new_model.summary()

# Sample Validation Test Data
file_path = '/content/Sample Saldana Test.xlsx'

# Load the dataset
data = pd.read_excel(file_path)

X = data.drop([ 'Nombre','Clase','SMILE', 'CN Exp','CN Calc. Saldana'], axis=1)

# Standardize the feature set for ANN
scaler = StandardScaler()
x_new_data = scaler.fit_transform(X)

# New prediction using new values
predicted_values = new_model.predict(x_new_data)

# Export the results
pd.DataFrame(predicted_values).to_csv('output.csv', index=False)



Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 2011.8867 - val_loss: 1853.2445
Epoch 2/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1919.0233 - val_loss: 1773.5179
Epoch 3/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1938.1938 - val_loss: 1684.9739
Epoch 4/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1986.4398 - val_loss: 1582.2358
Epoch 5/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1626.7333 - val_loss: 1467.6368
Epoch 6/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1630.2086 - val_loss: 1334.8553
Epoch 7/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1466.5266 - val_loss: 1200.0322
Epoch 8/100
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1262.9835 - val_loss: 1063.1769
Epoch 9/100


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
