# Main file for running our model

In [25]:
### Third-party libraries (those not already imported in the function files)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import root_mean_squared_error as rmse

In [1]:
###Own functions
from project_dataload import data_load, check_data, summarize_1
from project_gaussian import *
from Fingerprints import Sinemats, Coulombmatrices, Ewaldsummatrices
from PCA_decomposition import *
from GP import GP


In [None]:
### Load the data sets and sanity-check them

data_dir = "./project_data/"
train, test = data_load(data_dir)

# Run the project's check on each set, announcing which one is being checked.
print("Check train data")
check_data(train)
print("Check test data")
check_data(test)

# summarize_1 receives both sets; its 'max_number_of_atoms' entry is
# reported here and read again by the fingerprint cell.
summary = summarize_1(
    train,
    test,
)
print("Max number of atoms", summary["max_number_of_atoms"])


In [None]:
### Build the fingerprint matrices for the training data
# The matrix size comes from the summary's maximum atom count
# (presumably used for padding smaller structures -- confirm in Fingerprints.py).
max_num_atoms = summary["max_number_of_atoms"]
cmats = Coulombmatrices(
    train,
    max_num_atoms,
)
y_vals = train["hform"]  # target column of the training data
print("Shape of cmats", cmats.shape)

In [None]:
# Assemble the feature table X (fingerprints) and the target table y
# ('hform'), both indexed by train.id so rows stay aligned.
X = pd.DataFrame(cmats, index=train.id)
y = pd.DataFrame(data=train["hform"].values, index=train.id)
print(f"X: {X.shape}")
print(f"y: {y.shape}")

In [11]:
# Hold out 20% of the labelled training data as a validation set.
# NOTE: the held-out part is named *_test here, but it comes from `train`.
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=251,
)
# Plain NumPy arrays of the targets, for code that does not take DataFrames.
y_train_values = y_train.values
y_test_values = y_test.values

In [None]:
### Inspect the split: report shapes and compare the target distributions
print(
    'Shape of split data: ',
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
)

# Overlay density-normalised histograms so the two sets can be compared
# despite their different sizes.
plt.figure(figsize=(8, 4))
plt.hist(y_train, bins=30, density=True, alpha=0.7, label="Train set")
plt.hist(y_test, bins=30, density=True, alpha=0.7, label="Test set")

plt.title("Normalized histogram of test and train sets")
plt.xlabel("Heat of formation, eV/atom")
plt.grid()
plt.legend()
plt.show()


In [13]:
# Principal-component decomposition of the feature matrices

# Number of principal components requested from PCA_decomposition.
n_PC = 135

# The helper takes both feature sets and returns their transformed versions
# (presumably fitted on the training part only -- confirm in PCA_decomposition.py).
X_train_pca, X_test_pca = PCA_decomposition(
    X_train,
    X_test,
    n_PC,
)

In [15]:
# Gaussian process model on the PCA-transformed training features

# sigma is passed straight to GP (presumably the noise level -- confirm in GP.py).
simpleGP = GP(
    X_train_pca,
    y_train_values,
    sigma=0.003,
)

In [None]:
# Fit the GP, then score it on the held-out split
# k and l are the two hyperparameters handed to train_GP (presumably kernel
# amplitude and length scale -- confirm in GP.py).
k, l = 1, 0.04
simpleGP.train_GP(l, k)
# predict is called once per held-out fingerprint.
predictions = np.array(list(map(simpleGP.predict, X_test_pca)))
print(f"With l = {l} RMSE = {rmse(y_test_values, predictions)}")


In [None]:
# Parity plot: predicted values against the reference values of the held-out split
plt.figure()
plt.scatter(y_test_values, predictions, s = 1,color='black')
plt.xlabel('Test values')
plt.ylabel('Predicted values')
plt.grid()
# Dashed y = x reference line; two points are enough for a straight line.
# The -4 .. 1.5 span is hard-coded (presumably chosen to cover the target
# range -- adjust if the data changes).
xg = np.linspace(-4,1.5, 2)
plt.plot(xg,xg, color = 'red', linestyle = 'dashed')
# Render explicitly, as the histogram cell does: without this the figure is
# never displayed when the file runs as a script, and a notebook echoes the
# Line2D list returned by plt.plot.
plt.show()
