In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Enable the autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

Add the parent directory to the import path and load the project's helper modules

In [4]:
import sys

# Put the parent directory on the import path (presumably where the `helpers`
# package lives -- confirm against the repo layout). The membership guard keeps
# this cell idempotent: re-running it does not pile up duplicate ".." entries.
if ".." not in sys.path:
    sys.path.append("..")

In [5]:
from helpers.picklers import *
from helpers.data_utils import *

Data path

In [6]:
# Location of the descriptor CSV; the path is relative, so it is resolved
# against the notebook's current working directory.
data_path = "../data/descriptor/DescriptorData.csv"

Prepare training data for our models

In [7]:
# Load the descriptor CSV at `data_path` into a DataFrame with pandas' default
# parsing options.
df = pd.read_csv(data_path)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# A count lower than the others indicates missing values in that column.
df.describe()

Unnamed: 0,energy,energy_per_atom,band_gap,total_magnetization,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,18962.0,18962.0,18962.0,18961.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0,18962.0
mean,-95.830071,-5.306329,0.451718,1.217187,13.254852,3.309276,-4.636984,-3.769513,8.800812,0.920736,5.791024,0.064184,5.935859,-0.441945,3.412756,-0.088312,2.869125,0.009662
std,164.855659,2.39634,1.179319,5.259865,3.38432,2.111308,4.057872,2.905091,13.075827,7.369018,2.326354,1.849358,1.99938,2.463763,0.792149,0.671106,0.711807,0.338538
min,-2846.529074,-13.701623,0.0,0.0,2.5,0.0,-14.5,-10.0,1.0,-120.0,1.0,-5.5,2.0,-12.617647,0.894385,-7.551524,1.102304,-7.07666
25%,-94.418136,-7.084322,0.0,3.5e-07,11.0,1.5,-7.5,-6.0,2.0,-1.0,4.166667,-1.0,4.333333,-1.666667,2.924287,-0.460726,2.489369,-3e-06
50%,-40.617644,-5.076996,0.0,3.52e-05,13.5,3.5,-4.5,-4.0,4.0,0.0,5.948043,0.0,6.0,-0.0,3.285963,-0.018106,2.871633,0.0
75%,-19.101495,-3.473633,0.0,0.1073695,15.5,5.0,-2.0,-1.5,9.0,2.0,7.0,1.0,8.0,0.646307,3.788326,0.126517,3.189913,0.012342
max,-1.434844,-0.406819,9.0594,214.001,21.5,10.0,4.5,3.5,180.0,90.0,12.0,5.5,15.441176,10.769231,20.025557,6.674738,19.209523,5.985605


In [9]:
# Feature matrix: the descriptor columns, whose labels are the strings "0".."13".
X = df[list(map(str, range(14)))].to_numpy()
# Regression target taken from the "energy_per_atom" column.
y = df["energy_per_atom"].to_numpy()

In [10]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((18962, 14), (18962,))

In [11]:
# Seed NumPy's global random state so the run is repeatable.
# NOTE(review): presumably shuffle_data draws from np.random's global state --
# confirm in the helpers module, otherwise this seed has no effect on the split.
np.random.seed(42)

In [12]:
# Shuffle the samples, then split them into train and test sets.
# shuffle_data and split are not defined in this notebook; they arrive via the
# star imports from helpers (presumably helpers.data_utils -- confirm).
# NOTE(review): the meaning of the third argument (None) and of the returned
# `ind` is not visible here; `ind` is not used in the cells shown.
X_shuffled, y_shuffled, ind = shuffle_data(X, y, None)
X_train, y_train, X_test, y_test = split(X_shuffled, y_shuffled)

In [13]:
# Sanity check: train and test sizes, and that features/targets line up per split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((15169, 14), (15169,), (3793, 14), (3793,))

Applying Linear Regression to the Data

In [14]:
# Baseline: linear regression with scikit-learn's default settings, fit on the
# training split as it stands at this point in the notebook (before scaling).
# The bare fit() call is left as the last expression so the estimator repr shows.
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Calculating the test-set MSE for the model trained on unscaled features

In [15]:
# Predict on the test split, then average the squared residuals (MSE).
y_pred = clf.predict(X_test)
np.square(y_test - y_pred).mean()

3.604330640160972

Apply Standard Scaler

In [16]:
# Learn per-feature scaling statistics from the training split only, then apply
# that same transform to both splits so no test-set statistics are used.
# NOTE: X_train / X_test are overwritten with their scaled versions here, so
# re-running earlier cells afterwards would operate on scaled data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Retraining the linear regression on the standardized features

In [17]:
# Refit a fresh linear regression on the current X_train (standardized by the
# scaling cell when the notebook is run top to bottom). This rebinds `clf`, so
# the earlier model is no longer reachable under that name.
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Calculating the test-set MSE for the model trained on standardized features

In [18]:
# Test-set MSE for the refit model. Least squares with an intercept is
# unaffected by an affine rescaling of the features, so this is expected to
# match the pre-scaling MSE up to floating-point rounding.
y_pred = clf.predict(X_test)
np.square(y_test - y_pred).mean()

3.6043306401609714

Checking the coefficients

In [19]:
# Fitted coefficients of the current model, one per feature column of X_train.
clf.coef_

array([-0.58525709,  1.31136753, -1.23925249,  1.35888831, -0.22282615,
        0.2884042 , -0.13628736,  0.02847524,  0.12008086,  0.00771974,
        0.58713191, -0.0503738 ,  0.75102788, -0.06006075])

In [20]:
# Feature indices ordered from smallest to largest absolute coefficient.
# A stable argsort keeps tied magnitudes in index order, and tolist() returns
# plain Python ints so the cell output is a clean list of indices.
np.argsort(np.abs(clf.coef_), kind="stable").tolist()

[9, 7, 11, 13, 8, 6, 4, 5, 0, 10, 12, 2, 1, 3]