In [7]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 4.8 kB/s eta 0:00:01    |█▏                              | 788 kB 302 kB/s eta 0:01:12     |█▏                              | 829 kB 302 kB/s eta 0:01:11     |██▍                             | 1.7 MB 116 kB/s eta 0:02:57     |███                             | 2.1 MB 94 kB/s eta 0:03:35     |███▎                            | 2.3 MB 88 kB/s eta 0:03:45     |███▍                            | 2.4 MB 89 kB/s eta 0:03:43     |████▍                           | 3.1 MB 129 kB/s eta 0:02:29     |████▍                           | 3.1 MB 129 kB/s eta 0:02:29     |█████▊                          | 4.0 MB 138 kB/s eta 0:02:12     |█████▉                          | 4.1 MB 138 kB/s eta 0:02:12     |██████▍                         | 4.4 MB 144 kB/s eta 0:02:04     |████████▊                       | 6.1 MB 97 kB/s eta 0:02:46     |███████████▊        

In [20]:
# Select matplotlib's inline backend so figures are shown in the notebook output.
%matplotlib inline

In [21]:
import random
from random import shuffle

import matplotlib.pyplot as plt
import mendeleev as mendel
import numpy as np
import pymatgen as pymat
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Element symbols grouped by the crystal structure named in each list.
fcc_elements = ["Ag", "Al", "Au", "Cu", "Ir", "Ni", "Pb", "Pd", "Pt", "Rh", "Th", "Yb"]
bcc_elements = ["Ba", "Cr", "Eu", "Fe", "Li", "Mn", "Mo", "Na", "Nb", "Ta", "V", "W"]
hcp_elements = ["Be", "Ca", "Cd", "Co", "Dy", "Er", "Gd", "Hf", "Ho", "Lu", "Mg", "Re",
                "Ru", "Sc", "Tb", "Ti", "Tl", "Tm", "Y", "Zn", "Zr"]
others = ["Sb", "Sm", "Bi", "Ce", "Sn", "Si"]
# Others (solids): "Sb", "Sm" and "Bi" are rhombohedral; "Ce" and "Sn" are allotropic;
# "Si" is face-centered diamond-cubic.
# ("As", "C" and "Ge" fall in the same categories but are not included in the list above.)

elements = fcc_elements + bcc_elements + hcp_elements + others

# Shuffle the elements so that every group can be represented in both the
# training and the testing set. A dedicated, seeded generator keeps the order
# (and therefore the split and the fitted models) identical on every run
# without touching the global random state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(elements)

# One list per property; entry i of every list belongs to elements[i].
data_youngs_modulus = []
data_lattice_constant = []
data_melting_point = []
data_specific_heat = []
data_atomic_mass = []
data_CTE = []

for symbol in elements:
    # Look each element up once per library instead of once per property.
    pymatgen_element = pymat.Element(symbol)
    mendeleev_element = mendel.element(symbol)

    data_youngs_modulus.append(pymatgen_element.youngs_modulus)
    data_lattice_constant.append(mendeleev_element.lattice_constant)
    data_melting_point.append(mendeleev_element.melting_point)
    data_specific_heat.append(mendeleev_element.specific_heat)
    data_atomic_mass.append(pymatgen_element.atomic_mass)
    data_CTE.append(pymatgen_element.coefficient_of_linear_thermal_expansion)

In [23]:
# Print every property list, one per line, to inspect its contents.
for property_values in (
    data_youngs_modulus,
    data_lattice_constant,
    data_melting_point,
    data_specific_heat,
    data_atomic_mass,
    data_CTE,
):
    print(property_values)

[128.0, 70.0, 50.0, 50.0, 447.0, 69.0, 78.0, 24.0, 8.0, 83.0, 79.0, 108.0, 287.0, 411.0, 47.0, 116.0, 45.0, 34.0, 13.0, 279.0, 16.0, 20.0, 329.0, 211.0, 56.0, 10.0, 200.0, 55.0, 209.0, 105.0, 32.0, 186.0, 70.0, 78.0, 463.0, 50.0, 64.0, 275.0, 74.0, 61.0, 18.0, 198.0, 65.0, 130.0, 121.0, 4.9, 168.0, 55.0, 528.0, 74.0, 68.0]
[3.02, 4.05, 9.0, 2.98, 2.7, 3.51, 4.08, 5.49, 3.46, 4.09, 5.08, 2.66, 2.29, 3.16, 5.43, 2.95, 3.21, 5.16, 5.02, 2.88, 4.95, 5.58, 3.15, 2.87, 3.6, 4.23, 3.52, 3.64, 2.51, 3.3, 4.75, 3.31, 3.56, 3.2, 2.76, 5.82, 3.65, 3.8, 3.54, 3.59, 4.61, 8.89, 3.58, 3.61, 3.89, 3.49, 3.92, 4.51, 3.84, 3.31, 3.23]
[2160.0, 933.5, 1350.0, 594.1, 2583.0, 1936.0, 1337.58, 1097.0, 576.6, 1235.1, 2028.0, 692.73, 1551.0, 3680.0, 1683.0, 1933.0, 922.0, 1072.0, 1002.0, 2130.0, 600.65, 1112.0, 2890.0, 1808.0, 1629.0, 370.96, 1726.0, 1586.0, 1768.0, 2741.0, 544.5, 3269.0, 1802.0, 2503.0, 3453.0, 505.1, 1795.0, 2239.0, 1818.0, 1685.0, 1095.0, 1517.0, 1747.0, 1356.6, 1825.0, 553.69, 2045.0, 90

In [24]:
# Here we organize the data in a format scikit-learn accepts.
# We will develop linear models that relate Young's modulus, CTE and
# lattice constant to the melting temperature.

# Single boundary between training and testing data: the first N_TRAIN entries
# train the model and every remaining entry tests it. Deriving both slices from
# one index guarantees the two sets never overlap and never drop a sample if
# the element list changes length (with 51 elements this is 45 train / 6 test).
N_TRAIN = 45


def _split_column(values, n_train=N_TRAIN):
    """Split a flat list into (train, test) column arrays.

    The list [x, y, z] is turned into the vertical NumPy array
    [[x], [y], [z]] via reshape(-1, 1), the 2-D layout passed to the
    scikit-learn linear model in this notebook.

    Parameters:
        values: flat sequence of property values, one per element.
        n_train: number of leading entries that go to the training set.

    Returns:
        Tuple (train, test) of arrays with shapes (n_train, 1) and
        (len(values) - n_train, 1).
    """
    train = np.array(values[:n_train]).reshape(-1, 1)
    test = np.array(values[n_train:]).reshape(-1, 1)
    return train, test


# Each data set is divided into training and test data.
melt_train, melt_test = _split_column(data_melting_point)        # Melting point
young_train, young_test = _split_column(data_youngs_modulus)     # Young's modulus
lattice_train, lattice_test = _split_column(data_lattice_constant)  # Lattice constant
specheat_train, specheat_test = _split_column(data_specific_heat)   # Specific heat
mass_train, mass_test = _split_column(data_atomic_mass)          # Atomic mass
coefTE_train, coefTE_test = _split_column(data_CTE)              # CTE

In [25]:
def regression(x_train, x_test, y_train, y_test):
    """Fit a linear model on the training data and predict the whole data set.

    The model is trained on (x_train, y_train) only. Predictions, the mean
    squared error and the R^2 score (printed as "Variance score") are then
    computed on the training and testing data joined together.

    Parameters:
        x_train, x_test: feature arrays, joined along axis 0 for prediction.
        y_train, y_test: target arrays, joined along axis 0 for scoring.

    Returns:
        The model's predictions for x_train followed by x_test.

    Raises:
        ValueError: if the fitted model has more than one coefficient or
            intercept, since the printed equation describes a single line.
    """
    # Define the model and train it
    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)

    # Join train + test data
    full_x = np.concatenate((x_train, x_test), axis=0)
    full_y = np.concatenate((y_train, y_test), axis=0)

    # Use the model to predict the entire set of data
    predictions = model.predict(full_x)

    # coef_ / intercept_ may be size-1 arrays rather than plain numbers (they
    # are when y is a column vector). Feeding such arrays to "%e" relies on an
    # implicit array-to-scalar conversion that NumPy has deprecated, so the
    # scalars are extracted explicitly.
    slope = np.asarray(model.coef_).item()
    intercept = np.asarray(model.intercept_).item()

    # Print model and mean squared error and variance score
    print("Linear Equation: %.4e X + (%.4e)" % (slope, intercept))
    print("Mean squared error: %.4e" % (mean_squared_error(full_y, predictions)))
    print('Variance score: %.4f' % r2_score(full_y, predictions))

    return predictions

In [35]:
import plotly #This is the library import
import plotly.graph_objs as go # This is the graphical object (Think "plt" in Matplotlib if you have used that before)
from plotly.offline import iplot # These lines are necessary to run Plotly in Jupyter Notebooks, but not in a dedicated environment

# Initialise plotly's offline notebook mode; this runs before any figure is passed to iplot().
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of embedding it - confirm.
plotly.offline.init_notebook_mode(connected=True)

def plot(x_train, x_test, y_train, y_test, x_label, y_label, predictions):
    """Show training data, testing data and the fitted model in one plotly figure.

    Parameters:
        x_train, x_test, y_train, y_test: NumPy arrays holding the data; each
            one is flattened to a plain Python list before plotting.
        x_label, y_label: axis titles.
        predictions: NumPy array of model outputs, plotted against the
            training x values followed by the testing x values.

    Returns:
        None. The figure is displayed through iplot().
    """

    def as_flat_list(array):
        # [[x], [y], [z]] -> [x, y, z]; the traces below are built from plain lists.
        return array.reshape(-1).tolist()

    def axis_style(axis_title):
        # Shared axis look: given title, no zero line, grid width 1, 18 pt tick labels.
        return dict(title=axis_title, zeroline=False, gridwidth=1,
                    tickfont=dict(size=18))

    train_x = as_flat_list(x_train)
    test_x = as_flat_list(x_test)
    train_y = as_flat_list(y_train)
    test_y = as_flat_list(y_test)
    model_y = as_flat_list(predictions)
    all_x = train_x + test_x

    # A plotly figure needs a layout and at least one trace.
    # Layout: 800x600 canvas, closest-point hover labels, visible legend.
    figure_layout = go.Layout(
        hovermode='closest',
        width=800,
        height=600,
        showlegend=True,
        xaxis=axis_style(go.layout.xaxis.Title(text=x_label, font=dict(size=24))),
        yaxis=axis_style(go.layout.yaxis.Title(text=y_label, font=dict(size=24))),
        legend=dict(font=dict(size=24)),
    )

    # Green markers: the data in the training set.
    training_trace = go.Scatter(
        x=train_x, y=train_y, mode='markers',
        marker=dict(size=10, color='green'), name="Training Data",
    )

    # Red markers: the data in the testing set.
    testing_trace = go.Scatter(
        x=test_x, y=test_y, mode='markers',
        marker=dict(size=10, color='red'), name="Testing Data",
    )

    # Blue line: the model's predictions over all x values.
    model_trace = go.Scatter(
        x=all_x, y=model_y, mode='lines',
        line=dict(color="blue", width=1.5), name="Model",
    )

    figure = go.Figure([training_trace, testing_trace, model_trace],
                       layout=figure_layout)
    iplot(figure)

In [36]:
# Fit Young's modulus against melting temperature with the regression helper.
predictions = regression(
    x_train=melt_train,
    x_test=melt_test,
    y_train=young_train,
    y_test=young_test,
)

# Plot the data together with the results from that model.
plot(
    x_train=melt_train,
    x_test=melt_test,
    y_train=young_train,
    y_test=young_test,
    x_label="Melting Temperature (K)",
    y_label="Young's Modulus (GPa)",
    predictions=predictions,
)

Linear Equation: 1.1004e-01 X + (-5.7949e+01)
Mean squared error: 7.5680e+03
Variance score: 0.5203


In [37]:
# Fit lattice constant against melting temperature with the regression helper.
predictions = regression(
    x_train=melt_train,
    x_test=melt_test,
    y_train=lattice_train,
    y_test=lattice_test,
)

# Plot the data together with the results from that model.
plot(
    x_train=melt_train,
    x_test=melt_test,
    y_train=lattice_train,
    y_test=lattice_test,
    x_label="Melting Temperature (K)",
    y_label="Lattice Constant (Å)",
    predictions=predictions,
)

Linear Equation: -6.0676e-04 X + (4.9991e+00)
Mean squared error: 1.5334e+00
Variance score: 0.1073


In [38]:
# Fit the thermal expansion coefficient against melting temperature.
predictions = regression(
    x_train=melt_train,
    x_test=melt_test,
    y_train=coefTE_train,
    y_test=coefTE_test,
)

# Plot the data together with the results from that model.
plot(
    x_train=melt_train,
    x_test=melt_test,
    y_train=coefTE_train,
    y_test=coefTE_test,
    x_label="Melting Temperature (K)",
    y_label="Coefficient of Linear Thermal Expansion (K<sup>-1</sup>)",
    predictions=predictions,
)

Linear Equation: -9.9213e-09 X + (3.1516e-05)
Mean squared error: 7.8782e-11
Variance score: 0.4516
