# Ridge Regression for Ionic Conductivity

### Import Libraries

In [13]:
pip install plotly


Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
import plotly

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error


### Import data

In [2]:
#elyte_conductivity_data = pd.read_csv("/Users/natalieclemans/Desktop/MatSci176/Project/CALiSol-23 Dataset.csv")
#elyte_conductivity_data = pd.read_csv("CALiSol-23 Dataset.csv")
elyte_conductivity_data = pd.read_csv("Pre-processed CALiSol Data.csv")
print(elyte_conductivity_data.columns)

Index(['Unnamed: 0', 'doi', 'k', 'T', 'c', 'salt', 'c units',
       'solvent ratio type', 'EC', 'PC', 'DMC', 'EMC', 'DEC', 'DME', 'DMSO',
       'AN', 'MOEMC', 'TFP', 'EA', 'MA', 'FEC', 'DOL', '2-MeTHF', 'DMM',
       'Freon 11', 'Methylene chloride', 'THF', 'Toluene', 'Sulfolane',
       '2-Glyme', '3-Glyme', '4-Glyme', '3-Me-2-Oxazolidinone',
       '3-MeSulfolane', 'Ethyldiglyme', 'DMF', 'Ethylbenzene',
       'Ethylmonoglyme', 'Benzene', 'g-Butyrolactone', 'Cumene',
       'Propylsulfone', 'Pseudocumeme', 'TEOS', 'm-Xylene', 'o-Xylene'],
      dtype='object')


In [3]:
print(f'Number of solvents: {len(elyte_conductivity_data.columns) - 9}')

Number of solvents: 37


### Arrange data

In [4]:
# elyte_conductivity_data.drop(['c units', 'solvent ratio type'], axis=1)
cols = elyte_conductivity_data.columns[3:]

X = elyte_conductivity_data[cols]
X = X.drop(['c units', 'solvent ratio type'],axis=1)
y = elyte_conductivity_data['k']

lambdas = [.1, 1, 10, 100]


### Select data from most interesting salts
#### LiPF6, LiBF4, LiAsF6, LiBOB

In [5]:
# I wish I could write beautiful, object oriented code here but that just wouldn't be me now would it
# LiPF6
indices_LiPF6 = X[X['salt'] == 'LiPF6'].index
X_LiPF6 = X[X['salt'] == 'LiPF6'].drop(['salt'], axis=1)
y_LiPF6 = y.loc[indices_LiPF6]

# LiBF4
indices_LiBF4 = X[X['salt'] == 'LiBF4'].index
X_LiBF4 = X[X['salt'] == 'LiBF4'].drop(['salt'], axis=1)
y_LiBF4 = y.loc[indices_LiBF4]

# LiAsF6
indices_LiAsF6 = X[X['salt'] == 'LiAsF6'].index
X_LiAsF6 = X[X['salt'] == 'LiAsF6'].drop(['salt'], axis=1)
y_LiAsF6 = y.loc[indices_LiAsF6]

# LiBOB
indices_LiBOB = X[X['salt'] == 'LiBOB'].index
X_LiBOB = X[X['salt'] == 'LiBOB'].drop(['salt'], axis=1)
y_LiBOB = y.loc[indices_LiBOB]

### Function for implementing ridge regression, and evaluation

In [6]:
def ridgeregress(X,y,a):
    ridge = Ridge(alpha=a)
    # Test-train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,train_size=0.8, random_state=137)
    ridge.fit(X_train,y_train)
    ridge_cv = RidgeCV(alphas=lambdas)
    ridge_cv.fit(X_train,y_train)
    best_lambda = ridge_cv.alpha_
    print(f'best lambda is {best_lambda}')

    # Eval
    y_pred = ridge.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return r2, mse
    
r2s = np.zeros(4)
mses = np.zeros(4)

### LiPF6 Ridge Regression ("1")

In [7]:
r2_LiPF6, mse_LiPF6 = ridgeregress(X_LiPF6,y_LiPF6,0.1)
print(f"R² Score: {r2_LiPF6:.4f}")
print(f"MSE: {mse_LiPF6:.4f}")
r2s[0] = r2_LiPF6
mses[0] = mse_LiPF6

best lambda is 0.1
R² Score: 0.8082
MSE: 3.4263


### LiBF4 Ridge Regression ("2")

In [8]:
r2_LiBF4, mse_LiBF4 = ridgeregress(X_LiBF4,y_LiBF4,10.0)
print(f"R² Score: {r2_LiBF4:.4f}")
print(f"MSE: {mse_LiBF4:.4f}")
r2s[1] = r2_LiBF4
mses[1] = mse_LiBF4

best lambda is 10.0
R² Score: 0.7822
MSE: 0.6196


### LiAsF6 Ridge Regression ("3")

In [9]:
r2_LiAsF6, mse_LiAsF6 = ridgeregress(X_LiAsF6,y_LiAsF6,10.0)
print(f"R² Score: {r2_LiAsF6:.4f}")
print(f"MSE: {mse_LiAsF6:.4f}")
r2s[2] = r2_LiAsF6
mses[2] = mse_LiAsF6

best lambda is 10.0
R² Score: 0.7655
MSE: 8.9498


### LiBOB Ridge Regression ("4")

In [10]:
r2_LiBOB, mse_LiBOB = ridgeregress(X_LiBOB,y_LiBOB,1.0)
print(f"R² Score: {r2_LiBOB:.4f}")
print(f"MSE: {mse_LiBOB:.4f}")
r2s[3] = r2_LiBOB
mses[3] = mse_LiBOB

best lambda is 1.0
R² Score: 0.8103
MSE: 2.7253


### What do we make of the data now that we've got it?

In [11]:
print(f'The R² scores in order: {r2s}; and the MSEs: {mses}')

The R² scores in order: [0.80817195 0.78216833 0.76551763 0.81027494]; and the MSEs: [3.42634628 0.61957036 8.94975681 2.72526094]
