# Ridge Regression for Ionic Conductivity

### Import Libraries

In [1]:
pip install plotly


Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import plotly

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error


### Import data

In [3]:
#elyte_conductivity_data = pd.read_csv("/Users/natalieclemans/Desktop/MatSci176/Project/CALiSol-23 Dataset.csv")
#elyte_conductivity_data = pd.read_csv("CALiSol-23 Dataset.csv")
elyte_conductivity_data = pd.read_csv("Pre-processed CALiSol Data.csv")
print(elyte_conductivity_data.columns)

Index(['Unnamed: 0', 'doi', 'k', 'T', 'c', 'salt', 'c units',
       'solvent ratio type', 'EC', 'PC', 'DMC', 'EMC', 'DEC', 'DME', 'DMSO',
       'AN', 'MOEMC', 'TFP', 'EA', 'MA', 'FEC', 'DOL', '2-MeTHF', 'DMM',
       'Freon 11', 'Methylene chloride', 'THF', 'Toluene', 'Sulfolane',
       '2-Glyme', '3-Glyme', '4-Glyme', '3-Me-2-Oxazolidinone',
       '3-MeSulfolane', 'Ethyldiglyme', 'DMF', 'Ethylbenzene',
       'Ethylmonoglyme', 'Benzene', 'g-Butyrolactone', 'Cumene',
       'Propylsulfone', 'Pseudocumeme', 'TEOS', 'm-Xylene', 'o-Xylene'],
      dtype='object')


In [4]:
print(f'Number of solvents: {len(elyte_conductivity_data.columns) - 9}')

Number of solvents: 37


### Arrange data

In [5]:
# elyte_conductivity_data.drop(['c units', 'solvent ratio type'], axis=1)
cols = elyte_conductivity_data.columns[3:]

X = elyte_conductivity_data[cols]
X = X.drop(['c units', 'solvent ratio type'],axis=1)
y = elyte_conductivity_data['k']

lambdas = [.1, 1, 10, 100]


### Select data from most interesting salts
#### LiPF6, LiBF4, LiAsF6, LiBOB

In [6]:
# I wish I could write beautiful, object oriented code here but that just wouldn't be me now would it
# LiPF6
indices_LiPF6 = X[X['salt'] == 'LiPF6'].index
X_LiPF6 = X[X['salt'] == 'LiPF6'].drop(['salt'], axis=1)
y_LiPF6 = y.loc[indices_LiPF6]

# LiBF4
indices_LiBF4 = X[X['salt'] == 'LiBF4'].index
X_LiBF4 = X[X['salt'] == 'LiBF4'].drop(['salt'], axis=1)
y_LiBF4 = y.loc[indices_LiBF4]

# LiAsF6
indices_LiAsF6 = X[X['salt'] == 'LiAsF6'].index
X_LiAsF6 = X[X['salt'] == 'LiAsF6'].drop(['salt'], axis=1)
y_LiAsF6 = y.loc[indices_LiAsF6]

# LiBOB
indices_LiBOB = X[X['salt'] == 'LiBOB'].index
X_LiBOB = X[X['salt'] == 'LiBOB'].drop(['salt'], axis=1)
y_LiBOB = y.loc[indices_LiBOB]

### Function for implementing ridge regression, and evaluation

In [7]:
def ridgeregress(X,y,a):
    ridge = Ridge(alpha=a)
    # Test-train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,train_size=0.8, random_state=137)
    ridge.fit(X_train,y_train)
    ridge_cv = RidgeCV(alphas=lambdas)
    ridge_cv.fit(X_train,y_train)
    best_lambda = ridge_cv.alpha_
    print(f'best lambda is {best_lambda}')

    # Eval
    y_pred = ridge.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return r2, mse
    
r2s = np.zeros(4)
mses = np.zeros(4)

### LiPF6 Ridge Regression ("1")

In [8]:
r2_LiPF6, mse_LiPF6 = ridgeregress(X_LiPF6,y_LiPF6,0.1)
print(f"R² Score: {r2_LiPF6:.4f}")
print(f"MSE: {mse_LiPF6:.4f}")
r2s[0] = r2_LiPF6
mses[0] = mse_LiPF6

best lambda is 0.1
R² Score: 0.8082
MSE: 3.4263


### LiBF4 Ridge Regression ("2")

In [9]:
r2_LiBF4, mse_LiBF4 = ridgeregress(X_LiBF4,y_LiBF4,10.0)
print(f"R² Score: {r2_LiBF4:.4f}")
print(f"MSE: {mse_LiBF4:.4f}")
r2s[1] = r2_LiBF4
mses[1] = mse_LiBF4

best lambda is 10.0
R² Score: 0.7822
MSE: 0.6196


### LiAsF6 Ridge Regression ("3")

In [10]:
r2_LiAsF6, mse_LiAsF6 = ridgeregress(X_LiAsF6,y_LiAsF6,10.0)
print(f"R² Score: {r2_LiAsF6:.4f}")
print(f"MSE: {mse_LiAsF6:.4f}")
r2s[2] = r2_LiAsF6
mses[2] = mse_LiAsF6

best lambda is 10.0
R² Score: 0.7655
MSE: 8.9498


### LiBOB Ridge Regression ("4")

In [11]:
r2_LiBOB, mse_LiBOB = ridgeregress(X_LiBOB,y_LiBOB,1.0)
print(f"R² Score: {r2_LiBOB:.4f}")
print(f"MSE: {mse_LiBOB:.4f}")
r2s[3] = r2_LiBOB
mses[3] = mse_LiBOB

best lambda is 1.0
R² Score: 0.8103
MSE: 2.7253


### What do we make of the data now that we've got it?

In [12]:
print(f'The R² scores in order: {r2s}; and the MSEs: {mses}')

The R² scores in order: [0.80817195 0.78216833 0.76551763 0.81027494]; and the MSEs: [3.42634628 0.61957036 8.94975681 2.72526094]


In [24]:
import plotly.graph_objects as go

# Datasets (electrolytes)
datasets = ["LiPF₆", "LiBF₄", "LiAsF₆", "LiBOB"]

# R² scores for each model on each dataset
ridge_r2 = [0.8082, 0.7822, 0.7655, 0.8103]
lasso_r2 = [0.79, 0.78, 0.77, 0.805] # Random data
nn_r2 = [0.81, 0.785, 0.76, 0.812]  # Random data

# Create bars for each model
fig = go.Figure()

fig.add_trace(go.Bar(x=datasets, y=ridge_r2, name="Ridge", marker_color="blue"))
fig.add_trace(go.Bar(x=datasets, y=lasso_r2, name="Lasso", marker_color="red"))
fig.add_trace(go.Bar(x=datasets, y=nn_r2, name="Neural Network", marker_color="green"))

# Layout
fig.update_layout(
    title="R² Scores for Different Models Across Electrolytes",
    xaxis_title="Electrolyte",
    yaxis_title="R² Score",
    barmode="group",  # 'group' places bars side by side; use 'stack' for stacked bars
    template="plotly_white",
    yaxis=dict(range=[0.75, 0.85])
)

fig.show()


### Assign salts to their MW
#### Skip this maybe, if we're selecting a few salts

In [14]:
molar_weights_salts = {'LiPF6' : 151.91, 'LiBF4': 93.75,
                      'LiFSI': 187.7, 'LiTDI' : 192.1, 'LiPDI' : 242.1, 'LiTFSI' : 287.07, 'LiClO4' : 160.44, 'LiAsF6' : 195.9,
 'LiBOB' : 193.79, 'LiCF3SO3' : 156.01, 'LiBPFPB' : 193.8, ' LiBPFPB' : 193.8, 'LiBMB': 221.85, 'LiN(CF3SO2)2' : 287.07} # [g/mol]

salt_names_col = list(elyte_conductivity_data['salt'])
print(f'salt names: {salt_names_col}')
salts_weights_col = salt_names_col

for s in range(len(salt_names_col)):
    salt = salt_names_col[s]
    if not salt.isalnum:
        print(f'WRONG! {salt}')
    salts_weights_col[s] = molar_weights_salts.get(salt)
print(salts_weights_col)
i = 0
for s in salts_weights_col:
    if s == None:
        print(f'WRONG! {s}, index {i}')
    i+=1

salt_df = pd.DataFrame({'salt weights': salts_weights_col})
print(f'Length of salt names: {len(salt_names_col)}; len of salt weights: {len(salt_df)}')
nulls = salt_df.isnull()
null_salts = []
i = 0
for n in nulls:
    if n:
        null_salts.append(salt_names_col[i])
        print(salt_names_col)
    i+=1
print(null_salts)

salt names: ['LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6

In [15]:
# print(f'number of salts: {len(molar_weights_salts)}')

### Switch out salt column

In [16]:
X.insert(2, 'salt weights', salts_weights_col)
X = X.drop('salt',axis=1)


### Regress

In [17]:
ridge = Ridge(alpha=1)

# Final cleanup
print(f'Nulls in Y: {y.isnull().sum()}')
i = 0
for val in y:
    if val != val:
        print(f'WRONG! {val}, index: {i}')
        y = y.drop(i)
        X = X.drop(i,axis=0)
    i+=1
print(f'Nulls left now in Y: {y.isnull().sum()}')
# Test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,train_size=0.8, random_state=137)

ridge.fit(X_train,y_train)

ridge_cv = RidgeCV(alphas=lambdas)
ridge_cv.fit(X_train,y_train)
best_lambda = ridge_cv.alpha_
print(f'best lambda is {best_lambda}')

Nulls in Y: 0
Nulls left now in Y: 0
best lambda is 0.1


### Evaluate

In [18]:
y_pred = ridge.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"R² Score: {r2:.4f}, which is ")
print(f"MSE: {mse:.4f}")



R² Score: 0.6493, which is 
MSE: 5.9477


### Try predicting k for a new set of observations

In [19]:
print(X.columns)
# 12.37,332.15,0.6524,LiPF6,mol/kg,w,0,0.9,0,0,0.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

new_X = [[350,0.8,151, 0,.3,0,0,0,0,0,0,0,.3,0,0,.03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.37,0,0,0,0,0,0,0,0,0]]
predicted_y_new = ridge.predict(new_X)
print(f"Predicted ionic conductivity for new observations: {predicted_y_new[0]:.4f}")
print(f'The average k value is {y.mean()} and the min is {y.min()}')

Index(['T', 'c', 'salt weights', 'EC', 'PC', 'DMC', 'EMC', 'DEC', 'DME',
       'DMSO', 'AN', 'MOEMC', 'TFP', 'EA', 'MA', 'FEC', 'DOL', '2-MeTHF',
       'DMM', 'Freon 11', 'Methylene chloride', 'THF', 'Toluene', 'Sulfolane',
       '2-Glyme', '3-Glyme', '4-Glyme', '3-Me-2-Oxazolidinone',
       '3-MeSulfolane', 'Ethyldiglyme', 'DMF', 'Ethylbenzene',
       'Ethylmonoglyme', 'Benzene', 'g-Butyrolactone', 'Cumene',
       'Propylsulfone', 'Pseudocumeme', 'TEOS', 'm-Xylene', 'o-Xylene'],
      dtype='object')
Predicted ionic conductivity for new observations: 6.4396
The average k value is 4.120550101970756 and the min is -0.172510519



X does not have valid feature names, but Ridge was fitted with feature names

