# Ridge Regression for Ionic Conductivity

### Import Libraries

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error


### Import data

In [3]:
#elyte_conductivity_data = pd.read_csv("/Users/natalieclemans/Desktop/MatSci176/Project/CALiSol-23 Dataset.csv")
#elyte_conductivity_data = pd.read_csv("CALiSol-23 Dataset.csv")
elyte_conductivity_data = pd.read_csv("Pre-processed CALiSol-23 Dataset.csv")
print(elyte_conductivity_data.columns)

FileNotFoundError: [Errno 2] No such file or directory: 'Pre-processed CALiSol-23 Dataset.csv'

In [16]:
print(f'Number of solvents: {len(elyte_conductivity_data.columns) - 7}')

Number of solvents: 38


### Arrange data

In [6]:
# elyte_conductivity_data.drop(['c units', 'solvent ratio type'], axis=1)
cols = elyte_conductivity_data.columns[2:]

X = elyte_conductivity_data[cols]
X = X.drop(['c units', 'solvent ratio type'],axis=1)
y = elyte_conductivity_data['k']
# print(X)

lambdas = [.1, 1, 10, 100]


### Assign salts to their MW

In [7]:
molar_weights_salts = {'LiPF6' : 151.91, 'LiBF4': 93.75,
                      'LiFSI': 187.7, 'LiTDI' : 192.1, 'LiPDI' : 242.1, 'LiTFSI' : 287.07, 'LiClO4' : 160.44, 'LiAsF6' : 195.9,
 'LiBOB' : 193.79, 'LiCF3SO3' : 156.01, 'LiBPFPB' : 193.8, ' LiBPFPB' : 193.8, 'LiBMB': 221.85, 'LiN(CF3SO2)2' : 287.07} # [g/mol]

salt_names_col = list(elyte_conductivity_data['salt'])
print(f'salt names: {salt_names_col}')
salts_weights_col = salt_names_col

for s in range(len(salt_names_col)):
    salt = salt_names_col[s]
    if not salt.isalnum:
        print(f'WRONG! {salt}')
    salts_weights_col[s] = molar_weights_salts.get(salt)
print(salts_weights_col)
i = 0
for s in salts_weights_col:
    if s == None:
        print(f'WRONG! {s}, index {i}')
    i+=1

salt_df = pd.DataFrame({'salt weights': salts_weights_col})
print(f'Length of salt names: {len(salt_names_col)}; len of salt weights: {len(salt_df)}')
nulls = salt_df.isnull()
null_salts = []
i = 0
for n in nulls:
    if n:
        null_salts.append(salt_names_col[i])
        print(salt_names_col)
    i+=1
print(null_salts)

salt names: ['LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiFSI', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiBF4', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiTDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPDI', 'LiPF6', 'LiPF6', 'LiPF6', 'LiPF6

In [14]:
print(f'number of salts: {len(molar_weights_salts)}')

number of salts: 14


### Switch out salt column

In [9]:
X.insert(2, 'salt weights', salts_weights_col)
X = X.drop('salt',axis=1)


### Regress

In [10]:
ridge = Ridge(alpha=0.1)

# Final cleanup
print(f'Nulls in Y: {y.isnull().sum()}')
i = 0
for val in y:
    if val != val:
        print(f'WRONG! {val}, index: {i}')
        y = y.drop(i)
        X = X.drop(i,axis=0)
    i+=1
print(f'Nulls left now in Y: {y.isnull().sum()}')
# Test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,train_size=0.8, random_state=137)

ridge.fit(X_train,y_train)

ridge_cv = RidgeCV(alphas=lambdas)
ridge_cv.fit(X_train,y_train)
best_lambda = ridge_cv.alpha_
print(f'best lambda is {best_lambda}')

Nulls in Y: 523
WRONG! nan, index: 799
WRONG! nan, index: 813
WRONG! nan, index: 814
WRONG! nan, index: 827
WRONG! nan, index: 828
WRONG! nan, index: 829
WRONG! nan, index: 830
WRONG! nan, index: 841
WRONG! nan, index: 842
WRONG! nan, index: 843
WRONG! nan, index: 844
WRONG! nan, index: 845
WRONG! nan, index: 1023
WRONG! nan, index: 1024
WRONG! nan, index: 1037
WRONG! nan, index: 1038
WRONG! nan, index: 1039
WRONG! nan, index: 1051
WRONG! nan, index: 1052
WRONG! nan, index: 1053
WRONG! nan, index: 1054
WRONG! nan, index: 1055
WRONG! nan, index: 1205
WRONG! nan, index: 1219
WRONG! nan, index: 1233
WRONG! nan, index: 1247
WRONG! nan, index: 1248
WRONG! nan, index: 1249
WRONG! nan, index: 1261
WRONG! nan, index: 1262
WRONG! nan, index: 1263
WRONG! nan, index: 1264
WRONG! nan, index: 1265
WRONG! nan, index: 1266
WRONG! nan, index: 1443
WRONG! nan, index: 1457
WRONG! nan, index: 1458
WRONG! nan, index: 1471
WRONG! nan, index: 1472
WRONG! nan, index: 1473
WRONG! nan, index: 1474
WRONG! nan, 

### Evaluate

In [11]:
y_pred = ridge.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"R² Score: {r2:.4f}, which is ")
print(f"MSE: {mse:.4f}")



R² Score: 0.6501, which is 
MSE: 5.9334


### Try predicting k for a new set of observations

In [12]:
print(X.columns)
# 12.37,332.15,0.6524,LiPF6,mol/kg,w,0,0.9,0,0,0.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

new_X = [[350,0.8,151, 0,.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.4,0,0,0,0,0,0,0,0,0]]
predicted_y_new = ridge.predict(new_X)
print(f"Predicted ionic conductivity for new observations: {predicted_y_new[0]:.4f}")
print(f'The average k value is {y.mean()}')

Index(['T', 'c', 'salt weights', 'EC', 'PC', 'DMC', 'EMC', 'DEC', 'DME',
       'DMSO', 'AN', 'MOEMC', 'TFP', 'EA', 'MA', 'FEC', 'DOL', '2-MeTHF',
       'DMM', 'Freon 11', 'Methylene chloride', 'THF', 'Toluene', 'Sulfolane',
       '2-Glyme', '3-Glyme', '4-Glyme', '3-Me-2-Oxazolidinone',
       '3-MeSulfolane', 'Ethyldiglyme', 'DMF', 'Ethylbenzene',
       'Ethylmonoglyme', 'Benzene', 'g-Butyrolactone', 'Cumene',
       'Propylsulfone', 'Pseudocumeme', 'TEOS', 'm-Xylene', 'o-Xylene'],
      dtype='object')
Predicted ionic conductivity for new observations: 8.3569
The average k value is 4.120550101970756


