<a href="https://colab.research.google.com/github/gaurinotgauri/band_gap_prediction_polymer/blob/main/bandgap_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
!pip install lightgbm



In [81]:
!pip install rdkit



In [82]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.DataStructs.cDataStructs import CreateFromBitString
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


In [83]:
# Directory that holds the input CSVs. '/content' is the path this notebook
# was written against; change this single value to run it elsewhere.
DATA_DIR = '/content'

# df1: training rows, dft1: rows to predict, sub: submission template
# (its 'id' column is reused when the submission file is assembled).
df1 = pd.read_csv(f'{DATA_DIR}/train.csv')
dft1 = pd.read_csv(f'{DATA_DIR}/test.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [79]:
#generate molecular descriptors and fingerprints
def encode_smiles(smiles):
    """Encode one SMILES string as a flat dict of numeric features.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule / polymer repeat unit.

    Returns
    -------
    dict or None
        ``None`` when ``smiles`` is not a string (e.g. a missing value) or
        RDKit cannot parse it. Otherwise a dict with:

        * ``MolecularWeight``, ``NumAtoms``, ``NumBonds``, ``NumHeteroatoms``
        * ``MorganFP_<i>`` - bits of a radius-2, 2048-bit Morgan fingerprint
        * ``MACCSFP_<i>`` - bits of the MACCS keys fingerprint
    """
    # Missing CSV cells arrive as float NaN (or None); the RDKit parser raises
    # on non-string input, so treat those the same way as an unparsable SMILES.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    #molecular descriptors
    descriptors = {
        'MolecularWeight': Descriptors.MolWt(mol),
        'NumAtoms': mol.GetNumAtoms(),
        'NumBonds': mol.GetNumBonds(),
        'NumHeteroatoms': Descriptors.NumHeteroatoms(mol)
    }

    #Morgan fingerprint (radius 2, 2048 bits), one feature per bit
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    descriptors.update(
        {f'MorganFP_{i}': int(bit) for i, bit in enumerate(morgan_fp.ToBitString())}
    )

    #MACCS keys fingerprint, one feature per bit
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    descriptors.update(
        {f'MACCSFP_{i}': int(bit) for i, bit in enumerate(maccs_fp.ToBitString())}
    )

    return descriptors

# Featurise every polymer SMILES in the training and test frames.
encoded_features = df1['polymer'].apply(encode_smiles)
encoded_featurest = dft1['polymer'].apply(encode_smiles)

# encode_smiles returns None for SMILES that RDKit cannot parse, and
# pd.DataFrame cannot build rows from a list that mixes dicts with None.
# Substituting an empty dict keeps such a row in place with all-NaN features,
# so row counts still match the inputs. Reusing the source index keeps the
# column-wise concat below aligned row-for-row.
encoded_df = pd.DataFrame(
    [feats if feats is not None else {} for feats in encoded_features],
    index=df1.index,
)
encoded_dft = pd.DataFrame(
    [feats if feats is not None else {} for feats in encoded_featurest],
    index=dft1.index,
)

# Append the generated features to the original columns.
df = pd.concat([df1, encoded_df], axis=1)
dft = pd.concat([dft1, encoded_dft], axis=1)


In [84]:
# Remove the identifier and the raw SMILES text so they are not fed to the
# model. Rebinding (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without raising KeyError on already-dropped columns.
df = df.drop(columns=['id', 'polymer'], errors='ignore')
dft = dft.drop(columns=['id', 'polymer'], errors='ignore')

In [85]:
# Shapes of: test features, training frame (features + band_gap), sample submission.
(dft.shape, df.shape, sub.shape)

((845, 2303), (2534, 2304), (845, 2))

In [86]:
# Separate the regression target from the feature matrix.
X = df.drop(columns=['band_gap'])
y = df['band_gap']

# The previous StandardScaler().fit_transform(X) result was never used: the
# split below has always operated on the unscaled X. It was removed so the
# notebook no longer suggests that the model is trained on scaled features.

# Hold out 20% of the labelled rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
#LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05
}

#LightGBM dataset format; the eval set references the training set
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# The number of boosting rounds is given through lgb.train's own argument.
# Passing 'n_estimators' inside params only worked through alias handling
# and made LightGBM emit a warning.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_eval])

#predicting on the held-out split
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# mean_squared_error(..., squared=False) was removed from scikit-learn, so take
# the square root explicitly; this works on both old and new versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error on Test Set:", rmse)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13194
[LightGBM] [Info] Number of data points in the train set: 2027, number of used features: 669
[LightGBM] [Info] Start training from score 4.556972
Root Mean Squared Error on Test Set: 0.5821554868337167


In [88]:
# Score the competition test features and wrap the predictions in a
# single-column frame named after the target.
pred = gbm.predict(dft)
bandgap = pd.Series(pred, name='band_gap').to_frame()

In [89]:
# Pair the ids from the sample submission with the predicted band gaps, side by side.
submission_df = pd.concat(
    objs=[sub.loc[:, 'id'], bandgap],
    axis='columns',
)

In [90]:
# Preview the first five submission rows.
submission_df.head(n=5)

Unnamed: 0,id,band_gap
0,402,5.32
1,321,3.01
2,457,3.27
3,879,2.6
4,1536,2.77


In [91]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='bandgap.csv', index=False)