<a href="https://colab.research.google.com/github/deepsharma26/SIRT1_Main/blob/Data_spliting_and_Feature_selection/MACCS_FP_gen_and_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#PartA: Generating MACCS fingerprint for SIRT1 based dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip -o fingerprints_xml.zip

--2025-08-21 12:45:00--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2025-08-21 12:45:01--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2025-08-21 12:45:01 (87.0 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFin

In [3]:
import glob
xml_files = glob.glob("*.xml")
xml_files.sort()
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [5]:
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [6]:
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [7]:
df = pd.read_csv('/content/SIRT1_04_bioactivity_data_3class_pIC50.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 933 entries, 0 to 932
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          933 non-null    int64  
 1   molecule_chembl_id  933 non-null    object 
 2   canonical_smiles    933 non-null    object 
 3   Class               933 non-null    object 
 4   MW                  933 non-null    float64
 5   LogP                933 non-null    float64
 6   NumHDonors          933 non-null    float64
 7   NumHAcceptors       933 non-null    float64
 8   pIC50               933 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 65.7+ KB


In [8]:
df = pd.concat( [df['canonical_smiles'],df['molecule_chembl_id']], axis=1 )
df.to_csv('molecule.smi', sep='\t', index=False, header=False)
df

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,NC(=O)C1CCCc2c1[nH]c1ccc(Cl)cc21,CHEMBL420311
1,Cc1ccc2[nH]c3c(c2c1)CCCC3C(N)=O,CHEMBL115600
2,NC(=O)C1CCCc2c1[nH]c1ccccc21,CHEMBL112265
3,NC(=O)C1CCCCc2c1[nH]c1ccc(Cl)cc21,CHEMBL446446
4,CCOC(=O)C1CCCc2c1[nH]c1ccc(Cl)cc21,CHEMBL171137
...,...,...
928,O=C(Nc1ccccc1)Nc1ccc(NC(=O)c2ccccc2)cc1,CHEMBL4793948
929,O=C(NCCc1c[nH]c2ccccc12)c1cc2ccccc2cc1O,CHEMBL4749004
930,O=C(O)CCNC(=S)NCCCNc1nc(Nc2ccccc2Cl)ncc1C(=O)N...,CHEMBL5416344
931,O=C(O)CCNC(=S)NCCCNc1nc(Nc2ccccc2)nc(Nc2ccccc2...,CHEMBL5424599


In [9]:
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [10]:
!pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


#Here the fingerprint name has to be choosen (MACCS)

In [11]:
from padelpy import padeldescriptor

fingerprint = 'MACCS'

fingerprint_output_file = ''.join([fingerprint,'.csv']) #MACCS.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule.smi',
                d_file=fingerprint_output_file, #'MACCS.csv'
                #descriptortypes='MACCSFingerprint.xml',
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

In [12]:
descriptors = pd.read_csv(fingerprint_output_file)
descriptors

Unnamed: 0,Name,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
0,CHEMBL420311,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
1,CHEMBL115600,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
2,CHEMBL112265,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
3,CHEMBL446446,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
4,CHEMBL171137,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,CHEMBL4793948,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
929,CHEMBL4749004,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
930,CHEMBL5416344,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
931,CHEMBL5424599,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0


#PartB: Feature Selection

To select the best features we need the "Class or Bioactivity" column along with generated MACCS fartures. We have manually added the Bioactivity column or last column.

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler


data = pd.read_csv('/content/MACCS_bioactivity_class.csv') #bioactivitycolumn has been added manually
data
print(data.columns)
data.loc[data['BioactivityClass'] == 'Active', 'BioactivityClass'] = 'P'
data.loc[data['BioactivityClass'] == 'Inactive', 'BioactivityClass'] = 'N'
data
data['BioactivityClass'].replace('P', '1', inplace=True)
data['BioactivityClass'].replace('N', '0', inplace=True)
data.rename(columns = {"BioactivityClass": "BioactivityClass"}, inplace = True)
data

Index(['Name', 'MACCSFP1', 'MACCSFP2', 'MACCSFP3', 'MACCSFP4', 'MACCSFP5',
       'MACCSFP6', 'MACCSFP7', 'MACCSFP8', 'MACCSFP9',
       ...
       'MACCSFP158', 'MACCSFP159', 'MACCSFP160', 'MACCSFP161', 'MACCSFP162',
       'MACCSFP163', 'MACCSFP164', 'MACCSFP165', 'MACCSFP166',
       'BioactivityClass'],
      dtype='object', length=168)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['BioactivityClass'].replace('P', '1', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['BioactivityClass'].replace('N', '0', inplace=True)


Unnamed: 0,Name,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166,BioactivityClass
0,CHEMBL420311,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,1,1,0,1
1,CHEMBL115600,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,0,1
2,CHEMBL112265,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,1,1,0,0
3,CHEMBL446446,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,1,1,0,1
4,CHEMBL171137,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,CHEMBL4793948,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0
929,CHEMBL4749004,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0
930,CHEMBL5416344,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0
931,CHEMBL5424599,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0


In [20]:
data.isnull().sum().sum()
data.dropna(inplace =True)
data.isnull().sum().sum()
data1 = data.iloc[:,1:167]
data1
data2 = data.iloc[:,-1]
data2

Unnamed: 0,BioactivityClass
0,1
1,1
2,0
3,1
4,0
...,...
928,0
929,0
930,0
931,0


In [21]:
from sklearn.feature_selection import VarianceThreshold
threshold = VarianceThreshold(threshold=.1)
def variance_threshold_selector(data1, threshold=0.10):
    selector = VarianceThreshold(threshold)
    selector.fit(data1)
    return data1[data1.columns[selector.get_support(indices=True)]]

data1_HV = variance_threshold_selector(data1)

#Clean column names again (precaution)
data1_HV.columns = data1_HV.columns.str.strip()

if 'BioactivityClass' in data1_HV.columns:
    data1_HV = data1_HV.drop(columns=['BioactivityClass'])
data2_HV = pd.merge(data1_HV, data2, left_index=True, right_index=True)
data2_HV.columns = data2_HV.columns.str.strip()
if 'BioactivityClass_x' in data2_HV.columns and 'BioactivityClass_y' in data2_HV.columns:
    data2_HV = data2_HV.drop(columns=['BioactivityClass_x'])
    data2_HV = data2_HV.rename(columns={'BioactivityClass_y': 'BioactivityClass'})

data2_HV

Unnamed: 0,MACCSFP16,MACCSFP36,MACCSFP38,MACCSFP43,MACCSFP47,MACCSFP50,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP57,...,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP155,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,BioactivityClass
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,1,1
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0,0
929,0,0,0,0,0,0,0,1,0,0,...,1,1,1,1,1,1,1,1,0,0
930,0,0,1,1,1,0,0,1,0,0,...,1,1,0,1,1,1,1,1,0,0
931,0,0,0,1,1,0,0,1,0,0,...,1,1,0,1,1,1,1,1,0,0


In [22]:
correlated_features_1 = set()
corr_matrix_class = data2_HV.drop(columns=['BioactivityClass']).corr()


for i in range(len(corr_matrix_class.columns)):
    for j in range(i):
        if abs(corr_matrix_class.iloc[i, j]) > 0.9:
            colname = corr_matrix_class.columns[i]
            correlated_features_1.add(colname)
correlated_features_1
def remove_correlated_features(features, data):
    for x in features:
        data.drop(x, axis=1, inplace=True)
    return data
data3 = remove_correlated_features(correlated_features_1, data2_HV)
data3

Unnamed: 0,MACCSFP16,MACCSFP36,MACCSFP38,MACCSFP43,MACCSFP47,MACCSFP50,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP57,...,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP155,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,BioactivityClass
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,1,1
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0,0
929,0,0,0,0,0,0,0,1,0,0,...,1,1,1,1,1,1,1,1,0,0
930,0,0,1,1,1,0,0,1,0,0,...,1,1,0,1,1,1,1,1,0,0
931,0,0,0,1,1,0,0,1,0,0,...,1,1,0,1,1,1,1,1,0,0


In [23]:
X = data3.iloc[:,1:84]  #independent columns
X
y = data3.iloc[:,-1]
y

Unnamed: 0,BioactivityClass
0,1
1,1
2,0
3,1
4,0
...,...
928,0
929,0
930,0
931,0


In [24]:
scaler = StandardScaler()
X_transform = scaler.fit_transform(X)
X_transform_pd = pd.DataFrame(X_transform, columns = X.columns)
X_transform_pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
kBest = SelectKBest(mutual_info_classif, k = 10)
X_kBestFeatures = kBest.fit_transform(X,y)
X_kBestFeatures.shape

kBest = SelectKBest(mutual_info_classif, k = 20)
X_kBestFeatures = kBest.fit_transform(X,y)
X_kBestFeatures.shape

kBest = SelectKBest(mutual_info_classif, k = 30)
X_kBestFeatures = kBest.fit_transform(X,y)
X_kBestFeatures.shape

kBest.get_support(True)
X_kBestFeatures = X.iloc[:, kBest.get_support(True)]
X_kBestFeatures.columns
f = kBest.get_support(1)
X_new = X[X.columns[f]]
X_new
data_scores = pd.DataFrame(kBest.scores_)
data_columns = pd.DataFrame(X.columns)
featureScores = pd.concat([data_columns, data_scores], axis = 1)

#naming the dataframe columns
featureScores.columns = ['Features', 'Scores']
featureScores

Unnamed: 0,Features,Scores
0,MACCSFP36,0.023270
1,MACCSFP38,0.009485
2,MACCSFP43,0.000000
3,MACCSFP47,0.015280
4,MACCSFP50,0.017481
...,...,...
78,MACCSFP155,0.007429
79,MACCSFP157,0.012754
80,MACCSFP158,0.020911
81,MACCSFP159,0.010101


In [25]:
Graph = (featureScores.nlargest(10, 'Scores'))
Graph
Graph = (featureScores.nlargest(20, 'Scores'))
Graph
Graph = (featureScores.nlargest(30, 'Scores'))
Graph
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import pandas as pd
import matplotlib.pyplot as plt

# Ensure X is DataFrame and convert y to NumPy array
X = data3.iloc[:,1:84]  #independent columns
y_array = data3.iloc[:,-1].values

#Top 10 Features
k = 10
kBest = SelectKBest(mutual_info_classif, k=k)
# Pass X as DataFrame and y_array as NumPy array
X_new = kBest.fit_transform(X, y_array)
selected_mask = kBest.get_support()
selected_columns = X.columns[selected_mask]
selected_scores = kBest.scores_[selected_mask]

# Step 2: Create and save top-k feature scores CSV
featureScores = pd.DataFrame({'Features': selected_columns, 'Scores': selected_scores})
Graph = featureScores.nlargest(k, 'Scores')  # Sorted top-k features
Graph.to_csv(f'MACCSFeatures_top{k}.csv', index=False)

# Step 3: Plot and save bar chart
data = pd.read_csv(f'MACCSFeatures_top{k}.csv')
df = pd.DataFrame(data)
# X_labels = list(df.iloc[:, 0])
# Y_scores = list(df.iloc[:, 1])

plt.figure(figsize=(10, 6))
plt.bar(df['Features'], df['Scores'], color='r')
plt.title(f"MACCS Feature_importance (Top {k})", fontweight="bold")
plt.xlabel("Features", fontweight='bold')
plt.ylabel("Scores", fontweight='bold')
plt.xticks(rotation=80, fontweight="bold")
plt.yticks(fontweight="bold")
plt.tight_layout()
plt.savefig(f"Features_top{k}.jpg", bbox_inches='tight')
plt.close()

# Step 4: Combine selected features and target, save dataset
Atom_pair_2d_FS2 = pd.concat([X[selected_columns].reset_index(drop=True), data3.iloc[:,-1].reset_index(drop=True)], axis=1) # Use original y for concat
Atom_pair_2d_FS2.to_csv(f'MACCS2_Trainset_top{k}.csv', index=False)


#Top 20 Features
k = 20
kBest = SelectKBest(mutual_info_classif, k=k)
X_new = kBest.fit_transform(X, y_array) # Use y_array
selected_mask = kBest.get_support()
selected_columns = X.columns[selected_mask]
selected_scores = kBest.scores_[selected_mask]
featureScores = pd.DataFrame({'Features': selected_columns, 'Scores': selected_scores})
Graph = featureScores.nlargest(k, 'Scores')
Graph.to_csv(f'MACCSFeatures_top{k}.csv', index=False)

data = pd.read_csv(f'MACCSFeatures_top{k}.csv')
df = pd.DataFrame(data)
plt.figure(figsize=(10, 6))
plt.bar(df['Features'], df['Scores'], color='r')
plt.title(f"MACCS Feature_importance (Top {k})", fontweight="bold")
plt.xlabel("Features", fontweight='bold')
plt.ylabel("Scores", fontweight='bold')
plt.xticks(rotation=80, fontweight="bold")
plt.yticks(fontweight="bold")
plt.tight_layout()
plt.savefig(f"Features_top{k}.jpg", bbox_inches='tight')
plt.close()

Atom_pair_2d_FS2 = pd.concat([X[selected_columns].reset_index(drop=True), data3.iloc[:,-1].reset_index(drop=True)], axis=1) # Use original y for concat
Atom_pair_2d_FS2.to_csv(f'MACCS2_Trainset_top{k}.csv', index=False)


#Top 30 Features
k = 30
kBest = SelectKBest(mutual_info_classif, k=k)
X_new = kBest.fit_transform(X, y_array) # Use y_array
selected_mask = kBest.get_support()
selected_columns = X.columns[selected_mask]
selected_scores = kBest.scores_[selected_mask]
featureScores = pd.DataFrame({'Features': selected_columns, 'Scores': selected_scores})
Graph = featureScores.nlargest(k, 'Scores')
Graph.to_csv(f'MACCSFeatures_top{k}.csv', index=False)

data = pd.read_csv(f'MACCSFeatures_top{k}.csv')
df = pd.DataFrame(data)
plt.figure(figsize=(10, 6))
plt.bar(df['Features'], df['Scores'], color='r')
plt.title(f"MACCS Feature_importance (Top {k})", fontweight="bold")
plt.xlabel("Features", fontweight='bold')
plt.ylabel("Scores", fontweight='bold')
plt.xticks(rotation=80, fontweight="bold")
plt.yticks(fontweight="bold")
plt.tight_layout()
plt.savefig(f"Features_top{k}.jpg", bbox_inches='tight')
plt.close()

Atom_pair_2d_FS2 = pd.concat([X[selected_columns].reset_index(drop=True), data3.iloc[:,-1].reset_index(drop=True)], axis=1) # Use original y for concat
Atom_pair_2d_FS2.to_csv(f'MACCS2_Trainset_top{k}.csv', index=False)

Downloading data in zip file

In [29]:
! zip SIRT1_MACCS_FP_Gen_and_selection.zip *.csv *.jpg

updating: MACCS2_Trainset_top10.csv (deflated 93%)
updating: MACCS2_Trainset_top20.csv (deflated 94%)
updating: MACCS2_Trainset_top30.csv (deflated 94%)
updating: MACCS_bioactivity_class.csv (deflated 94%)
updating: MACCS.csv (deflated 94%)
updating: MACCSFeatures_top10.csv (deflated 48%)
updating: MACCSFeatures_top20.csv (deflated 55%)
updating: MACCSFeatures_top30.csv (deflated 58%)
updating: SIRT1_04_bioactivity_data_3class_pIC50.csv (deflated 76%)
  adding: Features_top10.jpg (deflated 52%)
  adding: Features_top20.jpg (deflated 43%)
  adding: Features_top30.jpg (deflated 39%)
