## The purpose of this notebook is to create training datasets for PolymerGPT

The data were originally sourced from TransPolymer, which provides ten distinct downstream polymer property training datasets: Eea, Egb, Egc, Ei, EPS, Nc, OPV, Xc, PE_I, and PE_II. Six of these datasets share the same structure, consisting of two columns: "smiles" and "value".

The goal is to combine the datasets to obtain data on multiple properties for each polymer string.

### Making a dataset of polymers that appear in exactly 6 datasets

In [11]:
import pandas as pd

# Source CSV for every property dataset included in the analysis, keyed by property name
file_paths = {
    'Egc': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Egc.csv',
    'Egb': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Egb.csv',
    'Eea': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Eea.csv',
    'Ei': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Ei.csv',
    'Xc': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Xc.csv',
    'EPS': '/home/dion/projects/New_TransPolymer/TransPolymer/data/EPS.csv',
    'Nc': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Nc.csv'
}

# Read each CSV and tag every row with the name of the dataset it came from
datasets = {}
for label, csv_path in file_paths.items():
    frame = pd.read_csv(csv_path)
    frame['dataset'] = label
    datasets[label] = frame

# Stack the tagged frames into one long table with a fresh 0..n-1 index
all_data = pd.concat(list(datasets.values()), ignore_index=True)

# For each SMILES string, count how many distinct datasets contain it
polymer_counts = (
    all_data.groupby('smiles')['dataset']
    .nunique()
    .reset_index()
    .rename(columns={'dataset': 'dataset_count'})
)

# Keep the SMILES strings whose dataset count is exactly 6
in_six = polymer_counts['dataset_count'] == 6
smiles_in_6_datasets = polymer_counts.loc[in_six, 'smiles']

# Restrict the long table to the rows belonging to those polymers
row_mask = all_data['smiles'].isin(smiles_in_6_datasets)
polymers_in_6_datasets = all_data[row_mask]

# Reshape to one row per polymer and one column per dataset, filled from the
# 'value' column; aggfunc='first' collapses any repeated (polymer, dataset)
# pair to a single entry
polymers_pivoted = pd.pivot_table(
    polymers_in_6_datasets,
    index='smiles',
    columns='dataset',
    values='value',
    aggfunc='first',
).reset_index()

# Show the pivoted table
print(polymers_pivoted)

# Write the pivoted table to disk without the row index
polymers_pivoted.to_csv('/home/dion/projects/New_TransPolymer/TransPolymer/Dion/data_Dion/323_combined_dataset.csv', index=False)



dataset                       smiles   EPS     Eea     Egb     Egc      Ei  \
0         *C(=O)C(F)(F)C(=O)C(*)(F)F  3.77  4.1382  3.1408  4.4683  7.3587   
1             *C(=O)C(F)(F)CC(*)(F)F  3.64  3.3520  4.9978  4.9534  8.1829   
2             *C(=O)C(F)(F)NC(*)(F)F  4.75  3.7985  5.0875  4.8818  8.5584   
3             *C(=O)CC(F)(F)C(*)(F)F  3.37  2.8445  5.0395  5.5534  7.9225   
4             *C(=O)NC(F)(F)C(*)(F)F  3.53  2.4953  5.4991  5.8517  8.3456   
..                               ...   ...     ...     ...     ...     ...   
318      *c1ccc(CC(F)(F)C(*)(F)F)cc1  3.38  1.7485  5.1100  5.1278  7.0010   
319      *c1ccc(NC(F)(F)C(*)(F)F)cc1  3.92  1.5240  4.1626  4.4728  6.1304   
320       *c1ccc(NC(F)(F)C(*)(F)F)s1  4.50  1.8646  3.3898  4.7359  5.8710   
321      *c1ccc(OC(=S)c2ccc(*)s2)cc1  5.49  3.1094  2.0472  2.8495  5.9615   
322                     *c1cccc(*)n1  4.35  2.2143  2.9424  3.8339  6.0452   

dataset      Nc  Xc  
0        1.6328 NaN  
1        1.6107 NaN

### Making a dataset that combines all seven polymer property datasets

In [14]:
import pandas as pd

# Source CSV for every property dataset to merge, keyed by property name
file_paths = {
    'Egc': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Egc.csv',
    'Egb': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Egb.csv',
    'Eea': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Eea.csv',
    'Ei': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Ei.csv',
    'Xc': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Xc.csv',
    'EPS': '/home/dion/projects/New_TransPolymer/TransPolymer/data/EPS.csv',
    'Nc': '/home/dion/projects/New_TransPolymer/TransPolymer/data/Nc.csv'
}

# Read each CSV and rename its 'value' column to '<name>_property' so the
# property columns remain distinguishable once the frames are merged
datasets = {
    label: pd.read_csv(csv_path).rename(columns={'value': f'{label}_property'})
    for label, csv_path in file_paths.items()
}

# Fold the frames together on 'smiles', seeding with the first dataset.
# The outer join keeps every polymer string that occurs in any dataset.
remaining = iter(datasets.values())
combined_data = next(remaining, None)
for frame in remaining:
    combined_data = combined_data.merge(frame, on='smiles', how='outer')

# Show the merged table
display(combined_data)

# Write the merged table to disk without the row index
combined_data.to_csv('/home/dion/projects/New_TransPolymer/TransPolymer/Dion/data_Dion/3654_combined_dataset.csv', index=False)



Unnamed: 0,smiles,Egc_property,Egb_property,Eea_property,Ei_property,Xc_property,EPS_property,Nc_property
0,*/C=C\C(C#N)C*,5.3057,,,,,,
1,*=CC1CC(C=*)C2CN(c3cc(C(F)(F)F)cc(C(F)(F)F)c3)...,4.3204,,,,,,
2,*=CC1OC(C=*)C2C(=O)N(c3cc(C(F)(F)F)cc(C(F)(F)F...,5.0857,,,,,,
3,*=CC1OC(C=*)C2C(=O)N(c3ccc(C(F)(F)F)cc3)C(=O)C12,5.3690,,,,,,
4,*=CC1OC(C=*)C2C(=O)N(c3cccc(C(F)(F)F)c3)C(=O)C12,5.0879,,,,,,
...,...,...,...,...,...,...,...,...
3649,*c1sc(-c2cc(OCCCCC)c(*)s2)cc1OCCCCC,1.2005,,,,,,
3650,*c1sc(-c2cc(SCC(C)CC)c(*)s2)cc1SCC(C)CC,1.6105,,,,,,
3651,*c1sc(-c2sc(-c3sc(*)c4nccnc34)c3c2OC(CCCCCC)CO...,1.6167,,,,,,
3652,*c1sc(-c2sc(-c3sc(*)c4nccnc34)c3c2OCCO3)c2c1OCCO2,1.0217,,,,,,


In [16]:
# Verify missing values in the combined dataset

# Reload the combined CSV from disk so the check covers the saved file
combined_data = pd.read_csv('/home/dion/projects/New_TransPolymer/TransPolymer/Dion/data_Dion/3654_combined_dataset.csv')

# Number of NaN values in each column
print(combined_data.isna().sum())

# Summary with non-null counts and dtypes. DataFrame.info() prints the summary
# itself and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
combined_data.info()


smiles             0
Egc_property     274
Egb_property    3093
Eea_property    3286
Ei_property     3284
Xc_property     3222
EPS_property    3272
Nc_property     3272
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3654 entries, 0 to 3653
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   smiles        3654 non-null   object 
 1   Egc_property  3380 non-null   float64
 2   Egb_property  561 non-null    float64
 3   Eea_property  368 non-null    float64
 4   Ei_property   370 non-null    float64
 5   Xc_property   432 non-null    float64
 6   EPS_property  382 non-null    float64
 7   Nc_property   382 non-null    float64
dtypes: float64(7), object(1)
memory usage: 228.5+ KB
None
