In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, re
from sklearn import metrics, preprocessing
from scipy import signal

In [3]:
# Load the reference table from the tab-separated file 'cubic_d_spacing.csv'.
train_df = pd.read_csv('cubic_d_spacing.csv', delimiter='\t')

In [4]:
# Show the freshly loaded reference table (notebook display of the cell's last expression).
train_df

Unnamed: 0,space_group,space_code,names,peak_1,h_1,k_1,l_1,peak_2,h_2,k_2,...,k_8,l_8,peak_9,h_9,k_9,l_9,peak_10,h_10,k_10,l_10
0,Fm3m,0,Sulphohalite_0018630.txt,5.814495,1,1,1,5.035500,2,1,...,1,0,2.055734,4,2,1,1.938165,3,3,1
1,Pn3,1,Burtite_0005567.txt,5.743192,1,1,0,4.689297,1,1,...,1,0,2.448905,3,1,1,2.344648,2,2,1
2,Pn3,1,CaC2__0017416.txt,4.051722,1,1,0,3.308217,1,1,...,1,0,1.727660,3,1,1,1.654109,2,2,1
3,Pn3,1,Cafarsite_0015405.txt,11.302395,1,1,0,9.228367,1,1,...,1,0,4.819357,3,1,1,4.614183,2,2,1
4,Pn3,1,Jeanbandyite_0013052.txt,5.485027,1,1,0,4.478506,1,1,...,1,0,2.338823,3,1,1,2.239253,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Im3,6,Menezesite_0004492.txt,9.204409,1,1,0,6.508500,2,1,...,1,1,3.478940,2,1,1,3.254250,4,1,0
73,Im3,6,Perovskite group_0016660.txt,5.258046,1,1,0,3.718000,2,1,...,1,1,1.987355,3,1,1,1.859000,4,1,0
74,Im3,6,RhAs3__0017607.txt,5.975547,1,1,0,4.225350,2,1,...,1,1,2.258545,3,1,1,2.112675,4,1,0
75,Im3,6,RhSb3__0017611.txt,6.528151,1,1,0,4.616100,2,1,...,1,1,2.467409,3,1,1,2.308050,4,1,0


In [5]:
# List the distinct space-group labels present in the reference table.
train_df.loc[:, "space_group"].unique()

array(['Fm3m', 'Pn3', 'Fd3m', 'Fd3', 'Pa3', 'Im3'], dtype=object)

In [6]:
# Hand-entered value of column "a" for every row of train_df, listed in row order
# (77 values). The augmentation cell treats "a" as the cell parameter of each pattern.
# Presumably the cubic lattice constant in angstroms -- TODO confirm the unit/source.
# The assignment is positional: the list must have exactly len(train_df) entries
# (pandas raises ValueError otherwise) and must follow the row order of the CSV.
train_df["a"] = [10.071, 8.1221, 5.73, 15.984,7.757, 7.7449, 7.80, 7.8744, 7.885, 11.07343, 9.478, 10.4396, 9.457,
                 9.98850, 16.749, 16.80, 25.104, 13.962,  16.710, 14.069, 15.0850, 13.9038,12.1640, 12.213, 6.649,
                 14.04078, 5.5385, 7.89, 11.370, 5.4942, 5.582, 6.7378, 8.09, 5.6194, 5.329, 5.585, 5.7053, 6.4423,
                 12.205, 6.097, 6.7014, 6.119, 5.77, 12.2305, 5.6106, 6.943, 6.417, 5.72, 12.832, 6.4162, 5.4706,
                 8.1184, 5.644, 5.9629, 5.38,5.9665, 5.8593, 12.240, 5.6765, 5.7891, 7.391, 7.46718, 7.359, 7.455, 7.383,
                7.9743, 9.0904, 7.645, 7.5875, 8.4673, 9.2533, 9.0411, 13.017, 7.436, 8.4507, 9.2322, 8.2055 ]

In [7]:
# --- Data augmentation -------------------------------------------------------
# Every reference pattern is replicated N_COPIES times; each copy gets its cell
# parameter "a" shifted by a random amount and all ten peak positions rescaled
# by the same relative change (delta_a / a).

PEAK_COLUMNS = ["peak_%d" % n for n in range(1, 11)]  # peak_1 ... peak_10
N_COPIES = 1000          # synthetic copies generated per source row
DELTA_STEP = 0.001       # granularity of the random shift applied to "a"
MAX_DELTA_STEPS = 150    # exclusive upper bound -> shift in [0.001, 0.149]
SEED = 42                # fixed seed so the exported dataset is reproducible


def augment_pattern(pattern, rng, n_copies=N_COPIES):
    """Return ``n_copies`` randomly rescaled copies of a one-row pattern frame.

    Parameters
    ----------
    pattern : pandas.DataFrame
        Single-row frame holding the columns in ``PEAK_COLUMNS`` and ``"a"``.
    rng : numpy.random.Generator
        Random source; shared across calls so one seed fixes the whole dataset.
    n_copies : int
        Number of augmented rows to produce.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n_copies`` rows and a fresh 0..n_copies-1 index. All
        columns other than the peaks and ``"a"`` are copied unchanged.
    """
    copies = pattern.reindex(pattern.index.repeat(n_copies)).reset_index(drop=True)

    # Magnitude of the shift (multiple of DELTA_STEP) and its direction (+/-),
    # drawn independently for every copy.
    delta_a = rng.integers(1, MAX_DELTA_STEPS, size=n_copies) * DELTA_STEP
    sign = np.where(rng.integers(0, 2, size=n_copies) > 0, 1.0, -1.0)

    cell_param = copies["a"].values.astype(float)
    a_prop = delta_a / cell_param  # relative change of the cell parameter

    # Each peak moves by the same relative amount as "a":
    # new_peak = peak +/- peak * (delta_a / a).
    peaks = copies[PEAK_COLUMNS].values.astype(float)
    copies[PEAK_COLUMNS] = peaks + sign[:, None] * (peaks * a_prop[:, None])
    copies["a"] = cell_param + sign * delta_a
    return copies


# Re-seeding here makes the cell idempotent: re-running it yields the same data.
rng = np.random.default_rng(SEED)

# One augmented frame per source row; downstream cells concatenate this list.
augmentation = [augment_pattern(train_df.iloc[[i]], rng) for i in range(len(train_df))]

In [8]:
# Number of augmented frames collected: the loop appends one frame per train_df row.
len(augmentation)

77

In [9]:
# Stack the per-pattern augmented frames row-wise into one table with a fresh index.
aug_df = pd.concat(objs=augmentation, axis=0, ignore_index=True)

In [10]:
# Final training table: augmented rows first, then the unmodified reference rows,
# re-indexed from zero.
train_data = pd.concat(objs=[aug_df, train_df], axis=0, ignore_index=True)

In [11]:
# Inspect the combined table (augmented rows followed by the original reference rows).
train_data

Unnamed: 0,space_group,space_code,names,peak_1,h_1,k_1,l_1,peak_2,h_2,k_2,...,l_8,peak_9,h_9,k_9,l_9,peak_10,h_10,k_10,l_10,a
0,Fm3m,0,Sulphohalite_0018630.txt,5.774080,1,1,1,5.00050,2,1,...,0,2.041445,4,2,1,1.924693,3,3,1,10.0010
1,Fm3m,0,Sulphohalite_0018630.txt,5.838166,1,1,1,5.05600,2,1,...,0,2.064103,4,2,1,1.946055,3,3,1,10.1120
2,Fm3m,0,Sulphohalite_0018630.txt,5.820846,1,1,1,5.04100,2,1,...,0,2.057979,4,2,1,1.940282,3,3,1,10.0820
3,Fm3m,0,Sulphohalite_0018630.txt,5.765998,1,1,1,4.99350,2,1,...,0,2.038588,4,2,1,1.921999,3,3,1,9.9870
4,Fm3m,0,Sulphohalite_0018630.txt,5.767152,1,1,1,4.99450,2,1,...,0,2.038996,4,2,1,1.922384,3,3,1,9.9890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77072,Im3,6,Menezesite_0004492.txt,9.204409,1,1,0,6.50850,2,1,...,1,3.478940,2,1,1,3.254250,4,1,0,13.0170
77073,Im3,6,Perovskite group_0016660.txt,5.258046,1,1,0,3.71800,2,1,...,1,1.987355,3,1,1,1.859000,4,1,0,7.4360
77074,Im3,6,RhAs3__0017607.txt,5.975547,1,1,0,4.22535,2,1,...,1,2.258545,3,1,1,2.112675,4,1,0,8.4507
77075,Im3,6,RhSb3__0017611.txt,6.528151,1,1,0,4.61610,2,1,...,1,2.467409,3,1,1,2.308050,4,1,0,9.2322


In [12]:
# Persist the combined table as a tab-separated file; the DataFrame index is not written.
train_data.to_csv(path_or_buf="training_dataset_cubic_big.csv", sep='\t', index=False)