In [67]:
import numpy as np
import pandas as pd

# library to upsample the classes
from sklearn.utils import resample

In [68]:
# Load the formatted feature table from the Excel workbook; every later cell
# works from this `df` (and the upsampling cell below rebinds it).
df=pd.read_excel('Formatted_Data.xlsx')
# Show the column labels: the feature columns plus the 'Class' response column.
df.columns

Index(['Dilation_Average', 'Erosion_Average', 'C_Kurtosis', 'C_ Max',
       'C_Skewness', 'C_Variance', 'D_Kurtosis', 'D_ Max', 'D_Skewness',
       'D_Variance', 'E_Kurtosis', 'E_ Max', 'E_Skewness', 'E_Variance',
       'O_Kurtosis', 'O_ Max', 'O_Skewness', 'O_Variance', 'Nc_D', 'Nc_E',
       'Nmsw_D', 'Class'],
      dtype='object')

In [69]:
# function to generate upsampled data
def generateNoise(D, mu=0, sigma=0.001, random_state=None):
    """Return a copy of ``D`` with Gaussian noise added to every feature column.

    The response column ``'Class'`` is carried over unchanged and is placed
    last in the returned frame. ``D`` itself is never modified.

    Parameters
    ----------
    D : pandas.DataFrame
        Must contain a ``'Class'`` column; every other column must be numeric
        so that noise can be added to it.
    mu : float, default 0
        Mean of the noise (zero keeps the features unbiased).
    sigma : float, default 0.001
        Standard deviation of the noise; small but non-zero by default.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState`` so the noise can be
        reproduced. With ``None`` the global NumPy generator is used.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``D``: noisy features + ``'Class'``.

    Raises
    ------
    KeyError
        If ``D`` has no ``'Class'`` column.
    """
    # Look the response up first so a missing column fails fast with KeyError.
    response = D['Class']

    # Non-inplace drop: an in-place drop would strip 'Class' from the caller's
    # DataFrame as a hidden side effect.
    features = D.drop(['Class'], axis=1)

    # Use the global generator unless a seed was requested, so existing
    # np.random.seed(...) based workflows keep behaving the same.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # One independent noise value per feature cell (response column excluded).
    noise = rng.normal(mu, sigma, features.shape)

    noisy = features + noise
    # Assign by position: the frame may carry duplicate index labels (e.g. after
    # resampling + concat), where label-based alignment would be ambiguous.
    noisy['Class'] = response.values
    return noisy

def upSampling(data,requiredSamples):
    """Grow ``data`` by bootstrap resampling, then perturb it with noise.

    ``requiredSamples`` rows are drawn from ``data`` with replacement (fixed
    seed 123) and appended to the original rows, so the result holds
    ``len(data) + requiredSamples`` rows. The stacked frame keeps the original
    row labels (the concat does not renumber them) and is passed through
    ``generateNoise`` before being returned.
    """
    # bootstrap draw: sampling with replacement, seeded for repeatability
    extra_rows = resample(
        data,
        n_samples=requiredSamples,
        replace=True,
        random_state=123,
    )
    # original rows first, resampled rows after; then add the Gaussian noise
    # that stands in for real-life measurement variation
    return generateNoise(pd.concat([data, extra_rows]))

In [70]:
# Number of rows to draw with replacement. upSampling appends these to the
# original rows, so the final frame has len(df) + SampleCount rows.
SampleCount=50000

In [71]:
# NOTE: this rebinds `df` to the upsampled frame, so the cell is not idempotent.
# Running it a second time resamples the already-upsampled data; re-run the
# load cell first to start again from the raw table.
df=upSampling(df,SampleCount)
# Preview the first rows of the noisy, upsampled frame.
df.head()

Unnamed: 0,Dilation_Average,Erosion_Average,C_Kurtosis,C_ Max,C_Skewness,C_Variance,D_Kurtosis,D_ Max,D_Skewness,D_Variance,...,E_Skewness,E_Variance,O_Kurtosis,O_ Max,O_Skewness,O_Variance,Nc_D,Nc_E,Nmsw_D,Class
0,1.005025,-1.002908,1.523871,0.473403,0.000777,0.099156,1.542107,1.472998,-0.006917,0.099124,...,0.006496,0.099466,1.526059,0.472104,0.000275,0.099121,1.2567,-0.521177,101.484009,Very_Light
1,0.993316,-1.012023,1.524415,0.463618,-0.00023,0.100421,1.541501,1.460817,-0.006714,0.100438,...,0.008397,0.099548,1.522642,0.462198,-0.000367,0.096548,1.260661,-0.494107,101.586946,Very_Light
2,0.998489,-1.009219,1.525321,0.468282,0.001303,0.09924,1.5411,1.469462,-0.008148,0.099068,...,0.008137,0.100705,1.525556,0.465166,-0.001598,0.100958,1.258329,-0.505472,101.532778,Very_Light
3,1.000499,-1.007116,1.525339,0.469114,-0.001708,0.099181,1.540017,1.469426,-0.00731,0.100589,...,0.00904,0.100271,1.524009,0.468294,0.00142,0.100961,1.258298,-0.512631,101.513196,Very_Light
4,1.005222,-1.002099,1.525669,0.475433,0.000591,0.100357,1.542298,1.474034,-0.009715,0.101331,...,0.009223,0.101484,1.526413,0.472763,-0.000591,0.101906,1.25649,-0.525279,101.463403,Very_Light


In [72]:
# Sanity check: row count should equal the original rows plus SampleCount.
df.shape

(50025, 22)

In [73]:
# Rows per class after upsampling. The resample call is not stratified, so the
# class counts come out only approximately equal.
df.Class.value_counts()

High          10171
Very_Light    10042
Moderate      10014
Light          9915
Very_High      9883
Name: Class, dtype: int64

# Remarks:

As seen earlier, the number of samples per class was too small to train any ML algorithm; a model fitted on so little data would most likely overfit.
So we upsampled the data and added some Gaussian noise to simulate real-life measurements.

After that we have a dataset of 50,025 rows (the original rows plus 50,000 resampled ones), all with some noise added.

In [74]:
# create a csv file to store the upsampled data. we will apply ML algorithm on top of that

In [75]:
# Persist the upsampled data for the modelling step. index=False because the
# row labels are repeated after resampling and carry no information.
df.to_csv('upsampledData.csv',index=False)