In [92]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [93]:
# Load the feature-selected voice dataset from CSV into `df`.
# NOTE(review): 'assests' looks like a misspelling of 'assets'; it is left
# untouched because it must match the directory name on disk (the export at
# the end of the notebook writes to the same folder) — confirm before renaming.
df = pd.read_csv('../assests/feature_selected_voice_data.csv')

In [94]:
# List the column names of the loaded frame.
df.columns

Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'kurt', 'sp.ent', 'sfm',
       'mode', 'meanfun', 'minfun', 'modindx', 'label'],
      dtype='object')

In [95]:
# Print per-column dtype and non-null counts, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   kurt      3168 non-null   float64
 6   sp.ent    3168 non-null   float64
 7   sfm       3168 non-null   float64
 8   mode      3168 non-null   float64
 9   meanfun   3168 non-null   float64
 10  minfun    3168 non-null   float64
 11  modindx   3168 non-null   float64
 12  label     3168 non-null   object 
dtypes: float64(12), object(1)
memory usage: 321.9+ KB


##### Number of null values in each column

In [96]:
# Per-column count of missing values (`isna` is the same check as `isnull`).
df.isna().sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
kurt        0
sp.ent      0
sfm         0
mode        0
meanfun     0
minfun      0
modindx     0
label       0
dtype: int64

In [97]:
# Summary statistics of the numeric columns, one row per feature.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meanfreq,3168.0,0.180907,0.029918,0.039363,0.163662,0.184838,0.199146,0.251124
sd,3168.0,0.057126,0.016652,0.018363,0.041954,0.059155,0.06702,0.115273
median,3168.0,0.185621,0.03636,0.010975,0.169593,0.190032,0.210618,0.261224
Q25,3168.0,0.140456,0.04868,0.000229,0.111087,0.140286,0.175939,0.247347
Q75,3168.0,0.224765,0.023639,0.042946,0.208747,0.225684,0.24366,0.273469
kurt,3168.0,36.568461,134.928661,2.068455,5.669547,8.318463,13.648905,1309.612887
sp.ent,3168.0,0.895127,0.04498,0.738651,0.861811,0.901767,0.928713,0.981997
sfm,3168.0,0.408216,0.177521,0.036876,0.258041,0.396335,0.533676,0.842936
mode,3168.0,0.165282,0.077203,0.0,0.118016,0.186599,0.221104,0.28
meanfun,3168.0,0.142807,0.032304,0.055565,0.116998,0.140519,0.169581,0.237636


##### Duplicate rows in the data

There are only two duplicate rows in the dataset.

There are very few duplicates, so there is no need to remove them.

In [98]:
# Show every row that repeats an earlier row exactly (first occurrence excluded).
is_repeat = df.duplicated()
df.loc[is_repeat]

Unnamed: 0,meanfreq,sd,median,Q25,Q75,kurt,sp.ent,sfm,mode,meanfun,minfun,modindx,label
298,0.213732,0.057705,0.242573,0.141701,0.257984,7.890927,0.859712,0.084934,0.248978,0.133667,0.028319,0.229051,male
2403,0.21219,0.04319,0.215153,0.188957,0.245644,6.10979,0.877669,0.314398,0.188957,0.139942,0.047198,0.121344,female


In [99]:
from sklearn.preprocessing import StandardScaler

Standardize the dataset (zero mean and unit variance for each feature)

In [100]:
# Standardize every feature column to zero mean and unit variance.
# The 'label' column (object dtype) is excluded from scaling; it is dropped
# once here instead of being re-dropped for every call.
features = df.drop(columns='label')

scalar = StandardScaler()

# fit_transform learns the per-column mean/std and applies them in one pass;
# `scalar` is left fitted, exactly as after separate fit() + transform() calls.
scaled_values = scalar.fit_transform(features)

# Take the column names and the row index from the frame that was actually
# scaled, rather than slicing df.columns by position. This keeps the result
# correctly labelled even if 'label' is not the last column, and keeps the
# rows aligned with `df` for any later index-based assignment.
scaled_df = pd.DataFrame(
    scaled_values,
    columns=features.columns,
    index=features.index,
)

Shuffle the dataset randomly (because the male and female samples are grouped together)

In [101]:
# Attach the class labels to the scaled features, then shuffle the rows.

# Fixed seed so that the shuffle (and therefore the exported CSV) is identical
# on every Restart & Run All.
SHUFFLE_SEED = 42

# Attach the labels only once. The assignment aligns on the row index, which
# matches `df` only while `scaled_df` is still in its original order; after
# the shuffle below the index is renumbered, so re-attaching on a re-run of
# this cell would silently pair labels with the wrong rows.
if 'gender' not in scaled_df.columns:
    scaled_df = scaled_df.assign(gender=df['label'])

# frac=1 returns every row in random order; reset_index(drop=True) renumbers
# the rows 0..n-1 and discards the pre-shuffle index.
scaled_df = (
    scaled_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [102]:
# Preview the first rows of the scaled, labelled and shuffled frame.
scaled_df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,kurt,sp.ent,sfm,mode,meanfun,minfun,modindx,gender
0,-1.017139,2.012536,0.155201,-1.784546,0.116305,-0.230102,1.407711,1.638474,0.699608,1.235429,0.317097,0.185051,female
1,-0.242956,1.167716,0.209318,-0.236093,0.295384,-0.244366,1.617131,1.963082,0.471396,0.061898,-1.072361,0.505507,female
2,0.594624,-0.628856,0.383436,0.459674,0.778532,-0.223221,0.329348,-1.721108,-0.005193,0.131058,0.996114,-0.042381,male
3,0.410492,-1.41122,0.30699,0.786895,-0.472162,-0.225505,-1.279564,-1.02981,0.466451,1.474043,3.100587,0.767026,female
4,0.804964,-1.414826,0.70937,0.811088,0.051118,-0.234219,-0.741627,-1.293786,0.70713,0.488538,0.692233,-0.538644,female


In [103]:
# Check dtypes and non-null counts of the processed frame before exporting it.
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3168 entries, 0 to 3167
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   kurt      3168 non-null   float64
 6   sp.ent    3168 non-null   float64
 7   sfm       3168 non-null   float64
 8   mode      3168 non-null   float64
 9   meanfun   3168 non-null   float64
 10  minfun    3168 non-null   float64
 11  modindx   3168 non-null   float64
 12  gender    3168 non-null   object 
dtypes: float64(12), object(1)
memory usage: 346.5+ KB


In [104]:
# Write the processed frame to CSV; index=False leaves the row index out of
# the file.
scaled_df.to_csv('../assests/scaled_voice_data.csv', index=False)