In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../assests/feature_selected_voice_data.csv')

In [3]:
df.columns

Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'kurt', 'sp.ent',
       'sfm', 'mode', 'meanfun', 'minfun', 'modindx', 'label'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   kurt      3168 non-null   float64
 7   sp.ent    3168 non-null   float64
 8   sfm       3168 non-null   float64
 9   mode      3168 non-null   float64
 10  meanfun   3168 non-null   float64
 11  minfun    3168 non-null   float64
 12  modindx   3168 non-null   float64
 13  label     3168 non-null   object 
dtypes: float64(13), object(1)
memory usage: 346.6+ KB


##### Number of Null values in every columns

In [5]:
df.isnull().sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
kurt        0
sp.ent      0
sfm         0
mode        0
meanfun     0
minfun      0
modindx     0
label       0
dtype: int64

In [6]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meanfreq,3168.0,0.180907,0.029918,0.039363,0.163662,0.184838,0.199146,0.251124
sd,3168.0,0.057126,0.016652,0.018363,0.041954,0.059155,0.06702,0.115273
median,3168.0,0.185621,0.03636,0.010975,0.169593,0.190032,0.210618,0.261224
Q25,3168.0,0.140456,0.04868,0.000229,0.111087,0.140286,0.175939,0.247347
Q75,3168.0,0.224765,0.023639,0.042946,0.208747,0.225684,0.24366,0.273469
IQR,3168.0,0.084309,0.042783,0.014558,0.04256,0.09428,0.114175,0.252225
kurt,3168.0,36.568461,134.928661,2.068455,5.669547,8.318463,13.648905,1309.612887
sp.ent,3168.0,0.895127,0.04498,0.738651,0.861811,0.901767,0.928713,0.981997
sfm,3168.0,0.408216,0.177521,0.036876,0.258041,0.396335,0.533676,0.842936
mode,3168.0,0.165282,0.077203,0.0,0.118016,0.186599,0.221104,0.28


##### Duplicates values in Data

Only two Duplicate Data in Datset

Not much duplicates present, no need to remove

In [7]:
df[df.duplicated()]

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,kurt,sp.ent,sfm,mode,meanfun,minfun,modindx,label
298,0.213732,0.057705,0.242573,0.141701,0.257984,0.116283,7.890927,0.859712,0.084934,0.248978,0.133667,0.028319,0.229051,male
2403,0.21219,0.04319,0.215153,0.188957,0.245644,0.056687,6.10979,0.877669,0.314398,0.188957,0.139942,0.047198,0.121344,female


In [8]:
from sklearn.preprocessing import StandardScaler

Normalize the Dataset

In [9]:
scalar = StandardScaler()

scalar.fit(df.drop('label', axis=1))

scaled_df = scalar.transform(df.drop('label', axis=1))

scaled_df = pd.DataFrame(scaled_df, columns=df.columns[:-1])

Shuffle the dataset randomly (because all males & female were grouped)

In [10]:
scaled_df['gender'] = df['label']

scaled_df = scaled_df.sample(frac=1)

scaled_df.index = np.arange(scaled_df.shape[0])

In [11]:
scaled_df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,kurt,sp.ent,sfm,mode,meanfun,minfun,modindx,gender
0,0.053833,0.182412,-0.043344,-0.211354,0.539997,0.538854,-0.240888,0.835319,-0.042416,0.207412,-0.837226,0.540973,-0.366495,male
1,-1.948336,1.299573,-2.059818,-1.855442,-0.519379,1.824196,0.163259,1.02512,1.077412,-2.14121,-1.247501,-1.0899,0.159773,male
2,0.646042,0.495259,0.871554,0.640148,1.145548,-0.095419,-0.201828,0.608758,0.670084,0.694611,0.64814,0.567828,-0.76124,female
3,0.676572,-0.985618,0.417501,0.876052,-0.107268,-1.056066,-0.218332,-0.515259,-0.533573,0.230766,0.775463,0.602869,-0.112419,female
4,0.945822,-0.909062,0.667029,1.041938,0.402815,-0.962975,-0.21986,-0.7156,-0.588908,0.517069,0.611636,0.652038,-0.295663,female


In [12]:
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3168 entries, 0 to 3167
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   kurt      3168 non-null   float64
 7   sp.ent    3168 non-null   float64
 8   sfm       3168 non-null   float64
 9   mode      3168 non-null   float64
 10  meanfun   3168 non-null   float64
 11  minfun    3168 non-null   float64
 12  modindx   3168 non-null   float64
 13  gender    3168 non-null   object 
dtypes: float64(13), object(1)
memory usage: 371.2+ KB


In [13]:
scaled_df.to_csv('../assests/scaled_voice_data.csv', index=False)