In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the feature-selected voice dataset (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../assests/feature_selected_voice_data.csv')

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['meanfreq', 'sd', 'Q75', 'skew', 'sfm', 'mode', 'meanfun', 'minfun',
       'maxfun', 'meandom', 'maxdom', 'modindx', 'label'],
      dtype='object')

In [4]:
# Print per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   Q75       3168 non-null   float64
 3   skew      3168 non-null   float64
 4   sfm       3168 non-null   float64
 5   mode      3168 non-null   float64
 6   meanfun   3168 non-null   float64
 7   minfun    3168 non-null   float64
 8   maxfun    3168 non-null   float64
 9   meandom   3168 non-null   float64
 10  maxdom    3168 non-null   float64
 11  modindx   3168 non-null   float64
 12  label     3168 non-null   object 
dtypes: float64(12), object(1)
memory usage: 321.9+ KB


##### Number of null values in each column

In [5]:
# Per-column count of missing entries.
df.isna().sum()

meanfreq    0
sd          0
Q75         0
skew        0
sfm         0
mode        0
meanfun     0
minfun      0
maxfun      0
meandom     0
maxdom      0
modindx     0
label       0
dtype: int64

In [6]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meanfreq,3168.0,0.180907,0.029918,0.039363,0.163662,0.184838,0.199146,0.251124
sd,3168.0,0.057126,0.016652,0.018363,0.041954,0.059155,0.06702,0.115273
Q75,3168.0,0.224765,0.023639,0.042946,0.208747,0.225684,0.24366,0.273469
skew,3168.0,3.140168,4.240529,0.141735,1.649569,2.197101,2.931694,34.725453
sfm,3168.0,0.408216,0.177521,0.036876,0.258041,0.396335,0.533676,0.842936
mode,3168.0,0.165282,0.077203,0.0,0.118016,0.186599,0.221104,0.28
meanfun,3168.0,0.142807,0.032304,0.055565,0.116998,0.140519,0.169581,0.237636
minfun,3168.0,0.036802,0.01922,0.009775,0.018223,0.04611,0.047904,0.204082
maxfun,3168.0,0.258842,0.030077,0.103093,0.253968,0.271186,0.277457,0.279114
meandom,3168.0,0.829211,0.525205,0.007812,0.419828,0.765795,1.177166,2.957682


##### Duplicate values in the data

Only two duplicate rows are present in the dataset.

Since there are so few duplicates, there is no need to remove them.

In [7]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged).
df.loc[df.duplicated()]

Unnamed: 0,meanfreq,sd,Q75,skew,sfm,mode,meanfun,minfun,maxfun,meandom,maxdom,modindx,label
298,0.213732,0.057705,0.257984,2.113598,0.084934,0.248978,0.133667,0.028319,0.253968,0.818125,4.164062,0.229051,male
2403,0.21219,0.04319,0.245644,1.862573,0.314398,0.188957,0.139942,0.047198,0.27907,1.925551,15.609375,0.121344,female


In [8]:
from sklearn.preprocessing import StandardScaler

Standardize the dataset (rescale each feature to zero mean and unit variance)

In [9]:
# Standardize every feature column with StandardScaler.
# The target column is split off once so the scaler only ever sees the features.
features = df.drop(columns='label')

scalar = StandardScaler()

# NOTE(review): the scaler is fit on the full dataset. If a train/test split
# happens downstream, consider fitting on the training portion only to avoid
# leaking test-set statistics -- confirm against the modelling notebook.
#
# fit_transform learns the per-column statistics and applies them in one pass.
# Column labels and the row index are taken from `features` itself, so the
# result stays correctly labelled even if 'label' is not the last column and
# stays index-aligned with `df`.
scaled_df = pd.DataFrame(
    scalar.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

Shuffle the dataset randomly (because all male and all female samples were grouped together)

In [10]:
# Fixed seed so the shuffle (and therefore the saved CSV) is reproducible
# across Restart Kernel -> Run All.
SHUFFLE_SEED = 42

# Attach the target column under the name 'gender'. The assignment aligns on
# the row index, so it is only valid while scaled_df still has the original
# row order. The guard keeps a re-run of this cell from pairing shuffled
# features with labels in their original order.
if 'gender' not in scaled_df.columns:
    scaled_df['gender'] = df['label']

# Shuffle all rows, then renumber them 0..n-1 and discard the old index.
scaled_df = (
    scaled_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [11]:
# Preview the first five rows of the shuffled, scaled frame.
scaled_df.head(n=5)

Unnamed: 0,meanfreq,sd,Q75,skew,sfm,mode,meanfun,minfun,maxfun,meandom,maxdom,modindx,gender
0,-0.85153,0.28875,-0.702135,0.045661,0.658078,-0.808182,-1.0703,-1.049225,-1.217722,0.428449,-0.310231,1.490041,male
1,2.084694,-1.169544,1.722539,-0.050675,-1.279251,1.180453,1.367092,0.739336,0.618983,1.463027,1.309144,-0.48152,female
2,-0.517073,0.054301,-1.078278,-0.160767,0.305147,0.249806,-1.231903,0.484349,-0.666264,0.564103,0.108621,1.352695,male
3,0.381459,0.621918,1.226999,-0.2211,0.196662,1.146302,-0.213571,0.697687,0.672624,0.498357,0.636762,-0.582489,male
4,1.109734,-1.411762,0.757406,-0.40996,-1.490037,1.033517,0.629344,0.540973,0.565959,0.660845,1.14937,-0.812012,female


In [12]:
# Print per-column non-null counts and dtypes of the scaled frame.
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3168 entries, 0 to 3167
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   Q75       3168 non-null   float64
 3   skew      3168 non-null   float64
 4   sfm       3168 non-null   float64
 5   mode      3168 non-null   float64
 6   meanfun   3168 non-null   float64
 7   minfun    3168 non-null   float64
 8   maxfun    3168 non-null   float64
 9   meandom   3168 non-null   float64
 10  maxdom    3168 non-null   float64
 11  modindx   3168 non-null   float64
 12  gender    3168 non-null   object 
dtypes: float64(12), object(1)
memory usage: 346.5+ KB


In [14]:
# Write the scaled, shuffled data to disk; the row index is omitted from the file.
scaled_df.to_csv(path_or_buf='../assests/scaled_voice_data.csv', index=False)