In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the raw voice measurements from the CSV at the relative path below.
voice_csv_path = '../assests/voice.csv'
df = pd.read_csv(voice_csv_path)

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
       'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun',
       'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx', 'label'],
      dtype='object')

In [5]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   object

##### Number of null values in each column

In [6]:
# Flag missing cells, then total the flags per column.
missing_mask = df.isna()
missing_mask.sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64

In [7]:
# Summary statistics of the numeric columns, flipped so each feature is a row.
summary_stats = df.describe()
summary_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meanfreq,3168.0,0.180907,0.029918,0.039363,0.163662,0.184838,0.199146,0.251124
sd,3168.0,0.057126,0.016652,0.018363,0.041954,0.059155,0.06702,0.115273
median,3168.0,0.185621,0.03636,0.010975,0.169593,0.190032,0.210618,0.261224
Q25,3168.0,0.140456,0.04868,0.000229,0.111087,0.140286,0.175939,0.247347
Q75,3168.0,0.224765,0.023639,0.042946,0.208747,0.225684,0.24366,0.273469
IQR,3168.0,0.084309,0.042783,0.014558,0.04256,0.09428,0.114175,0.252225
skew,3168.0,3.140168,4.240529,0.141735,1.649569,2.197101,2.931694,34.725453
kurt,3168.0,36.568461,134.928661,2.068455,5.669547,8.318463,13.648905,1309.612887
sp.ent,3168.0,0.895127,0.04498,0.738651,0.861811,0.901767,0.928713,0.981997
sfm,3168.0,0.408216,0.177521,0.036876,0.258041,0.396335,0.533676,0.842936


##### Duplicate values in the data

Only two duplicated rows in the dataset

With so few duplicates, there is no need to remove them

In [8]:
# duplicated() marks each row that exactly repeats an earlier row
# (the first occurrence itself is not marked); show only the marked rows.
duplicate_mask = df.duplicated()
df[duplicate_mask]

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
298,0.213732,0.057705,0.242573,0.141701,0.257984,0.116283,2.113598,7.890927,0.859712,0.084934,...,0.213732,0.133667,0.028319,0.253968,0.818125,0.132812,4.164062,4.03125,0.229051,male
2403,0.21219,0.04319,0.215153,0.188957,0.245644,0.056687,1.862573,6.10979,0.877669,0.314398,...,0.21219,0.139942,0.047198,0.27907,1.925551,0.023438,15.609375,15.585938,0.121344,female


In [9]:
from sklearn.preprocessing import StandardScaler

Standardize the dataset (scale each feature to zero mean and unit variance)

In [10]:
# Feature matrix: every column except the string target 'label'.
features = df.drop('label', axis=1)

# NOTE(review): the scaler is fit on all rows. If a train/test split is made
# downstream, statistics of the test rows leak into training — consider fitting
# on the training split only. Confirm against the modelling notebook.
scalar = StandardScaler()

# Learn each column's mean and standard deviation and apply the scaling in one pass.
scaled_values = scalar.fit_transform(features)

# Wrap the resulting array back into a DataFrame. Column names and row index are
# taken from the frame that was actually scaled, rather than from a positional
# slice of df.columns (only correct while 'label' is the last column), so that
# later index-aligned assignments from df line up row for row.
scaled_df = pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

Shuffle the dataset randomly (because the male and female samples are grouped together)

In [11]:
# Re-attach the target column under the name 'gender'. pandas aligns this
# assignment on the row index, so it must happen before the rows are shuffled
# and renumbered.
scaled_df['gender'] = df['label']

# Shuffle all rows (frac=1). A fixed random_state makes the row order, and
# therefore the exported CSV, identical on every Restart & Run All.
scaled_df = scaled_df.sample(frac=1, random_state=42)

# Drop the shuffled original index and renumber the rows 0..n-1.
scaled_df = scaled_df.reset_index(drop=True)

In [12]:
# Preview the first rows of the scaled, shuffled frame.
scaled_df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,gender
0,0.073836,0.591989,0.154365,-0.232308,0.651033,0.624047,0.100932,-0.008774,1.005184,0.919347,...,0.073836,-1.238665,-0.830959,0.565959,0.793202,2.377614,0.481427,0.438824,-0.288187,male
1,1.154676,-0.955422,0.786977,1.120652,1.067673,-0.685178,-0.225935,-0.197671,-0.471031,-0.801835,...,1.154676,0.286996,0.550671,0.672624,-0.084909,-0.461523,1.142713,1.151375,-0.861679,female
2,1.147551,-0.249945,1.152662,0.669617,1.659033,0.154771,-0.291758,-0.214387,0.049465,-0.37417,...,1.147551,0.689055,0.567828,0.618983,1.027454,-0.461523,1.056169,1.064803,-0.380272,female
3,0.985811,-0.627594,0.70234,1.136456,0.774764,-0.865004,-0.269119,-0.219037,-0.291,-0.3585,...,0.985811,0.454646,0.53856,0.672624,1.278238,2.130732,0.243985,0.205746,-0.247591,female
4,-0.462124,0.438431,-0.107705,-0.675807,-0.170199,0.67491,-0.299355,-0.208935,0.883454,0.635008,...,-0.462124,-1.243998,-0.830959,-0.29403,-0.364441,2.624495,0.044268,-0.002914,-1.271045,male


In [13]:
# Re-check dtypes and non-null counts after scaling and shuffling.
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  gender    3168 non-null   object

In [14]:
# Write the scaled, shuffled frame to CSV; index=False keeps the row index out of the file.
scaled_csv_path = '../assests/scaled_voice_data.csv'
scaled_df.to_csv(scaled_csv_path, index=False)