In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# NOTE(review): this path points at the filesystem root; on Kaggle the data
# presumably lives somewhere under /kaggle/input — confirm before running.
VOICE_CSV = '/voice.csv'
df = pd.read_csv(VOICE_CSV)
df.head()


In [None]:
df.shape

In [None]:
df.isna().sum()

## Pie chart


In [None]:
df['label'].value_counts()

In [None]:
# Read the slice sizes and their names from the data instead of hard-coding
# them, so counts and labels can never drift apart or pair up in the wrong order.
label_counts = df['label'].value_counts()
val = label_counts.tolist()
label = label_counts.index.tolist()
plt.figure(figsize=(6,8))
plt.pie(val,labels=label)
plt.title('Class balance')
plt.legend()
plt.show()


## Exploratory Data Analysis

In [None]:
# Correlate only the numeric columns: `label` holds strings, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions dropped it silently).
corr = df.select_dtypes(include='number').corr()
sns.set(font_scale=1.5)
plt.figure(figsize=(20,20))
sns.heatmap(corr,annot = True,cmap='coolwarm')
plt.show()

## We will do EDA on similar columns with correlation higher than 0.85 and remove the columns that will not give us important patterns.

In [None]:
# Correlate only the numeric columns: `label` holds strings, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions dropped it silently).
corr = df.select_dtypes(include='number').corr()
# Keep only coefficients above 0.85; every other cell becomes NaN.
corr = corr[corr>0.85]
plt.figure(figsize=(15,15))
sns.heatmap(corr,annot = True,cmap='coolwarm')
plt.show()


In [None]:
# meanfreq and centroid are removed because they were the columns correlated
# with the largest number of other features.
redundant_cols = ['meanfreq', 'centroid']
df = df.drop(columns=redundant_cols)

In [None]:
# One box plot per feature, split by class label, each shown as its own figure.
for feature in ('maxdom', 'dfrange'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()


In [None]:
# NOTE(review): no reason is recorded for dropping these two columns —
# presumably it follows from the box plots in the preceding cell; confirm.
df = df.drop(columns=['dfrange', 'maxdom'])

In [None]:
# One box plot per feature, split by class label, each shown as its own figure.
for feature in ('skew', 'kurt'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# Remove both shape statistics from the feature set.
df = df.drop(columns=['kurt', 'skew'])

### We removed both `kurt` and `skew` because they had too many outliers, which is bad for training.


In [None]:
# One box plot per feature, split by class label, each shown as its own figure.
for feature in ('sd', 'IQR'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# NOTE(review): no reason is recorded for keeping `sd` and dropping `IQR` —
# presumably it follows from the box plots in the preceding cell; confirm.
df = df.drop(columns=['IQR'])

In [None]:
# One box plot per feature, split by class label, each shown as its own figure.
for feature in ('sfm', 'sp.ent'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# sp.ent is removed because it shows more outliers than sfm.
df = df.drop(columns=['sp.ent'])

In [None]:
# Pairwise relationships between the remaining features, coloured by class.
# `kind` must be one of seaborn's documented values; 'scatterplot' is not one
# of them, the scatter option is spelled 'scatter'.
sns.pairplot(df,kind = 'scatter',hue='label')
plt.show()

In [None]:
# These three columns are removed because they have too many outliers.
outlier_heavy_cols = ['maxfun', 'modindx', 'minfun']
df = df.drop(columns=outlier_heavy_cols)

In [None]:
# Re-check the remaining features for strong correlations.
# Correlate only the numeric columns: `label` holds strings, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions dropped it silently).
corr = df.select_dtypes(include='number').corr()
# Keep only coefficients above 0.85; every other cell becomes NaN.
corr = corr[corr>0.85]
plt.figure(figsize=(5,5))
sns.heatmap(corr,annot = True,cmap='coolwarm')
plt.show()


## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Select the target by name rather than by position, so the feature matrix
# stays correct even if `label` is not the last column of the frame.
X = df.drop(columns='label')
y = df['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Metric Functions

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
def eval(y_pred,ytest):
    """Show a confusion-matrix heatmap and print the classification report.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by a model.
    ytest : array-like
        True labels for the same samples.

    Notes
    -----
    The name shadows Python's built-in ``eval``; it is kept because other
    cells call this function by that name.
    """
    print("Confusion matrix:\n")
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round transposes the matrix and swaps precision with recall in the report.
    cm = confusion_matrix(ytest,y_pred)
    # NOTE(review): the tick labels assume the sorted classes are female, male — confirm.
    # fmt="d" prints the counts as plain integers instead of scientific notation.
    ax = sns.heatmap(cm,annot = True,fmt="d",xticklabels=["Female","Male"],yticklabels=["Female","Male"])
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()
    print("Classification Report\n",classification_report(ytest,y_pred))

In [None]:
def score(model):
    """Print ``model.score`` on the training split and then on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``x_test`` and ``y_test``.
    """
    train_score = model.score(X_train, y_train)
    print("Training score: ", train_score)
    test_score = model.score(x_test, y_test)
    print("Test score: ", test_score)

## 1) DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-limited decision tree; the fixed seed keeps the fitted tree repeatable.
DTmodel = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    random_state=0,
)

In [None]:
DTmodel.fit(X_train,y_train)

In [None]:
ypred1 = DTmodel.predict(x_test)
ypred1[:5]

In [None]:
score(DTmodel)

In [None]:
eval(ypred1,y_test)

### Conclusion: Slightly overfit model.

## 2) LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# `class_weight` accepts a dict, 'balanced' or None. The float 0.001 passed
# before is not a valid value, so the argument is dropped and the default
# (None, all classes weighted equally) applies.
# NOTE(review): if 0.001 was meant as regularisation strength, set `C` instead — confirm intent.
LRmodel = LogisticRegression(n_jobs=3,max_iter=1000,random_state=0)

In [None]:
LRmodel.fit(X_train,y_train)

In [None]:
ypred2 = LRmodel.predict(x_test)

In [None]:
score(LRmodel)

In [None]:
eval(ypred2,y_test)

### Conclusion: Slightly inaccurate and underfit model, with lower training and test scores overall.

## 3) Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF-kernel support vector classifier.
# NOTE(review): `degree` is documented as a polynomial-kernel setting, so it
# presumably has no effect with kernel='rbf' — confirm before removing it.
SVMmodel = SVC(
    C=2.0,
    kernel='rbf',
    degree=3,
    random_state=0,
)

In [None]:
SVMmodel.fit(X_train,y_train)

In [None]:
ypred3 = SVMmodel.predict(x_test)

In [None]:
score(SVMmodel)

In [None]:
eval(ypred3,y_test)

### Conclusion: Properly fit, as the training and testing scores are approximately the same.

## 4) K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 4-nearest-neighbours classifier using the Minkowski metric with p=1
# (Manhattan / L1 distance), searched with a ball tree.
Kmodel = KNeighborsClassifier(
    n_neighbors=4,
    algorithm='ball_tree',
    metric='minkowski',
    p=1,
    n_jobs=5,
)

In [None]:
Kmodel.fit(X_train,y_train)

In [None]:
ypred4 = Kmodel.predict(x_test)

In [None]:
score(Kmodel)

In [None]:
eval(ypred4,y_test)

### Conclusion : Good training and testing accuracy

## 5) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A fixed seed makes the forest (and the scores reported below) reproducible,
# in line with the other models in this notebook.
# `warm_start=True` was removed: with it, re-running the fit cell reuses the
# already-built trees instead of training the forest again.
RFmodel = RandomForestClassifier(
    n_estimators=1000,
    max_depth=11,
    min_samples_split=4,
    criterion='gini',
    oob_score=True,
    n_jobs=5,
    random_state=0,
)

In [None]:
RFmodel.fit(X_train,y_train)

In [None]:
ypred5 = RFmodel.predict(x_test)

In [None]:
score(RFmodel)

In [None]:
eval(ypred5,y_test)

### Conclusion
### The Random Forest classifier proved to be the best classifier, with only 13-15 misclassifications in the test set.