In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn import metrics
from sklearn.model_selection import KFold

In [16]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# would also hide any solver or metric warnings raised by later cells —
# confirm that is intended.
warnings.filterwarnings('ignore')

In [17]:
# Load the voice dataset from CSV. The file name suggests the features were
# scaled beforehand — TODO confirm where and on which rows the scaler was fitted.
df = pd.read_csv('../assests/scaled_voice_data.csv')

In [18]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,gender
0,0.073836,0.591989,0.154365,-0.232308,0.651033,0.624047,0.100932,-0.008774,1.005184,0.919347,...,0.073836,-1.238665,-0.830959,0.565959,0.793202,2.377614,0.481427,0.438824,-0.288187,male
1,1.154676,-0.955422,0.786977,1.120652,1.067673,-0.685178,-0.225935,-0.197671,-0.471031,-0.801835,...,1.154676,0.286996,0.550671,0.672624,-0.084909,-0.461523,1.142713,1.151375,-0.861679,female
2,1.147551,-0.249945,1.152662,0.669617,1.659033,0.154771,-0.291758,-0.214387,0.049465,-0.37417,...,1.147551,0.689055,0.567828,0.618983,1.027454,-0.461523,1.056169,1.064803,-0.380272,female
3,0.985811,-0.627594,0.70234,1.136456,0.774764,-0.865004,-0.269119,-0.219037,-0.291,-0.3585,...,0.985811,0.454646,0.53856,0.672624,1.278238,2.130732,0.243985,0.205746,-0.247591,female
4,-0.462124,0.438431,-0.107705,-0.675807,-0.170199,0.67491,-0.299355,-0.208935,0.883454,0.635008,...,-0.462124,-1.243998,-0.830959,-0.29403,-0.364441,2.624495,0.044268,-0.002914,-1.271045,male


In [19]:
# Separate the label to predict from the model inputs:
# `y` is the 'gender' column, `x` is every remaining column.
y = df['gender']
x = df.drop(columns='gender')

In [20]:
# Preview the feature matrix (the 'gender' column has been removed).
x.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
0,0.073836,0.591989,0.154365,-0.232308,0.651033,0.624047,0.100932,-0.008774,1.005184,0.919347,-1.49363,0.073836,-1.238665,-0.830959,0.565959,0.793202,2.377614,0.481427,0.438824,-0.288187
1,1.154676,-0.955422,0.786977,1.120652,1.067673,-0.685178,-0.225935,-0.197671,-0.471031,-0.801835,0.389979,1.154676,0.286996,0.550671,0.672624,-0.084909,-0.461523,1.142713,1.151375,-0.861679
2,1.147551,-0.249945,1.152662,0.669617,1.659033,0.154771,-0.291758,-0.214387,0.049465,-0.37417,1.423891,1.147551,0.689055,0.567828,0.618983,1.027454,-0.461523,1.056169,1.064803,-0.380272
3,0.985811,-0.627594,0.70234,1.136456,0.774764,-0.865004,-0.269119,-0.219037,-0.291,-0.3585,0.444788,0.985811,0.454646,0.53856,0.672624,1.278238,2.130732,0.243985,0.205746,-0.247591
4,-0.462124,0.438431,-0.107705,-0.675807,-0.170199,0.67491,-0.299355,-0.208935,0.883454,0.635008,-0.946032,-0.462124,-1.243998,-0.830959,-0.29403,-0.364441,2.624495,0.044268,-0.002914,-1.271045


In [21]:
# Preview the target labels.
y.head()

0      male
1    female
2    female
3    female
4      male
Name: gender, dtype: object

### Perform K-Fold Cross-Validation for k = 10

Logistic regression model

In [63]:
def perform_logistic_regression(x_train, y_train, x_test, y_test, i, k, display_stats=False):
    """Train a logistic-regression classifier on one fold and score it.

    An L2-penalised ``LogisticRegression`` (at most 500 solver iterations) is
    fitted on the training split and evaluated on the test split.

    Parameters
    ----------
    x_train, y_train : training features and labels for this fold.
    x_test, y_test : held-out features and labels for this fold.
    i : fold number; only used in the printed header.
    k : total number of folds; only used in the printed header.
    display_stats : when true, print the four scores for this fold.

    Returns
    -------
    numpy.ndarray
        ``[accuracy, precision, recall, f1]`` for the test split, where
        precision, recall and F1 are computed with ``average='weighted'``.
    """
    classifier = LogisticRegression(penalty='l2', max_iter=500).fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    # Score the held-out predictions; order matches the returned array.
    acc, prec, rec, f1 = (
        metrics.accuracy_score(y_test, predicted),
        metrics.precision_score(y_test, predicted, average='weighted'),
        metrics.recall_score(y_test, predicted, average='weighted'),
        metrics.f1_score(y_test, predicted, average='weighted'),
    )

    if display_stats:
        print(f"Statistics for Fold {i} in K-fold (k={k}) 'Logistic Regression' algorithm")
        print(f"Accuracy: {acc}")
        print(f"Precision: {prec}")
        print(f"Recall: {rec}")
        print(f"F1-score: {f1}")
        print()

    return np.array([acc, prec, rec, f1])

Perform K-fold cross-validation

In [64]:
k = 10
# NOTE(review): KFold is created without shuffling, so each fold is a
# consecutive slice of rows in file order — confirm the CSV is not sorted by
# class, otherwise folds would be unbalanced.
k_fold = KFold(n_splits=k)

# Running sum of {accuracy, precision, recall, f1_score} over all folds;
# divided by k after the loop to obtain the mean.
mean_stats = np.zeros(4)

# KFold.split yields *positional* row numbers, so rows must be selected with
# .iloc. Using .loc would treat those numbers as index labels, which is only
# correct when the frame happens to carry a default 0..n-1 index.
# Folds are numbered from 1 for display purposes.
for i, (train_index, test_index) in enumerate(k_fold.split(x), start=1):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    stats = perform_logistic_regression(x_train, y_train, x_test, y_test, i, k, display_stats=False)
    mean_stats = mean_stats + stats

# Take mean of {accuracy, precision, recall, f1_score}
mean_stats = mean_stats / k

print(f"Mean accuracy: {mean_stats[0]}")
print(f"Mean Precision: {mean_stats[1]}")
print(f"Mean Recall: {mean_stats[2]}")
print(f"MeanF1-score: {mean_stats[3]}")

Mean accuracy: 0.972855688216268
Mean Precision: 0.97295340803819
Mean Recall: 0.972855688216268
MeanF1-score: 0.972859897665046
