In [1]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

In [2]:
test_df = pd.read_csv("data/test_new.csv")
# Drop the null columns where all values are null
# df = df.dropna(axis='columns', how='all')
# Drop the null rows
test_df = test_df.dropna()
test_df = test_df.drop(columns = ['Date of Joining'])

test_df.count()

Gender                  12250
Company Type            12250
WFH Setup Available     12250
Designation             12250
Resource Allocation     12250
Mental Fatigue Score    12250
dtype: int64

In [3]:
test_df.head()

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score
0,1,0,0,2.0,5.0,7.7
1,1,1,1,1.0,2.0,5.2
2,0,1,1,1.0,3.0,5.9
3,1,0,0,3.0,6.0,4.6
4,1,1,0,2.0,5.0,6.4


In [4]:
selected_features = test_df.drop(columns = ['Gender'])

selected_features

Unnamed: 0,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score
0,0,0,2.0,5.0,7.7
1,1,1,1.0,2.0,5.2
2,1,1,1.0,3.0,5.9
3,0,0,3.0,6.0,4.6
4,1,0,2.0,5.0,6.4
...,...,...,...,...,...
12245,0,1,1.0,2.0,6.1
12246,1,1,2.0,4.0,5.9
12247,0,0,4.0,7.0,9.6
12248,0,0,3.0,6.0,6.7


In [5]:
#split train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_features, test_df['Gender'], random_state=1)

In [6]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [7]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.575


In [9]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                           target_names = ['male', 'female']))

              precision    recall  f1-score   support

        male       0.57      0.46      0.51      1459
      female       0.58      0.68      0.62      1604

    accuracy                           0.57      3063
   macro avg       0.57      0.57      0.57      3063
weighted avg       0.57      0.57      0.57      3063



In [10]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [11]:
# Fit the model using the grid search estimator. 

grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.587, total=  16.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.0s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.590, total=  16.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   33.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.584, total=  12.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.584, total=  23.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.591, total=   8.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.587, total=  15.1s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.590, total=  14.4s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.584, total=  12.6s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.584, total=  12.4s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 14.7min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

In [12]:
# List the best parameters for this dataset
print(grid.best_params_)

{'C': 1, 'gamma': 0.0001}


In [13]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [14]:
print('Test Acc: %.3f' % grid.score(X_test, y_test))

Test Acc: 0.575


In [15]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                           target_names = ['male', 'female']))

              precision    recall  f1-score   support

        male       0.57      0.46      0.51      1459
      female       0.58      0.68      0.62      1604

    accuracy                           0.57      3063
   macro avg       0.57      0.57      0.57      3063
weighted avg       0.57      0.57      0.57      3063

