In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dados = pd.read_csv("gender_classification_v7.csv")
dados

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,Female
4997,1,11.9,5.4,0,0,0,0,Female
4998,1,12.9,5.7,0,0,0,0,Female
4999,1,13.2,6.2,0,0,0,0,Female


In [3]:
dados.isna().sum()

long_hair                    0
forehead_width_cm            0
forehead_height_cm           0
nose_wide                    0
nose_long                    0
lips_thin                    0
distance_nose_to_lip_long    0
gender                       0
dtype: int64

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV


In [5]:
pipeline = Pipeline([
    ("stan",StandardScaler())
])
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
X = dados.drop("gender",axis=1)
y = dados["gender"].copy()

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [6]:
x_train_prepared = pipeline.fit_transform(x_train)
y_train_prepared = (y_train == "Male")


In [7]:
from sklearn.metrics import recall_score,precision_score,roc_auc_score 
svc = SVC()
svc.fit(x_train_prepared,y_train_prepared)
x_train_predict = svc.predict(x_train_prepared)
recall_score(x_train_predict,y_train_prepared)

0.9897907095456866

In [8]:
precision_score(x_train_predict,y_train_prepared)

0.9690154922538731

In [9]:
roc_auc_score(x_train_predict,y_train_prepared)

0.9797067217498153

In [10]:
x_test_prepared = pipeline.fit_transform(x_test)
y_test_prepared = (y_test == "Male")

In [11]:
y_test_predict = svc.predict(x_test_prepared)
recall_score(y_test_predict,y_test_prepared)

0.979381443298969

In [12]:
precision_score(y_test_predict,y_test_prepared)

0.9519038076152304

In [13]:
roc_auc_score(y_test_predict,y_test_prepared)

0.9664349076959963

In [14]:
teste = y_test_predict[:10]

In [15]:
teste

array([ True, False, False,  True,  True,  True,  True,  True,  True,
       False])

In [16]:
teste2 = y_test_prepared[:10]

In [17]:
teste2

1501     True
2586    False
2653    False
1055     True
705      True
106     False
589      True
2468     True
2413     True
1600    False
Name: gender, dtype: bool

In [18]:
print("predito  real")
for x,y in zip(teste,teste2):
   
    print(x,f"    {y}")

predito  real
True     True
False     False
False     False
True     True
True     True
True     False
True     True
True     True
True     True
False     False


In [19]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
tree = DecisionTreeClassifier()
param_grid = {
    "max_leaf_nodes":[2,4,6,8,10,11,12,13,14,15,16,17,18,19,20,21,22],
    "max_depth":[1,2,3,4,5,6,7,8,9,10],
    "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10,11,12,13]
}

model_final = GridSearchCV(tree,param_grid,cv = 3,scoring="accuracy")
model_final.fit(x_train_prepared,y_train_prepared)
final = model_final.best_estimator_
svc = SVC()
random = RandomForestClassifier()
voting = VotingClassifier(estimators = [("tree",final),("random",random),("svc",svc)],voting="hard")



In [20]:
voting.fit(x_train_prepared,y_train_prepared)

In [24]:
x_test_prepared = pipeline.fit_transform(x_test)

y_test_pred = final.predict(x_test_prepared)

precision_score(y_test_prepared,y_test_pred)

0.975103734439834

In [25]:
recall_score(y_test_prepared,y_test_pred)

0.9418837675350702

In [26]:
roc_auc_score(y_test_prepared,y_test_pred)

0.9589896925324753