In [1]:
import pandas as pd
import numpy as np

# Build a synthetic "gender classification" dataset and save it as CSV.
#
# NOTE: every column -- including 'Target' -- is drawn independently at random,
# so none of the features actually carries information about the label. The
# per-column remarks below describe the intent behind the chosen distributions,
# not a relationship that exists in the generated data.
SEED = 42          # fixed seed so a fresh "Restart & Run All" regenerates the same CSV
N_SAMPLES = 200    # number of rows in the dataset
rng = np.random.default_rng(SEED)

data = {
    'Height_cm': rng.normal(170, 10, N_SAMPLES),  # mean 170 cm, std 10
    'Weight_kg': rng.normal(70, 15, N_SAMPLES),  # mean 70 kg, std 15
    # np.nan is one of the six equally likely choices, so roughly 1/6 of the
    # values are already missing before the extra 5% is blanked out below.
    'Hair_Length_cm': rng.choice([5, 10, 20, 40, 50, np.nan], N_SAMPLES),
    'Voice_Pitch_Hz': rng.normal(160, 30, N_SAMPLES),  # single distribution for all rows
    'Strength_Score': rng.integers(1, 10, N_SAMPLES),  # arbitrary score, 1..9 (upper bound exclusive)
    'Favorite_Color': rng.choice(['Blue', 'Pink', 'Black', 'Red', 'Green'], N_SAMPLES),
    'Shopping_Frequency': rng.integers(1, 20, N_SAMPLES),  # shopping trips per month, 1..19
    'Target': rng.choice(['Male', 'Female'], N_SAMPLES)  # classification target
}

# Creating DataFrame
df = pd.DataFrame(data)

# Blank out Hair_Length_cm on a random 5% of the rows (these rows may overlap
# with rows that already drew NaN above).
df.loc[df.sample(frac=0.05, random_state=SEED).index, 'Hair_Length_cm'] = np.nan

# Save dataset
df.to_csv('gender_classification_dataset.csv', index=False)
print("Manually created gender classification dataset with realistic traits.")


Manually created gender classification dataset with realistic traits.


In [2]:
import matplotlib.pyplot as plt

In [3]:
# Load the dataset back from the CSV file and display it.
csv_path = 'gender_classification_dataset.csv'
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,Height_cm,Weight_kg,Hair_Length_cm,Voice_Pitch_Hz,Strength_Score,Favorite_Color,Shopping_Frequency,Target
0,163.343936,73.759010,40.0,159.118795,6,Red,11,Male
1,174.422236,73.474158,5.0,164.474443,4,Pink,2,Female
2,162.507675,49.564809,5.0,132.132629,5,Green,10,Female
3,167.364915,52.072813,10.0,186.507616,5,Red,4,Male
4,166.177937,80.659561,20.0,167.786582,6,Pink,6,Male
...,...,...,...,...,...,...,...,...
195,163.233744,73.204842,,161.530426,6,Black,6,Female
196,171.787482,59.781836,40.0,162.615282,6,Blue,2,Male
197,150.618536,68.980083,5.0,209.706646,8,Red,4,Male
198,173.706177,86.280613,20.0,151.332958,8,Pink,6,Male


In [4]:
# Count the missing values in each column.
dataset.isna().sum()

Height_cm              0
Weight_kg              0
Hair_Length_cm        34
Voice_Pitch_Hz         0
Strength_Score         0
Favorite_Color         0
Shopping_Frequency     0
Target                 0
dtype: int64

In [9]:
## Handling the missing values: drop every row that contains at least one NaN
dataset_cleaned = dataset.dropna(axis=0, how='any')

In [11]:
# Confirm that no missing values remain after the drop.
dataset_cleaned.isna().sum()

Height_cm             0
Weight_kg             0
Hair_Length_cm        0
Voice_Pitch_Hz        0
Strength_Score        0
Favorite_Color        0
Shopping_Frequency    0
Target                0
dtype: int64

In [12]:
# Features are every column except the last; the target is the last column.
x = dataset_cleaned.iloc[:, :-1].to_numpy()
y = dataset_cleaned.iloc[:, -1].to_numpy()

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode the categorical 'Favorite_Color' column (position 5 among the
# feature columns) and pass the remaining columns through unchanged.
# ColumnTransformer places the encoded columns first, followed by the
# passthrough columns.
#
# The input is taken directly from `dataset_cleaned` instead of from `x`, so
# this cell is idempotent: re-running it never feeds an already-encoded matrix
# (where column 5 is no longer the colour column) back into the encoder.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [5])], remainder='passthrough')
x = np.array(ct.fit_transform(dataset_cleaned.iloc[:, :-1].values))

In [16]:
from sklearn.preprocessing import LabelEncoder 
# Encode the string labels as integers. LabelEncoder numbers the classes in
# sorted order, so with the labels used here 'Female' -> 0 and 'Male' -> 1.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [18]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: learn mean/std from the training split only, then
# apply that same transformation to both the training and the test split.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [19]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression model with default hyper-parameters on the scaled
# training data. NOTE: despite its name, `regressor` is a classifier.
regressor = LogisticRegression()
regressor.fit(X=x_train, y=y_train)

In [20]:
# Predict the encoded class label for every row of the held-out test split.
y_pred = regressor.predict(X=x_test)

In [21]:
# Print the predicted class codes for the test split.
print(str(y_pred))

[1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0]


In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
accuracy = accu