In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Path to the gender-classification CSV; adjust it to wherever the file lives
# in your environment.
file_path = '/kaggle/input/gender-classification-dataset/gender_classification_v7.csv'
# Read the whole CSV into a DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Count the missing (NaN/None) entries in every column. The Series is left as
# the cell's last expression so the notebook actually renders the counts;
# previously the result was computed but never shown.
missing_values = data.isnull().sum()
missing_values

In [None]:
# Encode the gender labels as integer class ids. The encoded values overwrite
# the 'gender' column of `data` in place, and the fitted encoder is kept so
# predictions can be mapped back to the original labels later.
# NOTE: because the column is overwritten, re-running this cell re-fits the
# encoder on the already-encoded integers rather than on the original labels.
label_encoder = LabelEncoder()
label_encoder.fit(data['gender'])
data['gender'] = label_encoder.transform(data['gender'])

# Target vector and feature matrix (every column except the target).
y = data['gender']
X = data.drop(columns=['gender'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Mean of every feature per encoded gender class, with the integer class ids
# relabelled for display and the frame transposed so that features run along
# the x-axis and each gender becomes one bar series.
# NOTE(review): the {0: 'Female', 1: 'Male'} mapping assumes the encoder
# assigned ids in that order -- confirm against label_encoder.classes_.
feature_means = (
    data.groupby('gender')
    .mean()
    .rename(index={0: 'Female', 1: 'Male'})
    .T
)

# Create the figure/axes once and hand the axes to pandas. Calling
# plt.figure() and then DataFrame.plot() without `ax=` makes pandas open a
# second figure, which leaves an empty figure in the cell output.
fig, ax = plt.subplots(figsize=(12, 8))
feature_means.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
ax.set_title('Average Feature Values by Gender', fontsize=16)
ax.set_xlabel('Features', fontsize=14)
ax.set_ylabel('Average Value', fontsize=14)
ax.tick_params(axis='x', labelrotation=45, labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title='Gender', title_fontsize=13, fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Train the Random Forest model.
# The forest is built from random bootstrap samples and random feature
# subsets, so it is seeded (same seed as the train/test split) to make the
# feature importances and metrics below reproducible on Restart & Run All.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

In [None]:
# Pair each feature name with the importance the fitted forest assigned to it
# and order the table from most to least important.
features = X.columns
feature_importances = random_forest_model.feature_importances_
importance_df = (
    pd.DataFrame({'Features': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the feature importances, most important first.
fig, ax = plt.subplots(figsize=(10, 6))
# `palette` without `hue` is deprecated in recent seaborn releases, so the
# categorical variable is also assigned to `hue`. dodge=False keeps one
# full-width bar per feature instead of reserving a slot per hue level.
sns.barplot(
    data=importance_df,
    x='Importance',
    y='Features',
    hue='Features',
    palette='viridis',
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the y-axis labels; drop it if seaborn drew one.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Feature Importances in Predicting Gender', fontsize=16)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Features', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
plt.show()

In [None]:
# Initialize the logistic regression model.
# It is constructed with no arguments, i.e. with the library's default
# hyperparameters, and is fitted on the training split in a later cell.
logistic_model = LogisticRegression()

In [None]:
# Decision tree with default hyperparameters. A fixed seed (matching the
# train/test split) is supplied so the fitted tree, and therefore its reported
# metrics, are reproducible between runs.
decision_tree_model = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the logistic regression on the training split.
logistic_model.fit(X=X_train, y=y_train)

In [None]:
# Fit the decision tree on the training split.
decision_tree_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out test rows with each of the three fitted models.
logistic_pred, decision_tree_pred, random_forest_pred = (
    fitted_model.predict(X_test)
    for fitted_model in (logistic_model, decision_tree_model, random_forest_model)
)

In [None]:
# Test-set accuracy of each model, in the order logistic / tree / forest.
logistic_accuracy, decision_tree_accuracy, random_forest_accuracy = (
    accuracy_score(y_test, predictions)
    for predictions in (logistic_pred, decision_tree_pred, random_forest_pred)
)

# Report the accuracies as percentages, one model per line.
print(
    "\n".join(
        f"{model_name} Accuracy: {accuracy:.2%}"
        for model_name, accuracy in (
            ("Logistic Regression", logistic_accuracy),
            ("Decision Tree", decision_tree_accuracy),
            ("Random Forest", random_forest_accuracy),
        )
    )
)

In [None]:
# Precision, recall and F1 on the test set for each model. Every triple is
# produced by applying the three metric functions, in that order, to the
# model's predictions.
logistic_precision, logistic_recall, logistic_f1 = (
    metric(y_test, logistic_pred)
    for metric in (precision_score, recall_score, f1_score)
)

decision_tree_precision, decision_tree_recall, decision_tree_f1 = (
    metric(y_test, decision_tree_pred)
    for metric in (precision_score, recall_score, f1_score)
)

random_forest_precision, random_forest_recall, random_forest_f1 = (
    metric(y_test, random_forest_pred)
    for metric in (precision_score, recall_score, f1_score)
)

# Display all nine scores as the cell output: one row per model, with
# precision / recall / F1 in each row.
(
    logistic_precision, logistic_recall, logistic_f1,
    decision_tree_precision, decision_tree_recall, decision_tree_f1,
    random_forest_precision, random_forest_recall, random_forest_f1,
)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrices for each model.
# The class ids are passed explicitly so every matrix has one row/column per
# class known to the encoder, in encoder order, which keeps the matrices
# aligned with the class-name tick labels used below.
class_ids = list(range(len(label_encoder.classes_)))
class_names = list(label_encoder.classes_)

logistic_cm = confusion_matrix(y_test, logistic_pred, labels=class_ids)
decision_tree_cm = confusion_matrix(y_test, decision_tree_pred, labels=class_ids)
random_forest_cm = confusion_matrix(y_test, random_forest_pred, labels=class_ids)


def plot_confusion_matrix(cm, title, labels=None):
    """Draw a confusion matrix as an annotated heatmap on a new figure.

    Parameters
    ----------
    cm : array-like of int
        Square confusion matrix (rows = true class, columns = predicted class).
    title : str
        Title placed above the heatmap.
    labels : sequence, optional
        Class names used as tick labels on both axes, in the same order as the
        rows/columns of ``cm``. When omitted, seaborn's automatic tick labels
        are used, as before.

    Returns
    -------
    None
        The figure is left open for the notebook backend (or a later
        ``plt.show()``) to render.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    tick_labels = 'auto' if labels is None else list(labels)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # cell counts are integers
        cmap='Blues',
        cbar=False,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(title)


# Plotting confusion matrices, labelled with the original class names.
plot_confusion_matrix(logistic_cm, 'Logistic Regression Confusion Matrix', labels=class_names)
plot_confusion_matrix(decision_tree_cm, 'Decision Tree Confusion Matrix', labels=class_names)
plot_confusion_matrix(random_forest_cm, 'Random Forest Confusion Matrix', labels=class_names)
plt.show()

In [None]:
# Classify one hand-written example with the random forest.
# Each feature is a one-element list so the dict becomes a single-row frame.
sample_data = dict(
    long_hair=[1],
    forehead_width_cm=[12.5],
    forehead_height_cm=[5.8],
    nose_wide=[1],
    nose_long=[0],
    lips_thin=[1],
    distance_nose_to_lip_long=[0],
)
sample_df = pd.DataFrame(data=sample_data)

# Predict the encoded class id, then map it back to the original label.
predicted_gender = random_forest_model.predict(sample_df)
predicted_gender_label = label_encoder.inverse_transform(predicted_gender)

# Print the prediction with an explanatory message
print(f"The model predicts the gender as '{predicted_gender_label[0]}' for the given features.")