### Open Source Number of Men/Women Coder

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2
from deepface import DeepFace
import numpy as np

class DeepFacePeopleGenderAnalyzer:
    """
    Analyzes images to detect people, predict their gender, and calculate gender proportions
    using the open-source DeepFace library.

    Typical use is ``analyze_directory`` (walks a folder, returns a DataFrame and
    optionally writes a CSV), followed by ``generate_summary_charts``.
    ``analyze_image`` handles a single file and never raises: failures are
    reported as ``{"error": "..."}``.
    """

    # Detector backends tried in order; the first one that does not raise is used.
    DETECTOR_BACKENDS = ("retinaface", "opencv")

    # Lower-case file suffixes that analyze_directory treats as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    # DeepFace's gender labels mapped to the labels used in this class's output.
    GENDER_LABELS = {'Man': 'male', 'Woman': 'female'}

    # Column order of the per-image summary table built by analyze_directory.
    SUMMARY_COLUMNS = (
        "image_path",
        "has_people",
        "people_count",
        "female_count",
        "male_count",
        "female_proportion",
        "gender_predictions",
    )

    def __init__(self, detection_confidence=0.6):
        """
        Initialize the DeepFace People Gender Analyzer.

        Args:
            detection_confidence (float): Minimum face-detection confidence a
                detection must reach to be counted as a person (default 0.6)
        """
        # Detection and confidence thresholds
        self.detection_confidence = detection_confidence

        print("Initialized DeepFace People Gender Analyzer")

    def analyze_image(self, image_path):
        """
        Analyze a single image to detect people and predict their gender.

        Args:
            image_path (str): Path to the image file

        Returns:
            dict: On success, a dict with keys ``image_path``, ``people_count``,
                ``has_people``, ``gender_predictions`` (list of dicts with
                ``person_id``, ``gender``, ``confidence``),
                ``gender_prediction_str``, ``female_proportion``,
                ``female_count`` and ``male_count``. On failure, a dict with a
                single ``error`` key.
        """
        if not os.path.exists(image_path):
            return {"error": f"Image file not found: {image_path}"}

        try:
            # Decode once up front purely to reject unreadable/corrupt files early;
            # DeepFace re-reads the file from its path below.
            if cv2.imread(image_path) is None:
                return {"error": f"Failed to read image: {image_path}"}

            faces = self._detect_faces(image_path)
            if faces is None:
                # Every detector backend raised: report "no people" rather than an error
                return self._build_result(image_path, [])

            # Keep only detections at or above the confidence threshold.
            # `or 0` treats a missing/None confidence as "not confident".
            valid_faces = [
                face for face in faces
                if (face.get("confidence") or 0) >= self.detection_confidence
            ]

            gender_predictions = []
            for person_id, face in enumerate(valid_faces, start=1):
                try:
                    analysis = DeepFace.analyze(
                        img_path=face['face'],  # Pass the cropped face directly
                        actions=['gender'],
                        enforce_detection=False,
                        detector_backend="skip"  # Skip detection as we already have the face
                    )
                    prediction = self._interpret_gender(analysis)
                except Exception as exc:
                    # One bad face must not discard the rest of the image
                    print(f"Warning: gender analysis failed for person{person_id} in {image_path}: {exc}")
                    continue

                if prediction is None:
                    # Unrecognised label: skip instead of guessing a gender
                    continue

                gender, gender_score = prediction
                gender_predictions.append({
                    "person_id": person_id,
                    "gender": gender,
                    "confidence": gender_score
                })

            return self._build_result(image_path, gender_predictions)

        except Exception as e:
            return {"error": f"Error analyzing image: {str(e)}"}

    def _detect_faces(self, image_path):
        """Return DeepFace face detections for the image, or None if every backend raised."""
        for backend in self.DETECTOR_BACKENDS:
            try:
                return DeepFace.extract_faces(
                    img_path=image_path,
                    detector_backend=backend,
                    enforce_detection=False,
                    align=True
                )
            except Exception:
                # Fall through to the next backend
                continue
        return None

    def _interpret_gender(self, analysis):
        """Map a DeepFace.analyze result to (gender, score), or None if the label is unknown."""
        if isinstance(analysis, list):
            if not analysis:
                return None
            analysis = analysis[0]  # Take first result if it's a list

        dominant = analysis.get('dominant_gender', '')
        gender = self.GENDER_LABELS.get(dominant)
        if gender is None:
            return None

        # NOTE(review): DeepFace's per-label gender scores look like percentages
        # (0-100) rather than probabilities - confirm against the installed version.
        gender_score = analysis.get('gender', {}).get(dominant, 0.0)
        return gender, gender_score

    @staticmethod
    def _build_result(image_path, gender_predictions):
        """Assemble the success-result dict (counts, proportion, summary string) from predictions."""
        total_count = len(gender_predictions)
        female_count = sum(1 for pred in gender_predictions if pred["gender"] == "female")
        male_count = total_count - female_count

        # Proportion of women among the people whose gender was predicted
        female_proportion = female_count / total_count if total_count > 0 else 0

        gender_prediction_str = ", ".join(
            f"person{pred['person_id']}: {pred['gender']} ({pred['confidence']:.2f})"
            for pred in gender_predictions
        )

        return {
            "image_path": image_path,
            "people_count": total_count,
            "has_people": total_count > 0,
            "gender_predictions": gender_predictions,
            "gender_prediction_str": gender_prediction_str,
            "female_proportion": female_proportion,
            "female_count": female_count,
            "male_count": male_count
        }

    def analyze_directory(self, images_dir, output_csv=None):
        """
        Analyze all images in a directory and output results to CSV.

        The directory is searched recursively for .jpg/.jpeg/.png files, which are
        processed in sorted path order so repeated runs produce the same row order.
        Images whose analysis returns an error are reported on stdout and left out
        of the table.

        Args:
            images_dir (str): Directory containing images
            output_csv (str): Path to save CSV results

        Returns:
            pd.DataFrame: Analysis results for all images (one row per image, with
                the columns listed in SUMMARY_COLUMNS), or None if images_dir is
                not an existing directory
        """
        if not os.path.isdir(images_dir):
            print(f"Error: Directory not found: {images_dir}")
            return None

        # Create output directory if it doesn't exist
        if output_csv:
            output_dir = os.path.dirname(output_csv)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        # Find all image files
        image_files = []
        for root, _, files in os.walk(images_dir):
            for file_name in files:
                if file_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    image_files.append(os.path.join(root, file_name))
        image_files.sort()

        print(f"Found {len(image_files)} images to analyze")

        all_results = []

        # Process each image
        for image_file in tqdm(image_files, desc="Analyzing images for people and gender"):
            results = self.analyze_image(image_file)

            if "error" in results:
                print(f"Error analyzing {image_file}: {results['error']}")
                continue

            all_results.append({
                "image_path": image_file,
                "has_people": results.get("has_people", False),
                "people_count": results.get("people_count", 0),
                "female_count": results.get("female_count", 0),
                "male_count": results.get("male_count", 0),
                "female_proportion": results.get("female_proportion", 0),
                "gender_predictions": results.get("gender_prediction_str", "")
            })

        # Fixed columns keep the table/CSV schema stable even when no image succeeded
        results_df = pd.DataFrame(all_results, columns=list(self.SUMMARY_COLUMNS))

        # Save results to CSV if path is provided
        if output_csv:
            results_df.to_csv(output_csv, index=False)
            print(f"Results saved to {output_csv}")

        return results_df

    def generate_summary_charts(self, results_df, charts_dir):
        """
        Save summary charts for a results table produced by analyze_directory.

        Writes ``people_count_distribution.png`` for all images and, when at least
        one image contains people, ``gender_totals.png`` and
        ``female_proportion_distribution.png``.

        Args:
            results_df (pd.DataFrame): Table with the SUMMARY_COLUMNS columns
            charts_dir (str): Directory to save the charts in (created if missing)

        Returns:
            list: Paths of the chart files that were written
        """
        if results_df is None or results_df.empty:
            print("No results available; skipping chart generation")
            return []

        os.makedirs(charts_dir, exist_ok=True)
        saved_paths = []

        # Chart 1: number of images per detected-people count (including zero)
        count_dist = results_df['people_count'].value_counts().sort_index()
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.bar([str(count) for count in count_dist.index], count_dist.values)
        ax.set_xlabel("People detected per image")
        ax.set_ylabel("Number of images")
        ax.set_title("People Count Distribution")
        saved_paths.append(self._save_chart(fig, charts_dir, "people_count_distribution.png"))

        people_df = results_df[results_df['has_people'].astype(bool)]
        if not people_df.empty:
            # Chart 2: total predicted females vs. males across all images
            fig, ax = plt.subplots(figsize=(6, 5))
            ax.bar(
                ["female", "male"],
                [people_df['female_count'].sum(), people_df['male_count'].sum()]
            )
            ax.set_ylabel("People detected")
            ax.set_title("Predicted Gender Totals")
            saved_paths.append(self._save_chart(fig, charts_dir, "gender_totals.png"))

            # Chart 3: per-image female proportion, images with people only
            fig, ax = plt.subplots(figsize=(8, 5))
            ax.hist(people_df['female_proportion'].astype(float), bins=10, range=(0, 1))
            ax.set_xlabel("Female proportion per image")
            ax.set_ylabel("Number of images")
            ax.set_title("Female Proportion Distribution")
            saved_paths.append(
                self._save_chart(fig, charts_dir, "female_proportion_distribution.png")
            )

        print(f"Charts saved to {charts_dir}")
        return saved_paths

    @staticmethod
    def _save_chart(fig, charts_dir, file_name):
        """Write the figure to charts_dir/file_name, close it to free memory, and return the path."""
        chart_path = os.path.join(charts_dir, file_name)
        fig.savefig(chart_path, bbox_inches="tight")
        plt.close(fig)
        return chart_path


# Main function to process all images in a directory
def analyze_people_in_images(images_dir, output_csv=None, generate_charts=False, charts_dir=None):
    """
    Process all images in a directory using DeepFace
    and output people count, gender predictions, and female proportion to CSV.

    After the analysis, summary statistics are printed to stdout. Chart
    generation is best-effort: if it is unavailable or fails, a warning is
    printed and the analysis results are still returned.

    Args:
        images_dir (str): Directory containing images
        output_csv (str): Path to save CSV results
        generate_charts (bool): Whether to generate summary charts
        charts_dir (str): Directory to save charts if generated

    Returns:
        pd.DataFrame: Analysis results (whatever analyze_directory returned,
            which may be None or an empty frame)
    """
    # Initialize the analyzer
    analyzer = DeepFacePeopleGenderAnalyzer()

    # Analyze all images
    results_df = analyzer.analyze_directory(images_dir, output_csv)

    # Nothing to chart or summarise
    if results_df is None or results_df.empty:
        return results_df

    # Generate charts if requested
    if generate_charts:
        if not charts_dir:
            print("Warning: generate_charts=True but no charts_dir was given; skipping charts")
        else:
            # Looked up defensively so a missing chart method cannot throw away
            # the analysis that has just been computed.
            chart_fn = getattr(analyzer, "generate_summary_charts", None)
            if callable(chart_fn):
                try:
                    chart_fn(results_df, charts_dir)
                except Exception as exc:
                    print(f"Warning: chart generation failed: {exc}")
            else:
                print("Warning: analyzer does not provide generate_summary_charts; skipping charts")

    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Total images analyzed: {len(results_df)}")
    print(f"Images with people: {results_df['has_people'].sum()} ({results_df['has_people'].mean()*100:.1f}%)")

    # Filter for images with people
    people_df = results_df[results_df['has_people'].astype(bool)]
    if people_df.empty:
        return results_df

    total_people = people_df['people_count'].sum()
    total_female = people_df['female_count'].sum()
    total_male = people_df['male_count'].sum()

    # Guard the percentage denominator so an inconsistent table cannot divide by zero
    gendered_total = total_female + total_male
    female_pct = total_female / gendered_total * 100 if gendered_total else 0.0
    male_pct = total_male / gendered_total * 100 if gendered_total else 0.0

    print(f"Total people detected: {total_people}")
    print(f"Average people per image: {people_df['people_count'].mean():.2f}")
    print(f"Total females: {total_female} ({female_pct:.1f}%)")
    print(f"Total males: {total_male} ({male_pct:.1f}%)")
    print(f"Average female proportion: {people_df['female_proportion'].mean()*100:.1f}%")

    # Print distribution of number of people
    print("\nPeople Count Distribution:")
    people_count_dist = people_df['people_count'].value_counts().sort_index()
    for count, frequency in people_count_dist.items():
        print(f"{count} people: {frequency} images ({frequency/len(people_df)*100:.1f}%)")

    return results_df

In [None]:
# Locations used by this run: input image folder, results CSV, and chart folder.
IMAGES_DIR = "../sv_images/mumbai"
OUTPUT_CSV = "results/people_gender_analysis.csv"
CHARTS_DIR = "results/charts"

# Run the full pipeline over IMAGES_DIR; the returned table is kept in `results`.
results = analyze_people_in_images(
    images_dir=IMAGES_DIR,
    output_csv=OUTPUT_CSV,
    charts_dir=CHARTS_DIR,
    generate_charts=True,
)