In [1]:
import os
import base64
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

class PeopleGenderAnalyzer:
    """
    Detect people in images with the Google Cloud Vision API and tabulate
    per-image people counts and gender proportions.

    WARNING: Cloud Vision does not return gender. The per-face "gender"
    values produced by `_analyze_people_and_gender` are random placeholders,
    so every female/male figure derived from this class is demonstration
    data, not a measurement.
    """

    # Column order of the per-image summary table built by analyze_directory.
    RESULT_COLUMNS = [
        "image_path",
        "has_people",
        "people_count",
        "female_count",
        "male_count",
        "female_proportion",
        "gender_predictions",
    ]

    # Lower-case file suffixes treated as images by analyze_directory.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    # Seconds to wait on the Vision API before giving up on one image.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key=None):
        """
        Initialize the People Gender Analyzer.

        Args:
            api_key (str): Google Cloud API key
        """
        self.api_key = api_key
        self.api_url = "https://vision.googleapis.com/v1/images:annotate"

        # Minimum detection scores for a face / a "person" object to be used
        self.face_confidence_threshold = 0.7
        self.person_confidence_threshold = 0.6

        print("Initialized People Gender Analyzer")

    def analyze_image(self, image_path):
        """
        Analyze a single image to detect people and predict their gender.

        Sends the image to the Vision API with FACE_DETECTION and
        OBJECT_LOCALIZATION features and passes the annotations to
        `_analyze_people_and_gender`.

        Args:
            image_path (str): Path to the image file

        Returns:
            dict: The result of `_analyze_people_and_gender` on success, or
            ``{"error": <message>}`` when the key is missing, the file does
            not exist, the request fails or times out, or the API reports an
            error for the image. This method does not raise.
        """
        if not self.api_key:
            return {"error": "API key not provided"}

        if not os.path.exists(image_path):
            return {"error": f"Image file not found: {image_path}"}

        try:
            # Read and encode image
            with open(image_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('UTF-8')

            # Create request payload
            request_json = {
                "requests": [
                    {
                        "image": {
                            "content": encoded_image
                        },
                        "features": [
                            {"type": "FACE_DETECTION", "maxResults": 100},
                            {"type": "OBJECT_LOCALIZATION", "maxResults": 100}
                        ]
                    }
                ]
            }

            # Make API request. Without a timeout a stalled connection would
            # block the whole batch forever; a timeout surfaces as an error
            # dict through the except clause below.
            response = requests.post(
                f"{self.api_url}?key={self.api_key}",
                json=request_json,
                timeout=self.REQUEST_TIMEOUT
            )

            if response.status_code != 200:
                return {"error": f"API Error: {response.status_code} - {response.text}"}

            # Parse response (one request was sent, so one response is expected)
            api_response = response.json()["responses"][0]

            # The API can answer HTTP 200 and still report a per-image failure
            # in an "error" object; treat that as a failure instead of
            # recording the image as containing nobody.
            if "error" in api_response:
                message = api_response["error"].get("message", "unknown error")
                return {"error": f"API Error: {message}"}

            # Extract annotations
            faces = api_response.get("faceAnnotations", [])
            objects = api_response.get("localizedObjectAnnotations", [])

            # Process people detection and gender prediction
            return self._analyze_people_and_gender(faces, objects, image_path)

        except Exception as e:
            return {"error": f"Error analyzing image: {str(e)}"}

    def _analyze_people_and_gender(self, faces, objects, image_path):
        """
        Process face and object detections to count people and attach a
        (placeholder) gender prediction to each confident face.

        NOTE: the gender value is NOT a prediction. Cloud Vision does not
        return gender, so each confident face gets a random
        ``female_probability`` drawn uniformly from [0.3, 0.7]. Replace the
        marked line with a real classifier before using the gender columns.

        Args:
            faces: Face annotations from Cloud Vision
            objects: Object localization annotations from Cloud Vision
            image_path: Path to the analyzed image

        Returns:
            dict: ``image_path``, ``people_count`` (the larger of confident
            faces and confident "person" objects), ``has_people``,
            ``gender_predictions`` (one dict per confident face),
            ``female_proportion`` (0 when there are no predictions),
            ``female_count`` and ``male_count``.
        """
        # Count people from object detection
        people_objects = []
        for obj in objects:
            is_person = obj.get("name", "").lower() == "person"
            if is_person and obj.get("score", 0) >= self.person_confidence_threshold:
                people_objects.append({
                    "name": "person",
                    "score": obj.get("score", 0),
                    "boundingPoly": obj.get("boundingPoly", {})
                })

        # One entry per face that clears the face confidence threshold
        gender_predictions = []
        for i, face in enumerate(faces):
            if face.get("detectionConfidence", 0) < self.face_confidence_threshold:
                continue

            # PLACEHOLDER: symmetric random value, not a model output.
            # Replace with a real gender classifier in a real application.
            female_probability = np.random.uniform(0.3, 0.7)

            gender_predictions.append({
                "person_id": i + 1,  # 1-based index among all returned faces
                "female_probability": female_probability,
                "predicted_gender": "female" if female_probability > 0.5 else "male",
                # Distance from 0.5 rescaled to the 0..1 range
                "confidence": abs(female_probability - 0.5) * 2
            })

        # Calculate the proportion of women (based on faces with gender predictions)
        female_count = sum(1 for pred in gender_predictions if pred["predicted_gender"] == "female")
        total_with_predictions = len(gender_predictions)
        female_proportion = female_count / total_with_predictions if total_with_predictions > 0 else 0

        # Use the larger of the two detector counts. Only faces that passed
        # the confidence threshold count, mirroring how person objects are
        # filtered, so a low-confidence face alone does not mark an image as
        # containing people.
        people_count = max(total_with_predictions, len(people_objects))

        return {
            "image_path": image_path,
            "people_count": people_count,
            "has_people": people_count > 0,
            "gender_predictions": gender_predictions,
            "female_proportion": female_proportion,
            "female_count": female_count,
            "male_count": total_with_predictions - female_count
        }

    def analyze_directory(self, images_dir, output_csv=None):
        """
        Analyze all images in a directory tree and output results to CSV.

        Images that fail analysis are reported on stdout and left out of the
        table.

        Args:
            images_dir (str): Directory containing images (searched recursively)
            output_csv (str): Path to save CSV results

        Returns:
            pd.DataFrame: One row per successfully analyzed image with the
            columns in ``RESULT_COLUMNS`` (present even when there are no
            rows), or None when the API key is missing or the directory does
            not exist.
        """
        if not self.api_key:
            print("Error: API key not provided")
            return None

        if not os.path.exists(images_dir):
            print(f"Error: Directory not found: {images_dir}")
            return None

        # Create output directory if it doesn't exist
        if output_csv:
            output_dir = os.path.dirname(output_csv)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        # Find all image files
        image_files = []
        for root, _, files in os.walk(images_dir):
            for file in files:
                if file.lower().endswith(self.IMAGE_EXTENSIONS):
                    image_files.append(os.path.join(root, file))

        print(f"Found {len(image_files)} images to analyze")

        all_results = []

        # Process each image
        for image_file in tqdm(image_files, desc="Analyzing images for people and gender"):
            results = self.analyze_image(image_file)

            if "error" in results:
                print(f"Error analyzing {image_file}: {results['error']}")
                continue

            # Human-readable per-face summary, e.g. "person1: male (0.42)"
            gender_prediction_str = ", ".join(
                f"person{pred['person_id']}: {pred['predicted_gender']} ({pred['female_probability']:.2f})"
                for pred in results.get("gender_predictions", [])
            )

            all_results.append({
                "image_path": image_file,
                "has_people": results.get("has_people", False),
                "people_count": results.get("people_count", 0),
                "female_count": results.get("female_count", 0),
                "male_count": results.get("male_count", 0),
                "female_proportion": results.get("female_proportion", 0),
                "gender_predictions": gender_prediction_str
            })

        # Fixing the columns keeps the table (and the CSV header) well-formed
        # even when no image could be analyzed.
        results_df = pd.DataFrame(all_results, columns=self.RESULT_COLUMNS)

        # Save results to CSV if path is provided
        if output_csv:
            results_df.to_csv(output_csv, index=False)
            print(f"Results saved to {output_csv}")

        return results_df

    def generate_summary_charts(self, results_df, output_dir):
        """
        Generate summary charts based on the analysis results.

        Writes up to three PNG files into ``output_dir``:
        ``people_count_distribution.png``, ``gender_distribution.png``
        (skipped when there are no gender predictions at all) and
        ``female_proportion_distribution.png``.

        Args:
            results_df (pd.DataFrame): Analysis results
            output_dir (str): Directory to save charts (created if missing)

        Returns:
            None
        """
        os.makedirs(output_dir, exist_ok=True)

        if results_df is None or results_df.empty:
            print("No data to generate charts")
            return

        # Filter only images with people
        people_df = results_df[results_df['has_people'].astype(bool)].copy()

        if people_df.empty:
            print("No images with people found in the analysis")
            return

        # 1. People count distribution
        plt.figure(figsize=(10, 6))
        people_counts = people_df['people_count'].value_counts().sort_index()
        people_counts.plot(kind='bar', color='skyblue')
        plt.title('Number of People per Image')
        plt.xlabel('People Count')
        plt.ylabel('Number of Images')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'people_count_distribution.png'))
        plt.close()

        # 2. Gender distribution. People found only by object detection have
        # no gender prediction, so both totals can be zero; a pie chart of
        # zeros has no meaningful shares, so it is skipped in that case.
        female_total = people_df['female_count'].sum()
        male_total = people_df['male_count'].sum()
        if female_total + male_total > 0:
            plt.figure(figsize=(10, 6))
            plt.pie([female_total, male_total], labels=['Female', 'Male'], autopct='%1.1f%%',
                    colors=['pink', 'lightblue'], startangle=90)
            plt.axis('equal')
            plt.title('Overall Gender Distribution')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'gender_distribution.png'))
            plt.close()
        else:
            print("No gender predictions available; skipping gender distribution chart")

        # 3. Female proportion distribution. include_lowest=True puts a
        # proportion of exactly 0 into the '0-20%' bin; otherwise the first
        # interval is (0, 0.2] and every image without women is dropped.
        plt.figure(figsize=(10, 6))
        people_df['female_proportion_bin'] = pd.cut(
            people_df['female_proportion'],
            bins=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            labels=['0-20%', '20-40%', '40-60%', '60-80%', '80-100%'],
            include_lowest=True
        )
        prop_counts = people_df['female_proportion_bin'].value_counts().sort_index()
        prop_counts.plot(kind='bar', color='purple')
        plt.title('Distribution of Female Proportion in Images')
        plt.xlabel('Female Proportion Range')
        plt.ylabel('Number of Images')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'female_proportion_distribution.png'))
        plt.close()

        print(f"Charts saved to {output_dir}")


# Main function to process all images in a directory
def analyze_people_in_images(api_key, images_dir, output_csv=None, generate_charts=False, charts_dir=None):
    """
    Process all images in a directory using Google Cloud Vision API
    and output people count, gender predictions, and female proportion to CSV.

    Also prints summary statistics to stdout when at least one image was
    analyzed successfully.

    Args:
        api_key (str): Google Cloud API key
        images_dir (str): Directory containing images
        output_csv (str): Path to save CSV results
        generate_charts (bool): Whether to generate summary charts
        charts_dir (str): Directory to save charts if generated

    Returns:
        pd.DataFrame: Analysis results (None when analyze_directory returns None)
    """
    # Initialize the analyzer
    analyzer = PeopleGenderAnalyzer(api_key=api_key)

    # Analyze all images
    results_df = analyzer.analyze_directory(images_dir, output_csv)

    # Nothing to chart or summarise
    if results_df is None or results_df.empty:
        return results_df

    # Generate charts if requested
    if generate_charts and charts_dir:
        analyzer.generate_summary_charts(results_df, charts_dir)

    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Total images analyzed: {len(results_df)}")
    print(f"Images with people: {results_df['has_people'].sum()} ({results_df['has_people'].mean()*100:.1f}%)")

    # Filter for images with people
    people_df = results_df[results_df['has_people'].astype(bool)]
    if people_df.empty:
        return results_df

    total_people = people_df['people_count'].sum()
    total_female = people_df['female_count'].sum()
    total_male = people_df['male_count'].sum()
    # People found only by object detection carry no gender prediction, so
    # this can be zero even though people were detected.
    total_gendered = total_female + total_male

    print(f"Total people detected: {total_people}")
    print(f"Average people per image: {people_df['people_count'].mean():.2f}")
    if total_gendered > 0:
        print(f"Total females: {total_female} ({total_female/total_gendered*100:.1f}%)")
        print(f"Total males: {total_male} ({total_male/total_gendered*100:.1f}%)")
    else:
        # Avoid a 0/0 division that would print "nan%"
        print(f"Total females: {total_female} (n/a)")
        print(f"Total males: {total_male} (n/a)")
    print(f"Average female proportion: {people_df['female_proportion'].mean()*100:.1f}%")

    # Print distribution of number of people
    print("\nPeople Count Distribution:")
    people_count_dist = people_df['people_count'].value_counts().sort_index()
    for count, frequency in people_count_dist.items():
        print(f"{count} people: {frequency} images ({frequency/len(people_df)*100:.1f}%)")

    return results_df

In [2]:
# Read the Google Cloud API key from a local file so it is not hard-coded here
with open("google_api.txt", 'r') as f:
    API_KEY = f.read().strip()

# Input images and output locations
IMAGES_DIR = "../sv_images/mumbai"
OUTPUT_CSV = "../coded_results/mumbai_women_results.csv"

# Chart generation switch and destination
GENERATE_CHARTS = True
CHARTS_DIR = "women_condition_charts"

# Run the analysis (GENERATE_CHARTS is passed through so the switch above
# actually controls chart generation)
results = analyze_people_in_images(
    api_key=API_KEY,
    images_dir=IMAGES_DIR,
    output_csv=OUTPUT_CSV,
    generate_charts=GENERATE_CHARTS,
    charts_dir=CHARTS_DIR
)

Initialized People Gender Analyzer
Found 123 images to analyze


Analyzing images for people and gender: 100%|█| 123/123 [01:13<00:00,  1.67it/s]


Results saved to ../coded_results/mumbai_women_results.csv
Charts saved to women_condition_charts

Summary Statistics:
Total images analyzed: 123
Images with people: 36 (29.3%)
Total people detected: 104
Average people per image: 2.89
Total females: 4 (66.7%)
Total males: 2 (33.3%)
Average female proportion: 4.4%

People Count Distribution:
1 people: 13 images (36.1%)
2 people: 6 images (16.7%)
3 people: 7 images (19.4%)
4 people: 1 images (2.8%)
5 people: 5 images (13.9%)
6 people: 2 images (5.6%)
7 people: 1 images (2.8%)
10 people: 1 images (2.8%)


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2
from deepface import DeepFace
import numpy as np

class DeepFacePeopleGenderAnalyzer:
    """
    Analyzes images to detect faces, predict their gender, and calculate
    gender proportions using the open-source DeepFace library.

    "People" here means faces that were detected with sufficient confidence
    and for which gender analysis succeeded.
    """

    # Column order of the per-image summary table built by analyze_directory.
    RESULT_COLUMNS = [
        "image_path",
        "has_people",
        "people_count",
        "female_count",
        "male_count",
        "female_proportion",
        "gender_predictions",
    ]

    # Lower-case file suffixes treated as images by analyze_directory.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    # Face detector backends, tried in this order until one does not raise.
    DETECTOR_BACKENDS = ("retinaface", "opencv")

    def __init__(self):
        """
        Initialize the DeepFace People Gender Analyzer.
        """
        # Minimum detector confidence for a face to be analyzed
        self.detection_confidence = 0.6

        print("Initialized DeepFace People Gender Analyzer")

    @staticmethod
    def _empty_result(image_path):
        """Return the result dict for an image in which nobody was found."""
        return {
            "image_path": image_path,
            "people_count": 0,
            "has_people": False,
            "gender_predictions": [],
            "female_proportion": 0,
            "female_count": 0,
            "male_count": 0
        }

    def analyze_image(self, image_path):
        """
        Analyze a single image to detect faces and predict their gender.

        Args:
            image_path (str): Path to the image file

        Returns:
            dict: On success ``image_path``, ``people_count``, ``has_people``,
            ``gender_predictions``, ``gender_prediction_str``,
            ``female_proportion``, ``female_count`` and ``male_count``. When
            every detector backend fails, an all-zero result without
            ``gender_prediction_str``. ``{"error": <message>}`` when the file
            is missing or unreadable or an unexpected exception occurs. This
            method does not raise.
        """
        if not os.path.exists(image_path):
            return {"error": f"Image file not found: {image_path}"}

        try:
            # Decoding with OpenCV is used only as a readability check
            img = cv2.imread(image_path)
            if img is None:
                return {"error": f"Failed to read image: {image_path}"}

            # Try each detector backend in turn. Only Exception is caught so
            # that Ctrl-C (KeyboardInterrupt) still stops a long batch run.
            faces = None
            for backend in self.DETECTOR_BACKENDS:
                try:
                    faces = DeepFace.extract_faces(
                        img_path=image_path,
                        detector_backend=backend,
                        enforce_detection=False,
                        align=True
                    )
                    break
                except Exception:
                    continue

            # If all detectors fail, return empty results
            if faces is None:
                return self._empty_result(image_path)

            # Filter faces based on confidence
            valid_faces = [face for face in faces if face.get("confidence", 0) >= self.detection_confidence]

            # Analyze each detected face for gender
            gender_predictions = []
            female_count = 0
            male_count = 0

            for i, face in enumerate(valid_faces):
                try:
                    analysis = DeepFace.analyze(
                        img_path=face['face'],  # pass the extracted face image directly
                        actions=['gender'],
                        enforce_detection=False,
                        detector_backend="skip"  # Skip detection as we already have the face
                    )

                    if isinstance(analysis, list):
                        analysis = analysis[0]  # Take first result if it's a list

                    dominant = analysis.get('dominant_gender', '')
                    scores = analysis.get('gender', {})

                    # Map DeepFace labels explicitly. A missing or unexpected
                    # label is skipped instead of being counted as female.
                    # NOTE(review): the scores appear to be percentages
                    # (0-100) rather than probabilities - confirm.
                    if dominant == 'Man':
                        gender = 'male'
                        gender_score = scores.get('Man', 0.0)
                        male_count += 1
                    elif dominant == 'Woman':
                        gender = 'female'
                        gender_score = scores.get('Woman', 0.0)
                        female_count += 1
                    else:
                        print(f"Warning: unrecognized gender label {dominant!r} for face {i + 1} in {image_path}; skipping")
                        continue

                    gender_predictions.append({
                        "person_id": i + 1,
                        "gender": gender,
                        "confidence": gender_score
                    })

                except Exception as e:
                    # Best effort: one bad face must not discard the image,
                    # but report it so missing faces are explainable.
                    print(f"Warning: gender analysis failed for face {i + 1} in {image_path}: {e}")
                    continue

            # Calculate the proportion of women
            total_count = len(gender_predictions)
            female_proportion = female_count / total_count if total_count > 0 else 0

            # Human-readable per-face summary
            gender_prediction_str = ", ".join(
                f"person{pred['person_id']}: {pred['gender']} ({pred['confidence']:.2f})"
                for pred in gender_predictions
            )

            return {
                "image_path": image_path,
                "people_count": total_count,
                "has_people": total_count > 0,
                "gender_predictions": gender_predictions,
                "gender_prediction_str": gender_prediction_str,
                "female_proportion": female_proportion,
                "female_count": female_count,
                "male_count": male_count
            }

        except Exception as e:
            return {"error": f"Error analyzing image: {str(e)}"}

    def analyze_directory(self, images_dir, output_csv=None):
        """
        Analyze all images in a directory tree and output results to CSV.

        Images that fail analysis are reported on stdout and left out of the
        table.

        Args:
            images_dir (str): Directory containing images (searched recursively)
            output_csv (str): Path to save CSV results

        Returns:
            pd.DataFrame: One row per successfully analyzed image with the
            columns in ``RESULT_COLUMNS`` (present even when there are no
            rows), or None when the directory does not exist.
        """
        if not os.path.exists(images_dir):
            print(f"Error: Directory not found: {images_dir}")
            return None

        # Create output directory if it doesn't exist
        if output_csv:
            output_dir = os.path.dirname(output_csv)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        # Find all image files
        image_files = []
        for root, _, files in os.walk(images_dir):
            for file in files:
                if file.lower().endswith(self.IMAGE_EXTENSIONS):
                    image_files.append(os.path.join(root, file))

        print(f"Found {len(image_files)} images to analyze")

        all_results = []

        # Process each image
        for image_file in tqdm(image_files, desc="Analyzing images for people and gender"):
            results = self.analyze_image(image_file)

            if "error" in results:
                print(f"Error analyzing {image_file}: {results['error']}")
                continue

            all_results.append({
                "image_path": image_file,
                "has_people": results.get("has_people", False),
                "people_count": results.get("people_count", 0),
                "female_count": results.get("female_count", 0),
                "male_count": results.get("male_count", 0),
                "female_proportion": results.get("female_proportion", 0),
                "gender_predictions": results.get("gender_prediction_str", "")
            })

        # Fixing the columns keeps the table (and the CSV header) well-formed
        # even when no image could be analyzed.
        results_df = pd.DataFrame(all_results, columns=self.RESULT_COLUMNS)

        # Save results to CSV if path is provided
        if output_csv:
            results_df.to_csv(output_csv, index=False)
            print(f"Results saved to {output_csv}")

        return results_df

    def generate_summary_charts(self, results_df, output_dir):
        """
        Generate summary charts based on the analysis results.

        Writes up to three PNG files into ``output_dir``:
        ``people_count_distribution.png``, ``gender_distribution.png``
        (skipped when both gender totals are zero) and
        ``female_proportion_distribution.png``.

        Args:
            results_df (pd.DataFrame): Analysis results
            output_dir (str): Directory to save charts (created if missing)

        Returns:
            None
        """
        os.makedirs(output_dir, exist_ok=True)

        if results_df is None or results_df.empty:
            print("No data to generate charts")
            return

        # Filter only images with people
        people_df = results_df[results_df['has_people'].astype(bool)].copy()

        if people_df.empty:
            print("No images with people found in the analysis")
            return

        # 1. People count distribution
        plt.figure(figsize=(10, 6))
        people_counts = people_df['people_count'].value_counts().sort_index()
        people_counts.plot(kind='bar', color='skyblue')
        plt.title('Number of People per Image')
        plt.xlabel('People Count')
        plt.ylabel('Number of Images')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'people_count_distribution.png'))
        plt.close()

        # 2. Gender distribution (a pie chart of two zeros has no meaningful
        # shares, so it is skipped when no gender counts are present)
        female_total = people_df['female_count'].sum()
        male_total = people_df['male_count'].sum()
        if female_total + male_total > 0:
            plt.figure(figsize=(10, 6))
            plt.pie([female_total, male_total], labels=['Female', 'Male'], autopct='%1.1f%%',
                    colors=['pink', 'lightblue'], startangle=90)
            plt.axis('equal')
            plt.title('Overall Gender Distribution')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'gender_distribution.png'))
            plt.close()
        else:
            print("No gender predictions available; skipping gender distribution chart")

        # 3. Female proportion distribution. include_lowest=True puts a
        # proportion of exactly 0 into the '0-20%' bin; otherwise the first
        # interval is (0, 0.2] and every image without women is dropped.
        plt.figure(figsize=(10, 6))
        people_df['female_proportion_bin'] = pd.cut(
            people_df['female_proportion'],
            bins=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            labels=['0-20%', '20-40%', '40-60%', '60-80%', '80-100%'],
            include_lowest=True
        )
        prop_counts = people_df['female_proportion_bin'].value_counts().sort_index()
        prop_counts.plot(kind='bar', color='purple')
        plt.title('Distribution of Female Proportion in Images')
        plt.xlabel('Female Proportion Range')
        plt.ylabel('Number of Images')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'female_proportion_distribution.png'))
        plt.close()

        print(f"Charts saved to {output_dir}")


# Main function to process all images in a directory
def analyze_people_in_images(images_dir, output_csv=None, generate_charts=False, charts_dir=None):
    """
    Run the DeepFace analyzer over a directory of images, optionally write
    charts, and print summary statistics to stdout.

    Args:
        images_dir (str): Directory containing images
        output_csv (str): Path to save CSV results
        generate_charts (bool): Whether to generate summary charts
        charts_dir (str): Directory to save charts if generated

    Returns:
        pd.DataFrame: Analysis results, exactly as returned by
        DeepFacePeopleGenderAnalyzer.analyze_directory (may be None)
    """
    face_analyzer = DeepFacePeopleGenderAnalyzer()
    results_df = face_analyzer.analyze_directory(images_dir, output_csv)

    # Charts and statistics both require at least one analyzed image
    if results_df is None or results_df.empty:
        return results_df

    if generate_charts and charts_dir:
        face_analyzer.generate_summary_charts(results_df, charts_dir)

    image_total = len(results_df)
    with_people_total = results_df['has_people'].sum()
    with_people_pct = results_df['has_people'].mean()*100

    print("\nSummary Statistics:")
    print(f"Total images analyzed: {image_total}")
    print(f"Images with people: {with_people_total} ({with_people_pct:.1f}%)")

    # Remaining statistics cover only the images in which people were found
    with_people = results_df[results_df['has_people'] == True]
    if with_people.empty:
        return results_df

    head_count = with_people['people_count'].sum()
    women = with_people['female_count'].sum()
    men = with_people['male_count'].sum()
    gendered = women + men

    print(f"Total people detected: {head_count}")
    print(f"Average people per image: {with_people['people_count'].mean():.2f}")
    print(f"Total females: {women} ({women/gendered*100:.1f}%)")
    print(f"Total males: {men} ({men/gendered*100:.1f}%)")
    print(f"Average female proportion: {with_people['female_proportion'].mean()*100:.1f}%")

    # How many images contain 1, 2, 3, ... people
    print("\nPeople Count Distribution:")
    distribution = with_people['people_count'].value_counts().sort_index()
    n_with_people = len(with_people)
    for n_people, n_images in distribution.items():
        print(f"{n_people} people: {n_images} images ({n_images/n_with_people*100:.1f}%)")

    return results_df

In [None]:
# Input images and output locations for the DeepFace run
IMAGES_DIR = "../sv_images/mumbai"
OUTPUT_CSV = "results/people_gender_analysis.csv"
CHARTS_DIR = "results/charts"

# Analyze every image, save the CSV and write the summary charts
results = analyze_people_in_images(
    IMAGES_DIR,
    output_csv=OUTPUT_CSV,
    charts_dir=CHARTS_DIR,
    generate_charts=True,
)