## Creating Dataset B
- Balanced distributions of ethnicities reflected in the original cleaned dataset
- Disgarded "unknown"

In [24]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Any
import copy
from collections import defaultdict
import hashlib
import time

In [25]:


class DonorDataAugmenter:
    def __init__(self, input_path: str, output_dir: str):
        self.input_path = input_path
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.original_data = None
        self.augmented_data = None
        self.transformation_summary = defaultdict(dict)
    
    def standardize_ethnicity(self, ethnicity: str) -> str:
        """Standardize ethnicity names and handle combined ethnicities."""
        if pd.isna(ethnicity):
            return "Unknown"
        
        # Handle the special case of mother/father ethnic origin
        if ethnicity.startswith("Mother:"):
            return "Mixed or Multi Ethnic"  # Simplify complex parent combinations
            
        # Basic standardization map
        standard_names = {
            'Caucasian/White': 'Caucasian',
            'Latino/Hispanic': 'Hispanic or Latino',
            'Latino': 'Hispanic or Latino',
            'African American/Black': 'Black or African American',
            'Black': 'Black or African American',
            'Multi/Mixed': 'Mixed or Multi Ethnic',
            'Multi': 'Mixed or Multi Ethnic',
            'Middle Eastern/African': 'Middle Eastern or Arabic',
            'Pacific Islander': 'Native Hawaiian or Other Pacific Islander',
            'American Indian': 'American Indian or Alaska Native'
        }
        
        # If it's a combined ethnicity, split, standardize each part, and sort
        if ',' in ethnicity:
            parts = [part.strip() for part in ethnicity.split(',')]
            standardized_parts = []
            for part in parts:
                # Standardize each individual ethnicity
                std_part = standard_names.get(part, part)
                standardized_parts.append(std_part)
            # Sort to ensure consistent ordering
            return ', '.join(sorted(set(standardized_parts)))  # Use set to remove duplicates
        
        return standard_names.get(ethnicity, ethnicity)

    def preprocess_ethnicity_data(self) -> None:
        """Preprocess ethnicity data to combine similar categories."""
        if 'ethnic_background' not in self.df.columns:
            return
            
        # First standardize all ethnicities
        self.df['ethnic_background'] = self.df['ethnic_background'].apply(self.standardize_ethnicity)
        
        # Group similar multi-ethnic combinations
        def simplify_multi_ethnic(ethnicity: str) -> str:
            if pd.isna(ethnicity) or ethnicity == "Unknown":
                return ethnicity
                
            parts = set(part.strip() for part in ethnicity.split(','))
            if len(parts) > 2:  # If more than two ethnicities
                return "Mixed or Multi Ethnic"
                
            return ethnicity
        
        self.df['ethnic_background'] = self.df['ethnic_background'].apply(simplify_multi_ethnic)
        
        # Count frequency of each category
        ethnicity_counts = self.df['ethnic_background'].value_counts()
        
        # Combine rare categories (those with only 1-2 examples) into broader categories
        rare_threshold = 3
        for idx, count in ethnicity_counts.items():
            if count < rare_threshold and ',' in idx:
                self.df.loc[self.df['ethnic_background'] == idx, 'ethnic_background'] = 'Mixed or Multi Ethnic'
        
    def load_data(self) -> None:
        """Load and preprocess the original JSON dataset."""
        with open(self.input_path, 'r') as f:
            self.original_data = json.load(f)
        
        # Convert to DataFrame for easier processing
        self.df = pd.DataFrame(self.original_data)
        
        # Preprocess ethnicity data
        self.preprocess_ethnicity_data()
        
        # Store original distribution
        self.transformation_summary['original_distribution'] = self.get_distribution_stats(self.df)

    def get_distribution_stats(self, df: pd.DataFrame) -> Dict:
        """Calculate distribution statistics for the dataset."""
        ethnic_dist = df['ethnic_background'].value_counts().to_dict()
        
        education_by_ethnicity = {}
        for ethnicity in df['ethnic_background'].unique():
            if pd.isna(ethnicity) or ethnicity == "Unknown":
                continue
            education_counts = df[df['ethnic_background'] == ethnicity]['education_level'].value_counts()
            education_by_ethnicity[ethnicity] = education_counts.to_dict()
        
        stats = {
            'ethnic_distribution': ethnic_dist,
            'education_by_ethnicity': education_by_ethnicity,
            'total_profiles': len(df),
            'null_values': df.isnull().sum().to_dict()
        }
        return stats

    def get_profile_hash(self, profile: Dict) -> str:
        """Create a more robust hash of profile attributes."""
        key_attrs = [
            'donor_description',
            'height',
            'weight',
            'education_level',
            'education_field',
            'eye_color',
            'hair_color',
            'ethnic_background'
        ]
        
        # Include attribute names and timestamp in hash
        hash_parts = []
        for attr in key_attrs:
            hash_parts.append(f"{attr}:{str(profile.get(attr, ''))}")
        
        # Add timestamp to make hash more unique
        hash_parts.append(f"timestamp:{time.time()}")
        
        hash_str = '|'.join(hash_parts)
        return hashlib.md5(hash_str.encode()).hexdigest()

    def modify_text(self, text: str) -> str:
        """Make more substantial modifications to text while maintaining meaning."""
        if not text:
            return text
            
        # Expanded replacements dictionary with more variations
        replacements = {
            'enjoys': ['likes', 'loves', 'is passionate about', 'takes pleasure in', 'has a fondness for', 'delights in'],
            'currently': ['presently', 'now', 'at present', 'at the moment', 'these days', 'as of now'],
            'studying': ['pursuing', 'working on', 'learning', 'focused on', 'specializing in', 'dedicated to'],
            'passionate': ['enthusiastic', 'dedicated', 'committed', 'devoted', 'driven', 'motivated'],
            'loves': ['enjoys', 'is passionate about', 'is enthusiastic about', 'adores', 'cherishes', 'treasures'],
            'working': ['employed', 'pursuing a career', 'engaged', 'involved', 'active', 'focused'],
            'interested': ['passionate', 'keen', 'focused', 'devoted', 'engaged', 'invested'],
            'like': ['enjoy', 'appreciate', 'value', 'favor', 'prefer', 'gravitate towards'],
            'very': ['quite', 'particularly', 'especially', 'notably', 'remarkably', 'exceptionally'],
            'good': ['great', 'excellent', 'exceptional', 'outstanding', 'impressive', 'remarkable']
        }
        
        modified_text = text
        # Apply multiple replacements with randomization
        for _ in range(np.random.randint(2, 5)):  # Variable number of replacements
            for word, alternatives in replacements.items():
                if word in modified_text.lower():
                    replacement = np.random.choice(alternatives)
                    modified_text = modified_text.replace(word, replacement)
        
        # Add sentence modifiers with more variety
        modifiers = [
            "In general, ",
            "For the most part, ",
            "Primarily, ",
            "Characteristically, ",
            "Typically, ",
            "As a person, ",
            "Overall, ",
            "By nature, "
        ]
        
        if np.random.random() < 0.4:  # 40% chance to add modifier
            modified_text = np.random.choice(modifiers) + modified_text.lower()
        
        return modified_text

    def create_synthetic_profile(self, ethnic_group: pd.DataFrame, used_hashes: set, max_attempts: int = 100) -> Dict:
        """Create a more diverse synthetic profile with enhanced variation."""
        ethnic_profiles = ethnic_group.to_dict('records')
        
        for attempt in range(max_attempts):
            # Select more templates for mixing
            n_templates = min(5, len(ethnic_profiles))
            templates = np.random.choice(ethnic_profiles, size=n_templates, replace=False)
            
            # Start with base template
            synthetic_profile = copy.deepcopy(templates[0])
            synthetic_profile['source'] = 'synthetic'
            
            # Mix features more aggressively
            if len(templates) > 1:
                for attr in ['height', 'weight', 'eye_color', 'hair_color', 'education_level', 'education_field']:
                    if np.random.random() < 0.6:  # 60% chance to take from another template
                        template = np.random.choice(templates[1:])
                        if template.get(attr):
                            synthetic_profile[attr] = template[attr]
            
            # Add variations to numerical values
            if synthetic_profile.get('height'):
                try:
                    height_val = float(synthetic_profile['height'].replace("'", "."))
                    height_val += np.random.uniform(-0.3, 0.3)  # Larger random adjustment
                    synthetic_profile['height'] = f"{height_val:.1f}'"
                except (ValueError, AttributeError):
                    pass
                
            if synthetic_profile.get('weight'):
                try:
                    weight_val = float(synthetic_profile['weight'])
                    weight_val += np.random.randint(-8, 9)  # Larger random adjustment
                    synthetic_profile['weight'] = str(int(weight_val))
                except (ValueError, TypeError):
                    pass
            
            # Enhanced description modification
            if synthetic_profile.get('donor_description'):
                modified_description = synthetic_profile['donor_description']
                for _ in range(np.random.randint(2, 5)):  # Variable number of modifications
                    modified_description = self.modify_text(modified_description)
                synthetic_profile['donor_description'] = modified_description
            
            # Add timestamp to ensure uniqueness
            synthetic_profile['generation_timestamp'] = time.time()
            
            profile_hash = self.get_profile_hash(synthetic_profile)
            if profile_hash not in used_hashes:
                used_hashes.add(profile_hash)
                return synthetic_profile
        
        # If we still couldn't create a unique profile
        base_profile = copy.deepcopy(np.random.choice(ethnic_profiles))
        base_profile['source'] = 'synthetic'
        timestamp = int(time.time() * 1000000)  # microsecond timestamp
        base_profile['donor_description'] = f"{base_profile.get('donor_description', '')} [Variant {timestamp}]"
        base_profile['generation_timestamp'] = time.time()
        
        profile_hash = self.get_profile_hash(base_profile)
        used_hashes.add(profile_hash)
        
        return base_profile

    def create_balanced_dataset(self) -> None:
        """Create a balanced dataset with equal ethnic representation."""
        df = self.df.copy()
        
        # Remove 'Unknown' category and NA values
        df = df[df['ethnic_background'].notna()]
        df = df[df['ethnic_background'] != 'Unknown']
        
        # Calculate target number per category
        total_target = len(self.original_data)
        n_categories = len(df['ethnic_background'].unique())
        target_per_category = total_target // n_categories
        
        balanced_data = []
        used_hashes = set()
        
        # Process each ethnic category
        for ethnicity in sorted(df['ethnic_background'].unique()):
            ethnic_group = df[df['ethnic_background'] == ethnicity]
            print(f"\nProcessing ethnicity: {ethnicity}")
            print(f"Original profiles: {len(ethnic_group)}")
            
            if len(ethnic_group) >= target_per_category:
                selected = ethnic_group.sample(n=target_per_category, random_state=42)
                for _, row in selected.iterrows():
                    profile = row.to_dict()
                    profile['source'] = 'original'
                    profile_hash = self.get_profile_hash(profile)
                    used_hashes.add(profile_hash)
                    balanced_data.append(profile)
            else:
                # Include all original profiles
                for _, row in ethnic_group.iterrows():
                    profile = row.to_dict()
                    profile['source'] = 'original'
                    profile_hash = self.get_profile_hash(profile)
                    used_hashes.add(profile_hash)
                    balanced_data.append(profile)
                
                # Synthesize additional profiles
                n_synthetic = target_per_category - len(ethnic_group)
                created_synthetic = 0
                print(f"Creating {n_synthetic} synthetic profiles")
                
                while created_synthetic < n_synthetic:
                    try:
                        synthetic_profile = self.create_synthetic_profile(ethnic_group, used_hashes)
                        balanced_data.append(synthetic_profile)
                        created_synthetic += 1
                        if created_synthetic % 5 == 0:  # Progress update every 5 profiles
                            print(f"Created {created_synthetic}/{n_synthetic} synthetic profiles")
                    except Exception as e:
                        print(f"Warning: Failed to create synthetic profile: {e}")
                        continue
                
                print(f"Completed {created_synthetic}/{n_synthetic} synthetic profiles")
        
        self.augmented_data = balanced_data
        augmented_df = pd.DataFrame(balanced_data)
        self.transformation_summary['augmented_distribution'] = self.get_distribution_stats(augmented_df)
        print(f"\nFinal dataset size: {len(balanced_data)} profiles")
        
        # Print distribution comparison
        print("\nEthnic distribution comparison:")
        original_dist = pd.Series(self.transformation_summary['original_distribution']['ethnic_distribution'])
        augmented_dist = pd.Series(self.transformation_summary['augmented_distribution']['ethnic_distribution'])
        comparison = pd.DataFrame({
            'Original': original_dist,
            'Augmented': augmented_dist
        }).fillna(0)
        print(comparison)

    def save_outputs(self) -> None:
        """Save all output files."""
        # Save augmented dataset as JSON
        with open(self.output_dir / 'augmented_dataset.json', 'w') as f:
            json.dump(self.augmented_data, f, indent=2)
        
        # Save as CSV
        pd.DataFrame(self.augmented_data).to_csv(
            self.output_dir / 'augmented_dataset.csv', index=False
        )
        
        # Add transformation metadata
        self.transformation_summary['metadata'] = {
            'original_file': self.input_path,
            'total_original_profiles': len(self.original_data),
            'total_augmented_profiles': len(self.augmented_data),
            'synthetic_profiles_count': sum(1 for p in self.augmented_data if p['source'] == 'synthetic'),
            'original_profiles_count': sum(1 for p in self.augmented_data if p['source'] == 'original'),
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
        }
        
        # Save transformation summary
        with open(self.output_dir / 'transformation_summary.json', 'w') as f:
            json.dump(dict(self.transformation_summary), f, indent=2)



In [26]:
def main():
    # File paths
    input_path = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/merged_donor_data.json"
    output_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/generated/augmented_dataset"
    
    # Create and run augmenter
    augmenter = DonorDataAugmenter(input_path, output_dir)
    augmenter.load_data()
    augmenter.create_balanced_dataset()
    augmenter.save_outputs()
    
    print("\nData augmentation complete. Files saved to:", output_dir)

if __name__ == "__main__":
    main()


Processing ethnicity: American Indian or Alaska Native, Caucasian
Original profiles: 9
Creating 69 synthetic profiles
Created 5/69 synthetic profiles
Created 10/69 synthetic profiles
Created 15/69 synthetic profiles
Created 20/69 synthetic profiles
Created 25/69 synthetic profiles
Created 30/69 synthetic profiles
Created 35/69 synthetic profiles
Created 40/69 synthetic profiles
Created 45/69 synthetic profiles
Created 50/69 synthetic profiles
Created 55/69 synthetic profiles
Created 60/69 synthetic profiles
Created 65/69 synthetic profiles
Completed 69/69 synthetic profiles

Processing ethnicity: Asian
Original profiles: 219

Processing ethnicity: Asian, Caucasian
Original profiles: 19
Creating 59 synthetic profiles
Created 5/59 synthetic profiles
Created 10/59 synthetic profiles
Created 15/59 synthetic profiles
Created 20/59 synthetic profiles
Created 25/59 synthetic profiles
Created 30/59 synthetic profiles
Created 35/59 synthetic profiles
Created 40/59 synthetic profiles
Created 45

## Checking Quality of the Augmented Dataset

In [27]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from difflib import SequenceMatcher
import re

class DataQualityAnalyzer:
    def __init__(self, file_path):
        """Initialize analyzer with path to augmented dataset."""
        self.file_path = file_path
        with open(file_path, 'r') as f:
            self.data = json.load(f)
        self.df = pd.DataFrame(self.data)
        
    def find_exact_duplicates(self):
        """Find completely identical profiles."""
        duplicates = self.df.duplicated(keep='first')
        return {
            'total_duplicates': duplicates.sum(),
            'duplicate_indices': duplicates[duplicates].index.tolist()
        }
    
    def find_similar_descriptions(self, threshold=0.85):
        """Find profiles with very similar descriptions."""
        similar_pairs = []
        descriptions = self.df['donor_description'].dropna().tolist()
        
        for i in range(len(descriptions)):
            for j in range(i + 1, len(descriptions)):
                similarity = SequenceMatcher(None, 
                                          descriptions[i], 
                                          descriptions[j]).ratio()
                if similarity > threshold:
                    similar_pairs.append({
                        'index1': i,
                        'index2': j,
                        'similarity': similarity,
                        'text1': descriptions[i][:100],
                        'text2': descriptions[j][:100]
                    })
        return similar_pairs
    
    def check_height_weight_validity(self):
        """Check for unrealistic height/weight combinations."""
        issues = []
        
        def parse_height(height):
            try:
                return float(height.replace("'", "."))
            except:
                return None
        
        def parse_weight(weight):
            try:
                return float(weight)
            except:
                return None
        
        # Convert height and weight to numeric
        self.df['height_num'] = self.df['height'].apply(parse_height)
        self.df['weight_num'] = self.df['weight'].apply(parse_weight)
        
        # Check for unrealistic values
        height_issues = self.df[
            (self.df['height_num'] < 4.5) | 
            (self.df['height_num'] > 7.0)
        ].index.tolist()
        
        weight_issues = self.df[
            (self.df['weight_num'] < 80) | 
            (self.df['weight_num'] > 300)
        ].index.tolist()
        
        # Check height-weight ratio
        bmi_issues = self.df[
            (self.df['weight_num'] / ((self.df['height_num'] * 0.3048) ** 2) < 16) |
            (self.df['weight_num'] / ((self.df['height_num'] * 0.3048) ** 2) > 45)
        ].index.tolist()
        
        return {
            'height_issues': height_issues,
            'weight_issues': weight_issues,
            'bmi_issues': bmi_issues
        }
    
    def check_text_quality(self):
        """Check for potential issues in text descriptions."""
        issues = []
        
        # Patterns to check
        patterns = {
            'very_short': lambda x: len(x.split()) < 10,
            'very_long': lambda x: len(x.split()) > 200,
            'repeated_words': lambda x: len(re.findall(r'\b(\w+)(?:\s+\1\b)+', x)) > 0,
            'unusual_punctuation': lambda x: len(re.findall(r'[!?]{2,}', x)) > 0
        }
        
        text_issues = defaultdict(list)
        for idx, desc in enumerate(self.df['donor_description'].dropna()):
            for pattern_name, pattern_func in patterns.items():
                if pattern_func(desc):
                    text_issues[pattern_name].append(idx)
        
        return dict(text_issues)
    
    def check_missing_values(self):
        """Check for missing values in important fields."""
        return {
            'total_missing': self.df.isnull().sum().to_dict(),
            'profiles_with_missing': len(self.df[self.df.isnull().any(axis=1)])
        }
    
    def analyze_synthetic_distribution(self):
        """Analyze distribution of original vs synthetic profiles."""
        synthetic_count = len(self.df[self.df['source'] == 'synthetic'])
        original_count = len(self.df[self.df['source'] == 'original'])
        
        return {
            'synthetic_profiles': synthetic_count,
            'original_profiles': original_count,
            'synthetic_ratio': synthetic_count / (original_count + synthetic_count),
            'distribution_by_ethnicity': self.df.groupby(['ethnic_background', 'source']).size().to_dict()
        }
    
    def generate_full_report(self):
        """Generate comprehensive quality report."""
        report = {
            'exact_duplicates': self.find_exact_duplicates(),
            'similar_descriptions': self.find_similar_descriptions(),
            'physical_issues': self.check_height_weight_validity(),
            'text_issues': self.check_text_quality(),
            'missing_values': self.check_missing_values(),
            'synthetic_analysis': self.analyze_synthetic_distribution()
        }
        
        return report

def print_report(report):
    """Print formatted report."""
    print("\n=== DATA QUALITY REPORT ===\n")
    
    print("EXACT DUPLICATES:")
    print(f"Found {report['exact_duplicates']['total_duplicates']} exact duplicates")
    
    print("\nSIMILAR DESCRIPTIONS:")
    similar_count = len(report['similar_descriptions'])
    print(f"Found {similar_count} pairs of very similar descriptions")
    if similar_count > 0:
        print("\nExample of similar pair:")
        print(f"Text 1: {report['similar_descriptions'][0]['text1']}")
        print(f"Text 2: {report['similar_descriptions'][0]['text2']}")
        print(f"Similarity: {report['similar_descriptions'][0]['similarity']:.2f}")
    
    print("\nPHYSICAL ATTRIBUTE ISSUES:")
    print(f"Height issues: {len(report['physical_issues']['height_issues'])}")
    print(f"Weight issues: {len(report['physical_issues']['weight_issues'])}")
    print(f"BMI issues: {len(report['physical_issues']['bmi_issues'])}")
    
    print("\nTEXT QUALITY ISSUES:")
    for issue, indices in report['text_issues'].items():
        print(f"{issue}: {len(indices)} instances")
    
    print("\nMISSING VALUES:")
    print(f"Profiles with any missing values: {report['missing_values']['profiles_with_missing']}")
    
    print("\nSYNTHETIC VS ORIGINAL:")
    synth = report['synthetic_analysis']
    print(f"Original profiles: {synth['original_profiles']}")
    print(f"Synthetic profiles: {synth['synthetic_profiles']}")
    print(f"Synthetic ratio: {synth['synthetic_ratio']:.2f}")

# Example usage
if __name__ == "__main__":
    analyzer = DataQualityAnalyzer("/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/generated/augmented_dataset/augmented_dataset.json")
    report = analyzer.generate_full_report()
    print_report(report)


=== DATA QUALITY REPORT ===

EXACT DUPLICATES:
Found 3 exact duplicates

SIMILAR DESCRIPTIONS:
Found 1395 pairs of very similar descriptions

Example of similar pair:
Text 1: Caring Military Man. This donor’s proudest moment came in the military, when he was inspecting an ai
Text 2: Characteristically, in general, caring military man. this donor’s proudest moment came in the milita
Similarity: 0.87

PHYSICAL ATTRIBUTE ISSUES:
Height issues: 0
Weight issues: 1
BMI issues: 0

TEXT QUALITY ISSUES:
very_short: 56 instances
unusual_punctuation: 1 instances
repeated_words: 1 instances
very_long: 10 instances

MISSING VALUES:
Profiles with any missing values: 1014

SYNTHETIC VS ORIGINAL:
Original profiles: 455
Synthetic profiles: 559
Synthetic ratio: 0.55
