In [1]:
# Cell 1: Test privacy analyzer
# Sets up the import path, loads the loan dataset, and creates the analyzer
# instance that the later cells reuse.
import sys

# Make '../src' importable so the privacy_analyzer import below can resolve
# (presumably that is where the module lives -- confirm against the repo layout).
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from privacy_analyzer import PrivacyAnalyzer
import pandas as pd

# Load our dataset (path is relative to the notebook's working directory)
loan_data = pd.read_csv('../data/loan_dataset_with_gender_bias.csv')

# Create privacy analyzer
privacy_analyzer = PrivacyAnalyzer()

print("Testing Privacy Analyzer on loan dataset...")
print(f"Dataset shape: {loan_data.shape}")
# Bare last expression so the notebook renders the first rows as a rich table.
loan_data.head()


Testing Privacy Analyzer on loan dataset...
Dataset shape: (1000, 5)


Unnamed: 0,age,income,credit_score,gender,approved
0,40.0,70990.0,603.0,Female,0
1,34.0,63870.0,640.0,Female,0
2,41.0,50894.0,595.0,Male,0
3,50.0,40296.0,628.0,Female,0
4,33.0,60473.0,517.0,Male,0


In [2]:
# Cell 2: Run privacy analysis
# analyze_dataset prints its own report and returns a mapping; the summary
# below reads three entries from that mapping.
privacy_results = privacy_analyzer.analyze_dataset(loan_data)

print("\nPrivacy Analysis Results:")
# (printed label, result key) pairs, emitted in this order, one line each.
for label, key in (
    ("PII detected", 'pii_detected'),
    ("Quasi-identifiers", 'quasi_identifiers'),
    ("Uniqueness risks", 'uniqueness_risk'),
):
    print(f"{label}: {privacy_results[key]}")


=== PRIVACY RISK ANALYSIS ===

--- PII Detection ---
✅ No obvious PII detected

--- Quasi-Identifier Detection ---
⚠️  Age quasi-identifiers: ['age']
⚠️  Demographic quasi-identifiers: ['gender']
⚠️  Professional quasi-identifiers: ['income']

--- Uniqueness Analysis ---
⚠️  High uniqueness risk in 'income': 98.30%
⚠️  High re-identification risk from column combinations: 100.00%

--- Privacy Recommendations ---
🔒 Apply k-anonymity or l-diversity to quasi-identifiers
🔒 Consider generalization/suppression of sensitive attributes
🔒 Reduce granularity of highly unique columns
🔒 Consider data aggregation or binning
🔒 Implement differential privacy for model training
🔒 Use secure multi-party computation for sensitive data
🔒 Regular privacy audits and monitoring

Privacy Analysis Results:
PII detected: {}
Quasi-identifiers: ['age', 'gender', 'income']
Uniqueness risks: {'age': 0.047, 'income': 0.983, 'credit_score': 0.287, 'gender': 0.002, 'approved': 0.002, 'combination_2-3_cols': 1.0}


In [3]:
# Cell 3: Test with synthetic PII data
# Hand-written three-row frame that deliberately includes PII-style columns
# (name, email, zipcode) so the analyzer has something to flag.
pii_columns = ['name', 'email', 'age', 'zipcode', 'income', 'approved']
pii_rows = [
    ('John Smith', 'john@email.com', 25, '12345', 50000, 1),
    ('Mary Johnson', 'mary@email.com', 30, '67890', 60000, 0),
    ('James Brown', 'james@email.com', 35, '11111', 70000, 1),
]
synthetic_pii_data = pd.DataFrame(pii_rows, columns=pii_columns)

print("Testing with synthetic PII data:")
pii_results = privacy_analyzer.analyze_dataset(synthetic_pii_data)


Testing with synthetic PII data:
=== PRIVACY RISK ANALYSIS ===

--- PII Detection ---
⚠️  Column 'name': potential_first_names: 3, potential_last_names: 3
⚠️  Column 'email': email: 3 instances, potential_first_names: 3
⚠️  Column 'zipcode': zipcode: 3 instances

--- Quasi-Identifier Detection ---
⚠️  Age quasi-identifiers: ['age']
⚠️  Location quasi-identifiers: ['zipcode']
⚠️  Professional quasi-identifiers: ['income']

--- Uniqueness Analysis ---
⚠️  High uniqueness risk in 'name': 100.00%
⚠️  High uniqueness risk in 'email': 100.00%
⚠️  High uniqueness risk in 'age': 100.00%
⚠️  High uniqueness risk in 'zipcode': 100.00%
⚠️  High uniqueness risk in 'income': 100.00%
⚠️  Medium uniqueness risk in 'approved': 66.67%
⚠️  High re-identification risk from column combinations: 100.00%

--- Privacy Recommendations ---
🔒 Remove or encrypt detected PII before model training
🔒 Consider data anonymization techniques
🔒 Apply k-anonymity or l-diversity to quasi-identifiers
🔒 Consider generaliza

In [4]:
# Cell 4: Generate privacy report
# Build the report text for the loan dataset and echo it in the notebook.
report = privacy_analyzer.generate_privacy_report(loan_data)
print("\nPrivacy Report:")
print(report)

# Save report
import os
os.makedirs('../reports', exist_ok=True)
# The report text contains emoji markers, so write it as UTF-8 explicitly.
# Relying on the platform default encoding (e.g. cp1252 on Windows) can raise
# UnicodeEncodeError on those characters.
with open('../reports/privacy_report.md', 'w', encoding='utf-8') as f:
    f.write(report)
print("\nReport saved to reports/privacy_report.md")


=== PRIVACY RISK ANALYSIS ===

--- PII Detection ---
✅ No obvious PII detected

--- Quasi-Identifier Detection ---
⚠️  Age quasi-identifiers: ['age']
⚠️  Demographic quasi-identifiers: ['gender']
⚠️  Professional quasi-identifiers: ['income']

--- Uniqueness Analysis ---
⚠️  High uniqueness risk in 'income': 98.30%
⚠️  High re-identification risk from column combinations: 100.00%

--- Privacy Recommendations ---
🔒 Apply k-anonymity or l-diversity to quasi-identifiers
🔒 Consider generalization/suppression of sensitive attributes
🔒 Reduce granularity of highly unique columns
🔒 Consider data aggregation or binning
🔒 Implement differential privacy for model training
🔒 Use secure multi-party computation for sensitive data
🔒 Regular privacy audits and monitoring

Privacy Report:
# Privacy Risk Assessment Report

## Dataset Privacy Analysis

### ✅ No PII Detected

### ⚠️ Quasi-Identifiers Found:
- age
- gender
- income

### Recommendations:
- 🔒 Apply k-anonymity or l-diversity to quasi-identi