In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_identity_theft_data(start_date='1999-01-01', end_date='2024-09-28', num_records=10000, seed=None):
    """Generate a synthetic table of identity-theft reports.

    Each row gets a random report date drawn uniformly from the daily range
    ``start_date``..``end_date`` (inclusive), plus a fraud type, state, age
    group and gender drawn from fixed weighted categories. ``Cases`` follows
    a linear yearly trend with a sinusoidal monthly term and Gaussian noise;
    ``Monetary_Loss`` is log-normal with roughly 5% of values blanked out.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Bounds passed to ``pd.date_range`` to build the pool of report dates.
    num_records : int
        Number of rows to generate.
    seed : int, optional
        Seed for the random generator. With the default ``None`` every call
        produces a different dataset; pass an integer to get the same
        dataset on every call.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``Reported_Date`` (the original, pre-sort index labels
        are kept) with columns ``Reported_Date``, ``Type``, ``State``,
        ``Age_Group``, ``Gender``, ``Cases`` (int, >= 1) and
        ``Monetary_Loss`` (float, may be NaN).
    """
    # A dedicated Generator keeps the draws independent of NumPy's global
    # random state and makes the output repeatable when a seed is supplied.
    rng = np.random.default_rng(seed)

    # Every calendar day in the requested range is a candidate report date.
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')

    # States and their approximate populations (presumably in millions);
    # only the ratios matter because they are normalised into probabilities.
    states = {
        'CA': 39.5, 'TX': 29.1, 'FL': 21.5, 'NY': 19.5, 'PA': 12.8, 'IL': 12.7,
        'OH': 11.7, 'GA': 10.6, 'NC': 10.4, 'MI': 10.0
    }
    # Compute the total once instead of once per state.
    total_population = sum(states.values())
    state_weights = [pop / total_population for pop in states.values()]

    fraud_types = [
        'Credit Card Fraud', 'Bank Fraud', 'Phone or Utilities Fraud',
        'Employment Fraud', 'Government Documents or Benefits Fraud',
        'Loan or Lease Fraud', 'Tax-Related Fraud'
    ]
    fraud_weights = [0.3, 0.2, 0.15, 0.1, 0.1, 0.1, 0.05]

    # Draw the categorical columns and the report dates.
    data = {
        'Reported_Date': rng.choice(date_range.to_numpy(), num_records),
        'Type': rng.choice(fraud_types, num_records, p=fraud_weights),
        'State': rng.choice(list(states.keys()), num_records, p=state_weights),
        'Age_Group': rng.choice(['18-29', '30-39', '40-49', '50-59', '60+'], num_records, p=[0.2, 0.25, 0.25, 0.2, 0.1]),
        'Gender': rng.choice(['Male', 'Female', 'Other'], num_records, p=[0.48, 0.51, 0.01])
    }

    # Create the frame in chronological order.
    df = pd.DataFrame(data).sort_values('Reported_Date')

    # Year and month only drive the trend and seasonality, so they stay as
    # local Series rather than becoming temporary columns.
    year = df['Reported_Date'].dt.year
    month = df['Reported_Date'].dt.month

    # Base number of cases grows by 30 for every year after the earliest one.
    cases = 50 + (year - year.min()) * 30
    # Seasonal effect: one sine cycle per 12 months, amplitude 20.
    cases = cases + np.sin(month * 2 * np.pi / 12) * 20
    # Random noise (standard deviation 10).
    cases = cases + rng.normal(0, 10, len(df))
    # Ensure Cases are always positive integers.
    df['Cases'] = cases.clip(lower=1).round().astype(int)

    # Monetary loss follows a log-normal distribution, rounded to cents.
    df['Monetary_Loss'] = rng.lognormal(mean=8, sigma=1, size=len(df)).round(2)

    # Blank out about 5% of the losses to simulate missing values.
    mask = rng.random(len(df)) < 0.05
    df.loc[mask, 'Monetary_Loss'] = np.nan

    return df

# Destination of the generated dataset. Kept in one variable so the
# confirmation message below always names the file that was actually written.
output_path = '../identity_theft_cases2.csv'

# Generate the data
df = generate_identity_theft_data()

# Save to CSV; the index is omitted from the file
df.to_csv(output_path, index=False)

print(f"Realistic synthetic data has been generated and saved to '{output_path}'")

# Display the first few rows of the generated data
print(df.head())

# Display summary statistics
print(df.describe())

Realistic synthetic data has been generated and saved to 'identity_theft_cases.csv'
     Reported_Date                 Type State Age_Group  Gender  Cases  \
1797    1999-01-01    Credit Card Fraud    TX     30-39    Male     57   
7949    1999-01-01    Credit Card Fraud    CA     18-29    Male     55   
1395    1999-01-03  Loan or Lease Fraud    TX     30-39    Male     67   
9408    1999-01-03           Bank Fraud    MI     40-49  Female     67   
7229    1999-01-03    Tax-Related Fraud    OH       60+    Male     80   

      Monetary_Loss  
1797        5095.46  
7949        3680.11  
1395         809.52  
9408        4863.04  
7229        5162.30  
                       Reported_Date         Cases  Monetary_Loss
count                          10000  10000.000000    9527.000000
mean   2011-12-17 14:38:23.999999744    424.126700    4912.680091
min              1999-01-01 00:00:00     10.000000      57.170000
25%              2005-08-18 00:00:00    233.000000    1490.820000
50%      