In [13]:
pip install faker

Collecting faker
  Downloading Faker-27.4.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-27.4.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-27.4.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Credit Score Bias: Adjust the Interest_Rate and Application_Status based on the Credit_Score. Higher credit scores will have lower interest rates and a higher likelihood of approval.
# Geographic Bias: Influence the Promotions field based on the Location. Certain locations will have a higher chance of receiving better promotional offers.
# Increase Sample Size: Update n_samples to 70,000.
# Introduce Missing Data: Use probabilities to assign None values to certain fields at random, representing missing data.

In [62]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Seed every random source this notebook draws from (NumPy's global generator,
# the stdlib `random` module and Faker) so that Restart Kernel -> Run All
# regenerates the same dataset.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker to generate synthetic data
fake = Faker()

# Sample size
n_samples = 70000

# Each make is listed with a model that belongs to it. One index is drawn per
# row and used for both columns, so a row can never combine a make with another
# manufacturer's model (e.g. "Toyota" + "F-150").
vehicle_catalog = [
    ('Toyota', 'Camry'),
    ('Honda', 'Civic'),
    ('Ford', 'F-150'),
    ('Chevrolet', 'Silverado'),
    ('BMW', '3 Series'),
]
vehicle_makes = np.array([make for make, _ in vehicle_catalog])
vehicle_models = np.array([model for _, model in vehicle_catalog])
vehicle_pick = np.random.randint(0, len(vehicle_catalog), size=n_samples)

# Generate synthetic data: one entry per column, each holding n_samples values.
# Note: np.random.randint excludes its upper bound.
data = {
    "User_ID": [fake.uuid4() for _ in range(n_samples)],
    "Age": np.random.randint(20, 60, size=n_samples),
    "Gender": np.random.choice(['Male', 'Female'], size=n_samples),
    "Income": np.random.randint(30000, 120000, size=n_samples),
    "Employment_Status": np.random.choice(['Employed', 'Self-Employed', 'Unemployed'], size=n_samples),
    "Credit_Score": np.random.randint(300, 850, size=n_samples),
    "Location": [fake.city() for _ in range(n_samples)],
    "Vehicle_Type": np.random.choice(['New', 'Used'], size=n_samples),
    "Vehicle_Make": vehicle_makes[vehicle_pick],
    "Vehicle_Model": vehicle_models[vehicle_pick],
    "Vehicle_Year": np.random.randint(2005, 2024, size=n_samples),
    "Vehicle_Mileage": np.random.randint(0, 150000, size=n_samples),
    "Loan_Amount": np.random.randint(5000, 50000, size=n_samples),
    "Down_Payment": np.random.randint(1000, 10000, size=n_samples),
    "Loan_Tenure_Years": np.random.choice([3, 4, 5, 6, 7], size=n_samples),
    "Interest_Rate": np.random.uniform(1.9, 6.5, size=n_samples).round(2),
    "Application_Status": np.random.choice(['Approved', 'Rejected', 'Pending'], size=n_samples),
    "Session_Duration_Minutes": np.random.randint(5, 60, size=n_samples),
    "Number_of_Interactions": np.random.randint(10, 100, size=n_samples),
    "Notifications_Responded": np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3]),
    "Support_Queries": np.random.choice([0, 1, 2, 3], size=n_samples, p=[0.5, 0.3, 0.15, 0.05]),
    "Application_Submitted": np.random.choice([True, False], size=n_samples, p=[0.8, 0.2])
}

# Category vocabularies for the additional demographic, device and app fields
education_levels = ['High School', 'Associate Degree', 'Bachelor’s Degree', 'Master’s Degree', 'Doctorate']
marital_statuses = ['Single', 'Married', 'Divorced', 'Widowed']
device_types = ['iPhone', 'Android', 'Windows Phone']
os_versions = ['iOS 15', 'iOS 14', 'Android 11', 'Android 10', 'Windows 10 Mobile']
app_versions = ['1.0', '1.1', '1.2']
network_types = ['Wi-Fi', '4G', '5G']
dealer_info = ['Dealer A', 'Dealer B', 'Dealer C', 'Dealer D']
promotions = ['0% APR', '$1000 Cashback', 'No Payments for 90 Days', 'Low Down Payment']
event_sequences = ['Application Start', 'Vehicle Selection', 'Loan Calculator', 'Document Upload', 'Credit Check', 'Approval']

# Session timestamps: the end time is derived from the start time plus the
# already-generated Session_Duration_Minutes. Sampling the end independently
# could place it before the start and would contradict the duration column.
session_starts = [fake.date_time_this_year() for _ in range(n_samples)]
session_ends = (
    pd.to_datetime(session_starts)
    + pd.to_timedelta(data["Session_Duration_Minutes"], unit="min")
).to_numpy()

# Update the existing data dictionary with new fields
data.update({
    "Education_Level": np.random.choice(education_levels, size=n_samples),
    "Marital_Status": np.random.choice(marital_statuses, size=n_samples),
    "Existing_Debt": np.random.randint(0, 100000, size=n_samples),
    "Monthly_Expenses": np.random.randint(1000, 10000, size=n_samples),
    "Savings": np.random.randint(1000, 50000, size=n_samples),
    "Previous_Vehicle_Ownership": np.random.choice([True, False], size=n_samples, p=[0.7, 0.3]),
    # None marks applicants without a trade-in (70% of rows)
    "Trade_In_Details": np.random.choice([None, 'Old Car Trade-In'], size=n_samples, p=[0.7, 0.3]),
    "Session_Start_Time": session_starts,
    "Session_End_Time": session_ends,
    # Each path is a random-length (3..all), randomly ordered selection of distinct events
    "Navigation_Paths": [random.sample(event_sequences, k=random.randint(3, len(event_sequences))) for _ in range(n_samples)],
    "Device_Type": np.random.choice(device_types, size=n_samples),
    "OS_Version": np.random.choice(os_versions, size=n_samples),
    "App_Version": np.random.choice(app_versions, size=n_samples),
    "Network_Type": np.random.choice(network_types, size=n_samples),
    "Dealer_Info": np.random.choice(dealer_info, size=n_samples),
    "Promotions": np.random.choice(promotions, size=n_samples),
    "Regulatory_Compliance": np.random.choice(['Compliant', 'Non-Compliant'], size=n_samples, p=[0.95, 0.05]),
    "Consent_Provided": np.random.choice([True, False], size=n_samples, p=[0.98, 0.02]),
    "User_Type": np.random.choice(['New', 'Returning'], size=n_samples),
    "Behavioral_Segment": np.random.choice(['Low Engagement', 'Medium Engagement', 'High Engagement'], size=n_samples),
    # np.random.randint excludes the upper bound, so 6 is required for the top
    # rating of 5 to occur (assumes a 1-5 rating scale -- TODO confirm).
    "User_Feedback_Rating": np.random.randint(1, 6, size=n_samples),
    # None marks users who reported no issue (70% of rows)
    "Common_Issues_Faced": np.random.choice([None, 'Document Upload Failed', 'Credit Check Issue', 'App Crash'], size=n_samples, p=[0.7, 0.1, 0.1, 0.1]),
    "User_Satisfaction": np.random.choice(['Very Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied', 'Very Dissatisfied'], size=n_samples)
})

# Vocabulary of interaction events and of the app screens a user can visit
interaction_events = ['Click', 'Tap', 'Swipe', 'Form Entry']
screens = ['Home', 'Loan Calculator', 'Vehicle Selection', 'Document Upload', 'Credit Check', 'Approval']

# Integer-valued engagement columns as (column name, low, high); `high` is
# exclusive, as in np.random.randint. Listed in the order they are drawn.
count_ranges = [
    ("Frequency_of_App_Usage", 1, 30),  # Frequency of app usage in the past month
    ("Clicks", 1, 50),
    ("Taps", 1, 50),
    ("Swipes", 1, 50),
    ("Form_Entries", 1, 20),
    ("Time_Spent_on_Home_Screen_Minutes", 1, 10),
    ("Time_Spent_on_Loan_Calculator_Minutes", 1, 15),
    ("Time_Spent_on_Vehicle_Selection_Minutes", 1, 20),
    ("Time_Spent_on_Document_Upload_Minutes", 1, 10),
    ("Time_Spent_on_Credit_Check_Minutes", 1, 5),
    ("Time_Spent_on_Approval_Screen_Minutes", 1, 5),
]
engagement_columns = {
    col: np.random.randint(lo, hi, size=n_samples)
    for col, lo, hi in count_ranges
}

# A path is a random-length (3..all), randomly ordered selection of distinct screens
engagement_columns["Common_Paths"] = [
    random.sample(screens, k=random.randint(3, len(screens)))
    for _ in range(n_samples)
]

# Every screen is a drop-off point with probability 0.1; None (probability 0.4)
# marks users who completed all steps.
drop_off_outcomes = screens + [None]
drop_off_weights = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.4]
engagement_columns["Drop_Off_Point"] = np.random.choice(
    drop_off_outcomes, size=n_samples, p=drop_off_weights
)

engagement_columns["Comparison_of_Loan_Options"] = np.random.choice(
    [True, False], size=n_samples, p=[0.6, 0.4]
)

# Merge the new columns into the dataset dictionary
data.update(engagement_columns)

# Assemble all generated columns into a single table
df = pd.DataFrame(data)

# The two experiment arms, and the drop-off screens that make a user eligible
treatments = ['Ads', 'No-Ads']
eligible_drop_offs = ['Approval', 'Document Upload', 'Credit Check']

# Boolean mask of users who dropped off at one of the eligible screens
condition = df['Drop_Off_Point'].isin(eligible_drop_offs)

# Start with no treatment for anyone, then draw one random arm per eligible
# user; all other rows keep None.
treatment_column = pd.Series(None, index=df.index, dtype=object)
treatment_column.loc[condition] = np.random.choice(treatments, size=condition.sum())
df['Treatment_Assignment'] = treatment_column

# Write the final table to CSV, without the row index
csv_file_path = "Synthetic_Auto_Loan_Application_Data_With_Treatment.csv"
df.to_csv(csv_file_path, index=False)

