In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random

In [2]:
# random seed for same data generation
np.random.seed(42)
random.seed(42)

In [3]:
#Generating Farmers sample size 20,000
n_sample=20000
#Countries-East African Countries
countries=['Kenya','Tanzania','Uganda','Rwanda','Zambia']
country_weights=[0.35,0.30,0.20,0.10,0.05] #Country weights-Distribution by population

In [4]:
#CROP TYPES IN EAST AFRICA
crop_types=['Maize','Beans','Coffee','Tea','Rice','Cassava','tobacco','Banana','sunflower','sugarcane']

In [5]:
#Mobile Money Providers by country
mobile_providers={
    'Kenya':['MPESA','Airtel Money'],
    'Tanzania':['MPESA','YAS','Airtel Money'],
    'Uganda':['MTN Mobile Money','Airtel Money'],
    'Rwanda':['MTN Mobile Money','Airtel Money'],
    'Zambia':['MTN Mobile Money','Airtel Money']
}

In [6]:
#Demographic data generation
data={}
data['farmer_id']=[f"FARMER_{str(i).zfill(6)}"for i in range(1,n_sample+1)]

In [9]:
#Country Assignment
data['country'] = np.random.choice(countries, n_sample, p=country_weights)

In [10]:
#Age Distribution(Normal distribution mean=42,std=12)
data['age']=np.clip(
    np.random.normal(42, 12, n_sample),  # Generate ages
    18,  # Minimum age 
    80   # Maximum age 
).astype(int)  # Convert to whole numbers

In [11]:
#Gender-Assumption 60% Male and 40% Female
data['gender'] = np.random.choice(['Male', 'Female'], n_sample, p=[0.6, 0.4])

In [12]:
#Education Levels-Assuption most have primary education
education_levels=['no formal','primary','secondary','tertiary']
education_weights=[0.20,0.50,0.25,0.05]
data['education_level']=np.random.choice(
    education_levels, n_sample, p=education_weights
)


In [13]:
#Household Size (Poisson distribution: lambda=5)
data['household_size'] = np.random.poisson(5, n_sample)

In [14]:
#Marital Status (85% married )
data['marital_status'] = np.random.choice(
    ['Single', 'Married', 'Widowed', 'Divorced'],
    n_sample,
    p=[0.10, 0.85, 0.03, 0.02]
)

In [15]:
#Number of Dependents (children + elderly)
data['dependents'] = np.clip(
    data['household_size'] - 2,  # Typically household_size - 2 adults
    0,  # Minimum 0 dependents
    10  # Maximum 10 dependents
)

In [16]:
 #Mobile Money Provider (based on country)
data['mobile_provider'] = [
    random.choice(mobile_providers[country]) 
    for country in data['country']
]

In [17]:
#Account Age (months active)-Normal Distributed
data['months_as_mobile_user'] = np.clip(
    np.random.lognormal(3, 0.8, n_sample),  # ln(mean)=3, ln(std)=0.8
    6,    # Minimum 6 months (requirement for loans)
    120   # Maximum 10 years
).astype(int)

In [18]:
#Has Smartphone (impacts data usage)
# Assumption->Younger, more educated more likely to have smartphone
smartphone_prob = (
    0.1 +  # Base probability 10%
    0.3 * (data['age'] < 35) +  # +30% if under 35
    0.2 * (data['education_level'] == 'Secondary') +  # +20% if secondary
    0.4 * (data['education_level'] == 'Tertiary')     # +40% if tertiary
)
data['has_smartphone'] = (np.random.random(n_sample) < smartphone_prob).astype(int)

In [19]:
#Transaction Frequency (transactions per month)
# Log-normal: most do 10-30 transactions/month
data['avg_monthly_transactions'] = np.clip(
    np.random.lognormal(3, 0.5, n_sample),
    5,    # Minimum 5 transactions/month
    200   # Maximum 200 (very active users)
)

In [20]:
#Total Transaction Volume (USD equivalent, last 6 months)
# Depending on: household size, education, age
base_volume = np.random.lognormal(6, 1.5, n_sample)  # Base spending
volume_multiplier = (
    1.0 +
    0.1 * (data['household_size'] / 5) +  # Larger families spend more
    0.15 * (data['education_level'] == 'Secondary') +
    0.25 * (data['education_level'] == 'Tertiary')
)
data['total_transaction_volume_6mo_usd'] = base_volume * volume_multiplier


In [21]:
#Average Account Balance (USD)
# Related to transaction volume but lower
data['avg_account_balance_usd'] = np.clip(
    data['total_transaction_volume_6mo_usd'] * np.random.uniform(0.05, 0.20, n_sample),
    5,     # Minimum $5
    5000   # Maximum $5,000
)

In [22]:
#Savings Behavior (M-Shwari style - binary)
# Assmp-People with higher balances more likely to save
savings_prob = np.clip(
    0.3 + (data['avg_account_balance_usd'] / 1000) * 0.5,
    0, 1
)
data['has_mobile_savings'] = (np.random.random(n_sample) < savings_prob).astype(int)

In [23]:
#Previous Loan Repayment History (binary: 1=repaid, 0=defaulted/no history)
# 85% have good repayment if they borrowed before
# Older, educated, savers more likely to have repaid
repayment_prob = (
    0.60 +  # Base 60% repayment
    0.10 * (data['age'] > 30) +  # +10% if older
    0.10 * (data['education_level'] != 'No Formal') +  # +10% if educated
    0.05 * data['has_mobile_savings']  # +5% if they save
)
data['previous_loan_repaid'] = (
    np.random.random(n_sample) < np.clip(repayment_prob, 0, 1)
).astype(int)

In [24]:
#Airtime Purchase Frequency (per month)
data['airtime_purchases_monthly'] = np.clip(
    np.random.poisson(8, n_sample),  # Most buy airtime 8 times/month
    1, 30
)

In [25]:
#Bill Payments Consistency (0-1 score)
# Regular bill payers are lower risk
data['bill_payment_consistency'] = np.random.beta(6, 2, n_sample)
# Beta(6,2) gives distribution skewed toward high values (0.6-0.9)

In [27]:
# Peer-to-Peer Transfer Activity (monthly count)
data['p2p_transfers_monthly'] = np.clip(
    np.random.poisson(12, n_sample),
    0, 100
    
)

In [29]:
#Farm Size (acres) 
base_farm_size = np.random.lognormal(0.5, 0.8, n_sample)
# Injecting extreme outliers (commercial farms)
outlier_mask = np.random.random(n_sample) < 0.03
base_farm_size[outlier_mask] = np.random.uniform(100, 500, outlier_mask.sum())
data['farm_size_acres'] = np.clip(base_farm_size, 0.25, 1000)

In [30]:
#Farming Experience (years) 
raw_experience = data['age'] - 20 + np.random.normal(0, 5, n_sample)
# Introduce data quality issues: some experience > age (data entry errors!)
error_mask = np.random.random(n_sample) < 0.02
raw_experience[error_mask] = data['age'][error_mask] + np.random.uniform(5, 15, error_mask.sum())
data['farming_experience_years'] = np.clip(raw_experience, 2, 70).astype(int)

In [31]:
# 5.3 Primary Crop - ORDINAL-LIKE with implicit value hierarchy
data['primary_crop'] = np.random.choice(
    crop_types,
    n_sample,
    p=[0.35, 0.15, 0.10, 0.08, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03]
)

In [33]:
#Crop Diversity - integer count with zero-inflation
base_diversity = np.random.poisson(2.5, n_sample)
zero_mask = np.random.random(n_sample) < 0.05
base_diversity[zero_mask] = 0
data['crop_diversity_count'] = np.clip(base_diversity, 0, 10)

In [34]:
#Livestock counts - ZERO-INFLATED distributions
data['cattle_count'] = np.random.poisson(1.5, n_sample)
data['goats_count'] = np.random.poisson(3, n_sample)
data['chickens_count'] = np.random.poisson(8, n_sample)


In [35]:
livestock_outliers = np.random.random(n_sample) < 0.01
data['cattle_count'][livestock_outliers] = np.random.randint(50, 200, livestock_outliers.sum())

In [36]:
#Total Livestock Units
# Some records have calculation mistakes from field officers
correct_tlu = (
    data['cattle_count'] * 0.7 +
    data['goats_count'] * 0.1 +
    data['chickens_count'] * 0.01
)
calculation_errors = np.random.random(n_sample) < 0.03
incorrect_tlu = correct_tlu * np.random.uniform(1.5, 3.0, n_sample)
data['total_livestock_tlu'] = np.where(calculation_errors, incorrect_tlu, correct_tlu)


In [38]:
# Land Ownership 
ownership_types = ['Owned with Title', 'Owned without Title', 'Rented', 'Communal']
base_ownership = np.random.choice(ownership_types, n_sample, p=[0.20, 0.45, 0.25, 0.10])

typo_mask = np.random.random(n_sample) < 0.02
typo_variants = ['owned with title', 'Rented Land', 'Communal Land', 'Own no title']
base_ownership[typo_mask] = np.random.choice(typo_variants, typo_mask.sum())
data['land_ownership'] = base_ownership

#Irrigation Access 
irrigation_prob = (
    0.15 +
    0.20 * (data['farm_size_acres'] > 2) +
    0.10 * (data['education_level'] == 'Secondary') +
    0.15 * (data['education_level'] == 'Tertiary')
)
data['has_irrigation'] = (
    np.random.random(n_sample) < np.clip(irrigation_prob, 0, 1)
).astype(int)

# Improved Seeds Usage
improved_seed_prob = (
    0.40 +
    0.20 * (data['education_level'] != 'No Formal') +
    0.10 * data['has_irrigation']
)
data['uses_improved_seeds'] = (
    np.random.random(n_sample) < np.clip(improved_seed_prob, 0, 1)
).astype(int)

# Fertilizer Usage 
base_fertilizer = np.random.lognormal(3, 1, n_sample)
# Zero-inflation: many use no fertilizer
zero_fert = np.random.random(n_sample) < 0.25
base_fertilizer[zero_fert] = 0
# Extreme users (commercial farmers)
extreme_fert = np.random.random(n_sample) < 0.02
base_fertilizer[extreme_fert] = np.random.uniform(300, 1000, extreme_fert.sum())
data['fertilizer_kg_per_acre'] = base_fertilizer

# Previous Yield 
base_yield = data['farm_size_acres'] * np.random.uniform(8, 15, n_sample)
yield_multiplier = (
    1.0 +
    0.30 * data['has_irrigation'] +
    0.20 * data['uses_improved_seeds'] +
    0.15 * (data['fertilizer_kg_per_acre'] > 50)
)
calculated_yield = base_yield * yield_multiplier
# Introduce negative values (data entry errors!)
error_yield = np.random.random(n_sample) < 0.01
calculated_yield[error_yield] = np.random.uniform(-50, -5, error_yield.sum())
data['previous_yield_bags'] = calculated_yield

# Binary features with various distributions
data['has_off_farm_income'] = np.random.choice([0, 1], n_sample, p=[0.45, 0.55])
data['received_extension_services'] = np.random.choice([0, 1], n_sample, p=[0.60, 0.40])

coop_prob = (
    0.30 +
    0.20 * (data['farming_experience_years'] > 5) +
    0.15 * (data['education_level'] != 'No Formal')
)
data['cooperative_member'] = (
    np.random.random(n_sample) < np.clip(coop_prob, 0, 1)
).astype(int)

market_linkage_prob = (
    0.25 * data['cooperative_member'] +
    0.15 * (data['farm_size_acres'] > 1)
)
data['has_buyer_contract'] = (
    np.random.random(n_sample) < np.clip(market_linkage_prob, 0, 1)
).astype(int)

In [40]:
#NDVI Current (Normalized Difference Vegetation Index)
# Measures crop health: 0.2-0.9, higher = healthier crops
# Correlated with irrigation, fertilizer use
base_ndvi = np.random.uniform(0.3, 0.7, n_sample)
ndvi_boost = (
    0.15 * data['has_irrigation'] +
    0.10 * data['uses_improved_seeds'] +
    0.05 * (data['fertilizer_kg_per_acre'] > 50)
)
data['ndvi_current'] = np.clip(base_ndvi + ndvi_boost, 0.2, 0.9)



In [41]:
#NDVI Historical Average (past 3 seasons)
# Slightly lower than current (random variation)
data['ndvi_historical_avg'] = np.clip(
    data['ndvi_current'] + np.random.normal(-0.05, 0.1, n_sample),
    0.2, 0.9
)


In [42]:
#NDVI Trend (improvement or decline)
data['ndvi_trend'] = data['ndvi_current'] - data['ndvi_historical_avg']
# Positive = improving, Negative = declining


In [43]:
 #Farm Area Visible from Satellite (% of claimed area)
# Some farms obscured by clouds, trees
data['satellite_visible_area_pct'] = np.random.uniform(70, 100, n_sample)

# Soil Moisture Index (0-1 scale from satellite)
# Higher in rainy season, with irrigation
base_moisture = np.random.beta(4, 4, n_sample)  # Centers around 0.5
moisture_boost = 0.2 * data['has_irrigation']
data['soil_moisture_index'] = np.clip(base_moisture + moisture_boost, 0, 1)

# Annual Rainfall (mm) - by region
# Kenya/Uganda: 800-1200mm, Tanzania: 600-1000mm, Rwanda: 1000-1400mm
rainfall_by_country = {
    'Kenya': (800, 1200),
    'Tanzania': (600, 1000),
    'Uganda': (900, 1300),
    'Rwanda': (1000, 1400),
    'Zambia': (700, 1100)
}
data['annual_rainfall_mm'] = [
    np.random.uniform(
        rainfall_by_country[country][0],
        rainfall_by_country[country][1]
    ) for country in data['country']
]

# Distance to Nearest Market (km)
# Log-normal: most farms are 5-15km from market
data['distance_to_market_km'] = np.clip(
    np.random.lognormal(2, 0.8, n_sample),
    0.5,  # Minimum 0.5km
    100   # Maximum 100km (very remote)
)

# Distance to Nearest Input Dealer (km)
# Similar to market distance but slightly closer on average
data['distance_to_input_dealer_km'] = np.clip(
    data['distance_to_market_km'] * np.random.uniform(0.7, 1.3, n_sample),
    0.5, 100
)

# Distance to Water Source (km)
data['distance_to_water_km'] = np.clip(
    np.random.lognormal(0.5, 0.8, n_sample),
    0.1,  # 100m minimum
    20    # 20km maximum
)

# Road Quality to Farm (1-5 scale: 1=poor, 5=excellent)
# Better roads if closer to market
road_quality_prob = 5 - (data['distance_to_market_km'] / 25)
data['road_quality_score'] = np.clip(
    road_quality_prob + np.random.normal(0, 0.8, n_sample),
    1, 5
).astype(int)

# Elevation (meters above sea level)
# Affects crops grown, climate
data['elevation_masl'] = np.clip(
    np.random.normal(1200, 300, n_sample),
    500,   # Minimum 500m (lowlands)
    2500   # Maximum 2500m (highlands)
).astype(int)

# Slope of Land (degrees)
data['land_slope_degrees'] = np.clip(
    np.random.exponential(5, n_sample),  # Most land is flat/gentle
    0, 30  # Maximum 30 degrees
)


In [44]:
# Savings Group Membership (chama/table banking)
savings_group_prob = (
    0.35 +  # Base 35%
    0.20 * (data['gender'] == 'Female') +  # Women more likely
    0.10 * data['cooperative_member']
)
data['savings_group_member'] = (
    np.random.random(n_sample) < np.clip(savings_group_prob, 0, 1)
).astype(int)

# Community Leadership Role (village elder, group chair, etc.)
leadership_prob = (
    0.05 +  # Base 5%
    0.10 * (data['age'] > 45) +  # Elders more likely
    0.05 * (data['farming_experience_years'] > 15) +
    0.05 * (data['education_level'] == 'Secondary') +
    0.10 * (data['education_level'] == 'Tertiary')
)
data['has_leadership_role'] = (
    np.random.random(n_sample) < np.clip(leadership_prob, 0, 1)
).astype(int)

# Number of Farmer References/Guarantors Available
# More established farmers have more references
data['available_references'] = np.clip(
    np.random.poisson(
        2 + data['farming_experience_years'] * 0.1
    , n_sample),
    0, 10
)

# Phone Usage Intensity (calls + SMS per day)
data['daily_phone_activity'] = np.clip(
    np.random.lognormal(2, 1, n_sample),
    5,    # Minimum 5 per day
    100   # Maximum 100 per day
)

# Internet/Data Usage (MB per month)
# Correlated with smartphone ownership
data['monthly_data_usage_mb'] = (
    data['has_smartphone'] * 
    np.clip(np.random.lognormal(6, 1.5, n_sample), 100, 10000)
)

# Input Supplier Trade Credit History (binary)
# Have they received inputs on credit before?
trade_credit_prob = (
    0.30 +
    0.20 * data['cooperative_member'] +
    0.15 * (data['farming_experience_years'] > 5)
)
data['has_trade_credit_history'] = (
    np.random.random(n_sample) < np.clip(trade_credit_prob, 0, 1)
).astype(int)

# Agricultural Insurance Coverage (binary)
insurance_prob = (
    0.10 +  # Base 10% (low penetration)
    0.15 * (data['farm_size_acres'] > 2) +
    0.10 * data['cooperative_member'] +
    0.05 * (data['education_level'] != 'No Formal')
)
data['has_crop_insurance'] = (
    np.random.random(n_sample) < np.clip(insurance_prob, 0, 1)
).astype(int)

# Extension Officer Visit Frequency (visits per year)
data['extension_visits_annual'] = (
    data['received_extension_services'] * 
    np.random.poisson(4, n_sample)  # Average 4 visits if they have access
)

# Years in Current Location (residential stability)
# Correlated with age
data['years_in_location'] = np.clip(
    data['age'] - 18 + np.random.normal(-5, 8, n_sample),
    1, 60
).astype(int)

# National ID Verification Status (1=verified, 0=not verified)
# Higher education more likely to have verified ID
id_verification_prob = (
    0.70 +  # Base 70%
    0.20 * (data['education_level'] != 'No Formal') +
    0.05 * data['has_smartphone']
)
data['id_verified'] = (
    np.random.random(n_sample) < np.clip(id_verification_prob, 0, 1)
).astype(int)


In [45]:
from scipy.special import expit

default_risk_score = np.zeros(n_sample)

# Risk factors
default_risk_score += 0.25 * (data['previous_loan_repaid'] == 0)
default_risk_score += 0.20 * (data['months_as_mobile_user'] < 12)
default_risk_score += 0.15 * (data['farm_size_acres'] < 1)
default_risk_score += 0.15 * (data['ndvi_current'] < 0.4)
default_risk_score += 0.20 * (data['distance_to_market_km'] > 20)
default_risk_score += 0.15 * (data['cooperative_member'] == 0)
default_risk_score += 0.10 * (data['has_irrigation'] == 0)
default_risk_score += 0.10 * (data['land_ownership'] == 'Rented')
default_risk_score += 0.10 * (data['total_livestock_tlu'] < 1)
default_risk_score += 0.10 * (data['has_crop_insurance'] == 0)
default_risk_score += 0.10 * (data['age'] < 25)
default_risk_score += 0.10 * (data['farming_experience_years'] < 3)

# Protective factors
default_risk_score -= 0.20 * (data['avg_account_balance_usd'] > 100)
default_risk_score -= 0.15 * (data['bill_payment_consistency'] > 0.7)
default_risk_score -= 0.15 * data['has_buyer_contract']
default_risk_score -= 0.10 * (data['previous_yield_bags'] > 50)
default_risk_score -= 0.10 * data['uses_improved_seeds']
default_risk_score -= 0.10 * (data['education_level'] == 'Secondary')
default_risk_score -= 0.15 * (data['education_level'] == 'Tertiary')
default_risk_score -= 0.10 * data['has_off_farm_income']
default_risk_score -= 0.05 * data['id_verified']

default_risk_score += np.random.normal(0, 0.15, n_sample)

data['default_probability'] = expit(default_risk_score)
default_threshold = 0.30
data['loan_default'] = (data['default_probability'] > default_threshold).astype(int)

In [48]:
df = pd.DataFrame(data)
df['data_generated_date'] = datetime.now().strftime('%Y-%m-%d')

# Save
filename = 'east_africa_agri_credit_FULL_FEATURE_ENGINEERING.csv'
df.to_csv(filename, index=False)

In [64]:
df.isnull().sum().sum()

np.int64(21081)

In [69]:
missing_counts = {}

# MCAR - Completely Random (3-8%)

mcar_features = ['annual_rainfall_mm', 'soil_moisture_index', 'road_quality_score', 'elevation_masl']
for feature in mcar_features:
    if feature in df.columns:
        miss_rate = np.random.uniform(0.03, 0.08)
        mask = np.random.random(n_sample) < miss_rate
        df.loc[mask, feature] = np.nan
        missing_counts[feature] = mask.sum()
        

# MAR - Missing At Random (conditional)


# Non-smartphone users missing transaction data
if 'has_smartphone' in df.columns:
    non_smartphone = df['has_smartphone'] == 0
    miss_rate = np.random.uniform(0.12, 0.18)
    mar_mask1 = non_smartphone & (np.random.random(n_sample) < miss_rate)
    df.loc[mar_mask1, 'total_transaction_volume_6mo_usd'] = np.nan
    df.loc[mar_mask1, 'avg_account_balance_usd'] = np.nan
    missing_counts['total_transaction_volume_6mo_usd'] = mar_mask1.sum()
    

# Remote farmers missing distance
if 'distance_to_market_km' in df.columns:
    remote = df['distance_to_market_km'] > 30
    miss_rate = np.random.uniform(0.15, 0.25)
    mar_mask2 = remote & (np.random.random(n_sample) < miss_rate)
    df.loc[mar_mask2, 'distance_to_market_km'] = np.nan
    df.loc[mar_mask2, 'distance_to_input_dealer_km'] = np.nan
    missing_counts['distance_to_market_km'] = mar_mask2.sum()
    

# MNAR - Missing Not At Random (value-dependent)


# Low income hidden
if 'estimated_annual_income_usd' in df.columns:
    low_income = df['estimated_annual_income_usd'] < 300
    miss_rate = np.random.uniform(0.20, 0.35)
    mnar_mask1 = low_income & (np.random.random(n_sample) < miss_rate)
    df.loc[mnar_mask1, 'estimated_annual_income_usd'] = np.nan
    missing_counts['estimated_annual_income_usd'] = mnar_mask1.sum()
    

# Defaulters hide history
if 'previous_loan_repaid' in df.columns:
    defaulters = df['previous_loan_repaid'] == 0
    miss_rate = np.random.uniform(0.25, 0.40)
    mnar_mask2 = defaulters & (np.random.random(n_sample) < miss_rate)
    df.loc[mnar_mask2, 'previous_loan_repaid'] = np.nan
    missing_counts['previous_loan_repaid'] = mnar_mask2.sum()
    

# Small farms hidden
if 'farm_size_acres' in df.columns:
    small_farms = df['farm_size_acres'] < 0.5
    miss_rate = np.random.uniform(0.10, 0.20)
    mnar_mask3 = small_farms & (np.random.random(n_sample) < miss_rate)
    df.loc[mnar_mask3, 'farm_size_acres'] = np.nan
    missing_counts['farm_size_acres'] = mnar_mask3.sum()
    

# Satellite data - technical issues

satellite_features = ['ndvi_current', 'ndvi_historical_avg', 'ndvi_trend', 'satellite_visible_area_pct']
for feature in satellite_features:
    if feature in df.columns:
        miss_rate = np.random.uniform(0.05, 0.12)
        mask = np.random.random(n_sample) < miss_rate
        df.loc[mask, feature] = np.nan
        missing_counts[feature] = mask.sum()
        

# Transfer features - new rollout

transfer_features = [
    'avg_transfers_in_monthly', 'avg_transfer_in_amount_usd',
    'avg_transfers_out_monthly', 'avg_transfer_out_total_usd',
    'transfer_in_out_ratio', 'avg_expense_transaction_size'
]
for feature in transfer_features:
    if feature in df.columns:
        miss_rate = np.random.uniform(0.10, 0.18)
        mask = np.random.random(n_sample) < miss_rate
        df.loc[mask, feature] = np.nan
        missing_counts[feature] = mask.sum()
        

# Expense features - inactive users

if 'total_transaction_volume_6mo_usd' in df.columns:
    inactive = (df['total_transaction_volume_6mo_usd'] < 100) | df['total_transaction_volume_6mo_usd'].isna()
    expense_features = ['utility_payments_count_6mo', 'farm_input_purchases_6mo_usd']
    for feature in expense_features:
        if feature in df.columns:
            miss_rate = np.random.uniform(0.15, 0.25)
            mask = inactive & (np.random.random(n_sample) < miss_rate)
            df.loc[mask, feature] = np.nan
            missing_counts[feature] = mask.sum()
            

# Extreme missingness

extreme_features = ['has_trade_credit_history', 'extension_visits_annual']
for feature in extreme_features:
    if feature in df.columns:
        miss_rate = np.random.uniform(0.30, 0.50)
        mask = np.random.random(n_sample) < miss_rate
        df.loc[mask, feature] = np.nan
        missing_counts[feature] = mask.sum()
        

In [68]:
df.to_csv('east_africa_credit_with_NaN.csv', index=False)

In [73]:
df.to_csv('east_africa_credit_with_NaN1.csv', index=False)