## Install libraries
Install the third-party packages this notebook needs (Faker is used to generate the identity fields).


In [1]:
pip install faker

Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.6.0


## Import libraries


Import the required libraries and define the shared configuration (random seed, row count, and random number generator).


In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from faker import Faker

# Shared configuration: the random seed and the default number of rows.
SEED, NROWS = 2025, 1000

# Module-level NumPy generator seeded from SEED.
rng = np.random.default_rng(seed=SEED)

## Demographic Data Generation

Use Faker to generate identity fields (name, email, phone, address, job title) and derive a date of birth from each person's age.

In [3]:
def gen_identity(n, ages, rng=None):
    """Generate `n` fake identity records with Faker.

    Args:
        n: Number of records to generate.
        ages: Indexable sequence of ages in years; `ages[i]` is read for
            each `i` in `range(n)`.
        rng: Optional NumPy random generator used for the date-of-birth
            offsets. When omitted, a fresh generator seeded with the
            module-level `SEED` is created, so repeated calls return the same
            dates instead of depending on how far a shared generator has
            already advanced.

    Returns:
        A 6-tuple of lists, each of length `n`:
        (names, emails, phones, addresses, jobs, dobs). `dobs` holds ISO
        date strings.
    """
    if rng is None:
        rng = np.random.default_rng(SEED)

    fake = Faker()
    fake.seed_instance(SEED)

    # Dates of birth are counted backwards from this fixed reference date.
    reference_date = datetime(2025, 1, 1)

    names, emails, phones, addrs, jobs, dobs = [], [], [], [], [], []
    for i in range(n):
        p = fake.simple_profile()
        names.append(p["name"])
        emails.append(p["mail"])
        phones.append(fake.phone_number())
        # The profile address is multi-line; flatten it to a single line.
        addrs.append(p["address"].replace("\n", ", "))
        jobs.append(fake.job())
        # Go back roughly `ages[i]` years plus a random 0-364 day offset so
        # birthdays are spread over the year.
        days_back = int(ages[i] * 365.25) + int(rng.integers(0, 365))
        dob = reference_date - timedelta(days=days_back)
        dobs.append(dob.date().isoformat())
    return names, emails, phones, addrs, jobs, dobs

## Generating other features and values
Generate the demographic, phone-usage, social, mobile-money, SMS-transaction, social-media, and app-usage features.


In [4]:
def generate_dataset(n=NROWS, seed=SEED):
    """Generate a synthetic per-person feature table.

    Every numeric and categorical column is drawn from a local NumPy
    generator seeded with `seed`. The identity columns (name, email, phone,
    address, job title, date of birth) come from `gen_identity(n, Age)`.

    NOTE: `seed` is not forwarded to `gen_identity`; only `n` and the
    generated `Age` array are passed to it.

    Args:
        n: Number of rows to generate.
        seed: Seed for the local NumPy generator.

    Returns:
        pandas.DataFrame with `n` rows and one column per feature.
    """
    rng = np.random.default_rng(seed)

    # Demographics (context only)
    Age = rng.normal(35, 12, n).clip(18, 70).round().astype(int)
    Gender = rng.choice(["Male", "Female"], n, p=[0.61, 0.39])
    Marriage = rng.choice(["Single", "Married", "Divorced"], n, p=[0.50, 0.45, 0.05])
    Children = rng.poisson(1.5, n).astype(int)
    # Children plus one or two further household members.
    FamilySize = (Children + rng.integers(1, 3, n)).astype(int)
    Education = rng.choice(["Primary", "Secondary", "Tertiary"], n, p=[0.30, 0.50, 0.20])
    Income = np.exp(rng.normal(8, 0.5, n)).round(0).astype(int)  # unitless

    HouseOwn = rng.choice([0, 1], n, p=[0.6, 0.4]).astype(int)
    CarOwn = rng.choice([0, 1], n, p=[0.7, 0.3]).astype(int)

    # Phone aggregates (context); np.abs keeps the counts non-negative.
    CallsNum = np.abs(rng.normal(32, 25, n)).round().astype(int)
    CallsMinutes = np.abs(rng.normal(42, 40, n)).round().astype(int)
    SmsSent = np.abs(rng.normal(31, 26, n)).round().astype(int)

    # Engineered phone patterns
    SmsVar = rng.normal(0, 1, n)
    Periodicity = rng.normal(0, 1, n)
    SlopeCalls = rng.normal(0, 1, n)
    MobilityClusters = (rng.poisson(3, n) + 1).astype(int)

    # Social indicators (0..1)
    SocialStability = rng.uniform(0, 1, n)
    SocialExposure = rng.uniform(0, 1, n)
    SocialQuality = rng.uniform(0, 1, n)

    # Digital finance (counts)
    MobileMoneyTx = np.abs(rng.normal(5, 3, n)).round().astype(int)
    AirtimeTopups = np.abs(rng.normal(4, 2, n)).round().astype(int)

    # SMS transaction patterns (30d)
    SMSTx_P2P_Count_30d = np.abs(rng.normal(3, 2, n)).round().astype(int)
    SMSTx_BillPay_Count_30d = np.abs(rng.normal(4, 3, n)).round().astype(int)
    SMSTx_Merchant_Count_30d = np.abs(rng.normal(2, 2, n)).round().astype(int)
    SMSTx_Amt_Mean_30d = np.exp(rng.normal(3.5, 0.6, n)).round(2)
    SMSTx_Amt_CV_30d = np.clip(rng.normal(0.8, 0.4, n), 0.05, 3.0)
    SMSTx_OnTimeBillPay_Rate = np.clip(rng.normal(0.8, 0.15, n), 0.0, 1.0)
    SMSTx_NightShare = np.clip(rng.normal(0.15, 0.10, n), 0.0, 0.6)

    # Social media
    SM_Platforms_Installed = rng.integers(0, 6, n)
    SM_MinutesPerDay = np.clip(rng.normal(90, 60, n), 0, 600).round().astype(int)
    SM_PostsPerWeek = np.clip(rng.normal(5, 5, n), 0, 60).round().astype(int)
    SM_NightUsageShare = np.clip(rng.normal(0.2, 0.15, n), 0.0, 0.9)
    SM_Sentiment_Polarity = np.clip(rng.normal(0.1, 0.3, n), -1.0, 1.0)

    # Apps
    Apps_TotalInstalled = np.clip(rng.normal(45, 20, n), 1, 200).round().astype(int)
    Apps_SessionsPerDay = np.clip(rng.normal(25, 15, n), 0, 200).round().astype(int)
    Apps_ForegroundMinPerDay = np.clip(rng.normal(180, 120, n), 0, 1000).round().astype(int)
    Apps_BackgroundMBPerDay = np.clip(rng.normal(150, 120, n), 0, 2000).round(1).astype(float)
    Apps_Time_ProductivityMin = np.clip(rng.normal(40, 35, n), 0, 300).round().astype(int)
    Apps_Time_GamingMin = np.clip(rng.normal(30, 40, n), 0, 600).round().astype(int)
    Apps_Time_FinanceMin = np.clip(rng.normal(20, 25, n), 0, 300).round().astype(int)
    Apps_ChurnRate_30d = np.clip(rng.normal(0.15, 0.10, n), 0.0, 1.0)
    Apps_RiskyCount = np.clip(rng.normal(1.5, 2.0, n), 0, 30).round().astype(int)

    # Financial apps: sized by `n` (not the global NROWS) so any row count
    # works. The installed count is drawn once and the used count is drawn
    # per row from 0..installed, so "used" can never exceed "installed".
    NumFinancialAppsInstalled = rng.poisson(1.5, n).clip(0, 6).astype(int)
    NumFinancialAppsUsed = rng.integers(0, NumFinancialAppsInstalled + 1).astype(int)

    # Identities
    Names, Emails, Phones, Addresses, Jobs, DoBs = gen_identity(n, Age)

    # Continuous features are rounded to two decimals for presentation.
    df = pd.DataFrame({
        "FullName": Names, "Email": Emails, "Phone": Phones, "Address": Addresses, "JobTitle": Jobs,
        "DateOfBirth": DoBs,
        "Age": Age, "Gender": Gender, "Marriage": Marriage, "Children": Children,
        "FamilySize": FamilySize, "Education": Education, "Income": Income,
        "HouseOwn": HouseOwn, "CarOwn": CarOwn,
        "CallsNum": CallsNum, "CallsMinutes": CallsMinutes, "SmsSent": SmsSent,
        "SmsVar": np.round(SmsVar, 2), "Periodicity": np.round(Periodicity, 2),
        "SlopeCalls": np.round(SlopeCalls, 2),
        "MobilityClusters": MobilityClusters,
        "SocialStability": np.round(SocialStability, 2),
        "SocialExposure": np.round(SocialExposure, 2),
        "SocialQuality": np.round(SocialQuality, 2),
        "MobileMoneyTx": MobileMoneyTx, "AirtimeTopups": AirtimeTopups,
        "NumFinancialAppsInstalled": NumFinancialAppsInstalled,
        "NumFinancialAppsUsed": NumFinancialAppsUsed,
        "SMSTx_P2P_Count_30d": SMSTx_P2P_Count_30d,
        "SMSTx_BillPay_Count_30d": SMSTx_BillPay_Count_30d,
        "SMSTx_Merchant_Count_30d": SMSTx_Merchant_Count_30d,
        "SMSTx_Amt_Mean_30d": SMSTx_Amt_Mean_30d,
        "SMSTx_Amt_CV_30d": np.round(SMSTx_Amt_CV_30d, 2),
        "SMSTx_OnTimeBillPay_Rate": np.round(SMSTx_OnTimeBillPay_Rate, 2),
        "SMSTx_NightShare": np.round(SMSTx_NightShare, 2),
        "SM_Platforms_Installed": SM_Platforms_Installed,
        "SM_MinutesPerDay": SM_MinutesPerDay,
        "SM_PostsPerWeek": SM_PostsPerWeek,
        "SM_NightUsageShare": np.round(SM_NightUsageShare, 2),
        "SM_Sentiment_Polarity": np.round(SM_Sentiment_Polarity, 2),
        "Apps_TotalInstalled": Apps_TotalInstalled,
        "Apps_SessionsPerDay": Apps_SessionsPerDay,
        "Apps_ForegroundMinPerDay": Apps_ForegroundMinPerDay,
        "Apps_BackgroundMBPerDay": Apps_BackgroundMBPerDay,
        "Apps_Time_ProductivityMin": Apps_Time_ProductivityMin,
        "Apps_Time_GamingMin": Apps_Time_GamingMin,
        "Apps_Time_FinanceMin": Apps_Time_FinanceMin,
        "Apps_ChurnRate_30d": np.round(Apps_ChurnRate_30d, 2),
        "Apps_RiskyCount": Apps_RiskyCount
    })

    return df

## Feature Explanation function

Create a second sheet that explains the meaning of each feature.


In [5]:
def feature_justification():
    """Build the feature dictionary that accompanies the dataset.

    Returns:
        pandas.DataFrame with one row per feature and two columns:
        "Feature" (the column name) and "Meaning / Rationale" (a short
        explanation).
    """
    # Each entry pairs a list of feature names with the rationale they share.
    # Row order in the result follows this table from top to bottom.
    catalog = [
        # Identity
        (["FullName", "Email", "Phone", "Address", "JobTitle", "DateOfBirth"],
         "Identity information for realism."),
        # Demographics
        (["Age", "Gender", "Marriage", "Children", "FamilySize", "Education",
          "Income", "HouseOwn", "CarOwn"],
         "Demographic context information."),
        # Phone usage
        (["CallsNum", "CallsMinutes", "SmsSent"], "Phone usage volume baseline."),
        # Phone patterns
        (["Periodicity"], "Regular weekly/monthly cycles in phone usage."),
        (["SlopeCalls"], "Communication trend over time."),
        (["SmsVar"], "Variability in messaging patterns."),
        (["MobilityClusters"], "Number of distinct location clusters."),
        # Social indicators
        (["SocialStability"], "Social stability indicator (0-1)."),
        (["SocialExposure"], "Social network exposure (0-1)."),
        (["SocialQuality"], "Quality of social connections (0-1)."),
        # Digital finance
        (["MobileMoneyTx"], "Number of mobile money transactions."),
        (["AirtimeTopups"], "Number of airtime top-ups."),
        (["NumFinancialAppsInstalled"], "Number of financial apps installed."),
        (["NumFinancialAppsUsed"], "Number of financial apps actively used."),
        # SMS transactions
        (["SMSTx_P2P_Count_30d", "SMSTx_BillPay_Count_30d", "SMSTx_Merchant_Count_30d"],
         "SMS-based transaction counts over 30 days."),
        (["SMSTx_Amt_Mean_30d"], "Average SMS transaction amount (30 days)."),
        (["SMSTx_Amt_CV_30d"], "Coefficient of variation in SMS transaction amounts."),
        (["SMSTx_OnTimeBillPay_Rate"], "Rate of on-time bill payments via SMS."),
        (["SMSTx_NightShare"], "Share of SMS transactions during night hours."),
        # Social media
        (["SM_Platforms_Installed"], "Number of social media platforms installed."),
        (["SM_MinutesPerDay"], "Daily minutes spent on social media."),
        (["SM_PostsPerWeek"], "Number of social media posts per week."),
        (["SM_NightUsageShare"], "Share of social media usage during night hours."),
        (["SM_Sentiment_Polarity"], "Sentiment polarity of social media content (-1 to 1)."),
        # Apps
        (["Apps_TotalInstalled"], "Total number of apps installed."),
        (["Apps_SessionsPerDay"], "Number of app sessions per day."),
        (["Apps_ForegroundMinPerDay"], "Daily minutes of foreground app usage."),
        (["Apps_BackgroundMBPerDay"], "Daily background data usage in MB."),
        (["Apps_Time_ProductivityMin"], "Daily minutes spent in productivity apps."),
        (["Apps_Time_GamingMin"], "Daily minutes spent in gaming apps."),
        (["Apps_Time_FinanceMin"], "Daily minutes spent in finance apps."),
        (["Apps_ChurnRate_30d"], "App churn rate over 30 days."),
        (["Apps_RiskyCount"], "Number of risky/side-loaded apps."),
    ]

    records = [
        {"Feature": feature, "Meaning / Rationale": rationale}
        for features, rationale in catalog
        for feature in features
    ]
    return pd.DataFrame(records)

## Generate and save data



In [6]:
# Build the synthetic dataset and its companion feature dictionary.
df = generate_dataset(n=NROWS, seed=SEED)
fj = feature_justification()

# Flat CSV export containing only the data table.
df.to_csv("sample_data_features_only.csv", index=False)

# Excel workbook: the data on one sheet, the feature explanations on another.
with pd.ExcelWriter("sample_data_features_only.xlsx", engine="openpyxl") as writer:
    for frame, sheet in ((df, "SyntheticData"), (fj, "FeatureJustification")):
        frame.to_excel(writer, sheet_name=sheet, index=False)

print("Saved sample_data_features_only.csv and sample_data_features_only.xlsx")

Saved sample_data_features_only.csv and sample_data_features_only.xlsx


In [7]:
# google.colab is only importable inside Google Colab. Import it defensively
# so that running every cell in another Jupyter environment does not abort
# on this cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Trigger a browser download for each exported file.
    for export_path in ("sample_data_features_only.csv", "sample_data_features_only.xlsx"):
        files.download(export_path)
else:
    print("Not running in Google Colab; skipping browser download.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>