In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random


In [2]:
num_records = 100000

districts = [
    "Burdwan", "Hooghly", "Nadia", "Murshidabad",
    "Howrah", "Kolkata", "Bankura"
]

form_types = ["New", "Correction"]

rejection_reasons = [
    "Missing Documents",
    "Age Mismatch",
    "Duplicate Entry",
    "Address Mismatch"
]


In [3]:
application_ids = range(1, num_records + 1)

fake = Faker()

submission_dates = [
    fake.date_between(start_date="-6m", end_date="today")
    for _ in range(num_records)
]


In [4]:
district_column = random.choices(
    districts,
    weights=[0.18, 0.15, 0.14, 0.13, 0.12, 0.18, 0.10],
    k=num_records
)

form_type_column = random.choices(
    form_types,
    weights=[0.7, 0.3],
    k=num_records
)


In [5]:
processing_days = []

for ft in form_type_column:
    if ft == "New":
        processing_days.append(
            max(1, int(np.random.normal(loc=6, scale=2)))
        )
    else:
        processing_days.append(
            max(1, int(np.random.normal(loc=12, scale=4)))
        )


In [6]:
final_status = []

for days in processing_days:
    if days <= 10:
        final_status.append("Approved")
    else:
        final_status.append(
            random.choices(
                ["Approved", "Rejected"],
                weights=[0.6, 0.4]
            )[0]
        )


In [7]:
rejection_reason_column = []

for status in final_status:
    if status == "Rejected":
        rejection_reason_column.append(
            random.choice(rejection_reasons)
        )
    else:
        rejection_reason_column.append(None)


In [8]:
df = pd.DataFrame({
    "application_id": application_ids,
    "submission_date": submission_dates,
    "district": district_column,
    "form_type": form_type_column,
    "processing_days": processing_days,
    "final_status": final_status,
    "rejection_reason": rejection_reason_column
})


In [9]:
df.head()


Unnamed: 0,application_id,submission_date,district,form_type,processing_days,final_status,rejection_reason
0,1,2026-01-06,Murshidabad,New,3,Approved,
1,2,2026-01-06,Murshidabad,Correction,12,Approved,
2,3,2026-01-06,Bankura,New,4,Approved,
3,4,2026-01-06,Burdwan,New,2,Approved,
4,5,2026-01-06,Nadia,New,5,Approved,


In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   application_id    100000 non-null  int64 
 1   submission_date   100000 non-null  object
 2   district          100000 non-null  object
 3   form_type         100000 non-null  object
 4   processing_days   100000 non-null  int64 
 5   final_status      100000 non-null  object
 6   rejection_reason  7168 non-null    object
dtypes: int64(2), object(5)
memory usage: 5.3+ MB


In [11]:
df.describe()


Unnamed: 0,application_id,processing_days
count,100000.0,100000.0
mean,50000.5,7.28118
std,28867.657797,3.877276
min,1.0,1.0
25%,25000.75,5.0
50%,50000.5,6.0
75%,75000.25,9.0
max,100000.0,28.0


In [12]:
df.shape


(100000, 7)

In [13]:
df.to_csv("electoral_operations_data.csv", index=False)
