In [1]:
import pandas as pd
import numpy as np
import random

# Load the original dataset.
df = pd.read_csv('../data/telco_churn.csv')

# TotalCharges may be read as text rather than as numbers. Coerce it
# unconditionally: pd.to_numeric leaves an already-numeric column unchanged,
# and this also covers text columns whose dtype is not literally 'object'
# (e.g. the pandas string dtype), which a `dtype == 'object'` guard would miss.
# Entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop the rows whose TotalCharges could not be parsed, then pin the column
# to float so downstream arithmetic sees a single numeric dtype.
df = df.dropna(subset=['TotalCharges']).astype({'TotalCharges': float})

# Function to slightly mutate a row
def mutate_row(row):
    """Return a slightly perturbed copy of one customer record.

    The perturbation:
      * MonthlyCharges -- redrawn from a normal distribution centred on the
        current value (standard deviation 8), rounded to 2 decimals and
        floored at 18.
      * tenure         -- shifted by a random integer in [-3, 3], floored at 1.
      * TotalCharges   -- recomputed as MonthlyCharges * tenure from the NEW
        values, rounded to 2 decimals.
      * customerID     -- replaced with "CUST" followed by 8 random hex digits.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series as passed by
        ``DataFrame.apply(..., axis=1)``) with the keys 'MonthlyCharges',
        'tenure', 'TotalCharges' and 'customerID'. Must support ``.copy()``.

    Returns
    -------
    A copy of ``row`` with the four fields above replaced; the input object
    is left untouched.

    Notes
    -----
    Randomness comes from the global ``np.random`` and ``random`` generators,
    so seed both beforehand if reproducible output is needed. Generated IDs
    are random and are not checked for uniqueness.
    """
    # Work on a copy: functions passed to DataFrame.apply should not mutate
    # the object they receive.
    row = row.copy()

    # Draw order (charge noise first, then tenure shift) is kept as before so
    # a seeded run consumes the random stream in the same sequence.
    monthly = max(18, round(np.random.normal(row['MonthlyCharges'], 8), 2))
    tenure = max(1, row['tenure'] + np.random.randint(-3, 4))

    row['MonthlyCharges'] = monthly
    row['tenure'] = tenure
    # Derive the total from the perturbed tenure, not the original one, so the
    # three fields stay mutually consistent.
    row['TotalCharges'] = round(monthly * tenure, 2)
    row['customerID'] = "CUST" + ''.join(random.choices('0123456789ABCDEF', k=8))
    return row

# Size of the final dataset, and the seed shared by every random step below.
TARGET_ROWS = 30000
SEED = 42

# Number of mutated copies of the cleaned data to generate.
num_repeats = 4  # ~7,000 x 4 = ~28,000 + original = ~35,000 (trimmed below)

# mutate_row draws from the global numpy and stdlib generators; seed both so
# the augmented file is identical from one run to the next (the final sample
# was already seeded, the mutations were not).
np.random.seed(SEED)
random.seed(SEED)

# Generate the mutated copies. Each pass works on a fresh copy of df so the
# cleaned original is never altered.
new_data = [df.copy().apply(mutate_row, axis=1) for _ in range(num_repeats)]

# Combine the original rows with every mutated copy.
df_large = pd.concat([df] + new_data, ignore_index=True)

# Fail with a clear message if there are not enough rows to trim from,
# rather than letting sample() raise a generic error.
if len(df_large) < TARGET_ROWS:
    raise ValueError(
        f"Only {len(df_large)} rows available after {num_repeats} repeats; "
        f"need at least {TARGET_ROWS}. Increase num_repeats."
    )

# Trim to exactly TARGET_ROWS rows (sampling without replacement).
df_large = df_large.sample(n=TARGET_ROWS, random_state=SEED).reset_index(drop=True)

# Save to file
df_large.to_csv('../data/augmented_telco_churn_30k.csv', index=False)
print("✅ Dataset expanded to:", df_large.shape)


✅ Dataset expanded to: (30000, 21)
