In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# ==================================
# 1. Load dataset
# ==================================
df = pd.read_csv("Bank_Marketing_Split_dataset_with_allocations.csv")

# ==================================
# 2. ETF columns -> convert to % (0–100)
# ==================================
etf_cols = ['US_Equity','International_Equity','Bonds','REIT','Cash']

# Every row is rescaled by its own total. A row with a missing allocation or
# a total that is not positive would turn into NaN/inf percentages, which are
# unusable as regression targets, so such rows are removed before scaling.
row_totals = df[etf_cols].sum(axis=1)
usable_rows = df[etf_cols].notna().all(axis=1) & (row_totals > 0)
n_dropped = int((~usable_rows).sum())
if n_dropped:
    print(f"Dropping {n_dropped} row(s) with missing or non-positive ETF allocation totals")
    # .copy() so the column assignment below writes to an independent frame
    df = df.loc[usable_rows].copy()
    row_totals = row_totals.loc[usable_rows]

# Share of each ETF within the row, expressed on a 0-100 scale
df[etf_cols] = df[etf_cols].div(row_totals, axis=0) * 100

# ==================================
# 3. Features to keep
# ==================================
columns_keep = [
    'Age', 'MaritalStatus','EducationLevel','AnnualIncome',
    'NetWorth', 'AccountBalance', 'InvestmentPortfolioValue',
    'SalaryCategory','RiskRating','CustomerSegment',
    'HasMortgage','HasPersonalLoan'
]

# X: model inputs, y: the five percentage targets (one column per ETF)
X = df[columns_keep]
y = df[etf_cols]

# ==================================
# 4. Categorical vs numeric
# ==================================
# Pick out the numeric/boolean columns and treat every remaining feature as
# categorical. Working from the numeric side means text columns get one-hot
# encoded whether pandas stores them as object, a dedicated string dtype, or
# category, instead of being passed to the model un-encoded.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

# One-hot encode the categorical columns and forward the numeric ones as-is.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# rather than raising at predict time.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

# ==================================
# 5. Random Forest Regressor
# ==================================
# 300 trees, fixed seed for repeatable fits, n_jobs=-1 to use all cores
rf_model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# Preprocessing and model chained so both are fitted/applied together
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', rf_model)
])

# ==================================
# 6. Train/test split
# ==================================
# 20% of the rows are held out; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ==================================
# 7. Fit model
# ==================================
pipeline.fit(X_train, y_train)

# ==================================
# 8. Evaluate
# ==================================
y_pred = pipeline.predict(X_test)

print("===== Model Performance (% scale) =====")
# y_pred has one column per ETF in etf_cols order, so its transpose pairs
# each target name with that target's predictions.
for col, col_pred in zip(etf_cols, y_pred.T):
    col_true = y_test[col]
    r2 = r2_score(col_true, col_pred)
    mae = mean_absolute_error(col_true, col_pred)
    print(f"{col:25s} | R2: {r2:.4f} | MAE: {mae:.2f}%")

# ==================================
# 9. Predict for ONE customer
# ==================================
# First row of the feature frame, kept as a one-row DataFrame for the pipeline
sample = X.iloc[[0]]
pred = pipeline.predict(sample)[0]

# Force sum to 100%
pred_total = pred.sum()
if not pred_total > 0:
    # Also catches a NaN total, which would otherwise surface as an obscure
    # rounding error further down.
    raise ValueError(
        f"Cannot rescale the predicted allocation to 100%: total is {pred_total}"
    )
pred = (pred / pred_total) * 100

# Round to whole percents without breaking the 100% total (largest-remainder
# rounding): floor every share, then hand the leftover points to the shares
# that lost the largest fractional part. Rounding each share on its own can
# add up to 99% or 101%.
whole_pcts = [int(val // 1) for val in pred]
leftover = max(0, 100 - sum(whole_pcts))
by_lost_fraction = sorted(
    range(len(whole_pcts)),
    key=lambda idx: pred[idx] - whole_pcts[idx],
    reverse=True,
)
for idx in by_lost_fraction[:leftover]:
    whole_pcts[idx] += 1

pred_rounded = {col: f"{pct}%" for col, pct in zip(etf_cols, whole_pcts)}

print("\nPredicted ETF allocation (percent format):")
print(pred_rounded)


===== Model Performance (% scale) =====
US_Equity                 | R2: 0.4704 | MAE: 5.76%
International_Equity      | R2: 0.4704 | MAE: 4.32%
Bonds                     | R2: 0.4704 | MAE: 4.32%
REIT                      | R2: 0.4704 | MAE: 2.88%
Cash                      | R2: 0.4704 | MAE: 8.65%

Predicted ETF allocation (percent format):
{'US_Equity': '17%', 'International_Equity': '12%', 'Bonds': '18%', 'REIT': '8%', 'Cash': '45%'}


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# ==================================
# 1. Load dataset
# ==================================
# NOTE(review): this cell repeats the previous cell's code line for line;
# consider keeping a single copy.
df = pd.read_csv("Bank_Marketing_Split_dataset_with_allocations.csv")

# ==================================
# 2. ETF columns -> convert to % (0–100)
# ==================================
etf_cols = ['US_Equity','International_Equity','Bonds','REIT','Cash']

# Every row is rescaled by its own total. A row with a missing allocation or
# a total that is not positive would turn into NaN/inf percentages, which are
# unusable as regression targets, so such rows are removed before scaling.
row_totals = df[etf_cols].sum(axis=1)
usable_rows = df[etf_cols].notna().all(axis=1) & (row_totals > 0)
n_dropped = int((~usable_rows).sum())
if n_dropped:
    print(f"Dropping {n_dropped} row(s) with missing or non-positive ETF allocation totals")
    # .copy() so the column assignment below writes to an independent frame
    df = df.loc[usable_rows].copy()
    row_totals = row_totals.loc[usable_rows]

# Share of each ETF within the row, expressed on a 0-100 scale
df[etf_cols] = df[etf_cols].div(row_totals, axis=0) * 100

# ==================================
# 3. Features to keep
# ==================================
columns_keep = [
    'Age', 'MaritalStatus','EducationLevel','AnnualIncome',
    'NetWorth', 'AccountBalance', 'InvestmentPortfolioValue',
    'SalaryCategory','RiskRating','CustomerSegment',
    'HasMortgage','HasPersonalLoan'
]

# X: model inputs, y: the five percentage targets (one column per ETF)
X = df[columns_keep]
y = df[etf_cols]

# ==================================
# 4. Categorical vs numeric
# ==================================
# Pick out the numeric/boolean columns and treat every remaining feature as
# categorical. Working from the numeric side means text columns get one-hot
# encoded whether pandas stores them as object, a dedicated string dtype, or
# category, instead of being passed to the model un-encoded.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

# One-hot encode the categorical columns and forward the numeric ones as-is.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# rather than raising at predict time.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

# ==================================
# 5. Random Forest Regressor
# ==================================
# 300 trees, fixed seed for repeatable fits, n_jobs=-1 to use all cores
rf_model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# Preprocessing and model chained so both are fitted/applied together
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', rf_model)
])

# ==================================
# 6. Train/test split
# ==================================
# 20% of the rows are held out; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ==================================
# 7. Fit model
# ==================================
pipeline.fit(X_train, y_train)

# ==================================
# 8. Evaluate
# ==================================
y_pred = pipeline.predict(X_test)

print("===== Model Performance (% scale) =====")
# y_pred has one column per ETF in etf_cols order, so its transpose pairs
# each target name with that target's predictions.
for col, col_pred in zip(etf_cols, y_pred.T):
    col_true = y_test[col]
    r2 = r2_score(col_true, col_pred)
    mae = mean_absolute_error(col_true, col_pred)
    print(f"{col:25s} | R2: {r2:.4f} | MAE: {mae:.2f}%")

# ==================================
# 9. Predict for ONE customer
# ==================================
# First row of the feature frame, kept as a one-row DataFrame for the pipeline
sample = X.iloc[[0]]
pred = pipeline.predict(sample)[0]

# Force sum to 100%
pred_total = pred.sum()
if not pred_total > 0:
    # Also catches a NaN total, which would otherwise surface as an obscure
    # rounding error further down.
    raise ValueError(
        f"Cannot rescale the predicted allocation to 100%: total is {pred_total}"
    )
pred = (pred / pred_total) * 100

# Round to whole percents without breaking the 100% total (largest-remainder
# rounding): floor every share, then hand the leftover points to the shares
# that lost the largest fractional part. Rounding each share on its own can
# add up to 99% or 101%.
whole_pcts = [int(val // 1) for val in pred]
leftover = max(0, 100 - sum(whole_pcts))
by_lost_fraction = sorted(
    range(len(whole_pcts)),
    key=lambda idx: pred[idx] - whole_pcts[idx],
    reverse=True,
)
for idx in by_lost_fraction[:leftover]:
    whole_pcts[idx] += 1

pred_rounded = {col: f"{pct}%" for col, pct in zip(etf_cols, whole_pcts)}

print("\nPredicted ETF allocation (percent format):")
print(pred_rounded)
