In [1]:
from google.colab import files
import io

# Open the Colab upload widget, then report the size of every file received.
# `uploaded` maps each uploaded file name to its contents.
uploaded = files.upload()
for filename, payload in uploaded.items():
    report = 'Uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(payload))
    print(report)

Saving sample_data_features_only (1).csv to sample_data_features_only (1).csv
Uploaded file "sample_data_features_only (1).csv" with length 297410 bytes


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# --- Input file and output location ---
input_file = "sample_data_features_only (1).csv"
output_folder = Path("./rule_based_outputs")
output_folder.mkdir(exist_ok=True)  # no error when the folder already exists

# --- Temperature multipliers, and the tag each one gets in column/file names ---
temperatures = [0.9, 0.8, 0.7]
temp_names = dict(zip(temperatures, ["T90", "T80", "T70"]))

# --- The 20 preferred features to score on, in priority order ---
important_features = [
    "CallsMinutes",
    "Apps_Time_GamingMin",
    "SMSTx_Amt_Mean_30d",
    "SmsSent",
    "CallsNum",
    "Apps_Time_FinanceMin",
    "Apps_TotalInstalled",
    "SM_PostsPerWeek",
    "AirtimeTopups",
    "SMSTx_P2P_Count_30d",
    "SM_Platforms_Installed",
    "SMSTx_Merchant_Count_30d",
    "NumFinancialAppsInstalled",
    "SlopeCalls",
    "SmsVar",
    "NumFinancialAppsUsed",
    "SMSTx_Amt_CV_30d",
    "SocialExposure",
    "Apps_ChurnRate_30d",
    "SMSTx_NightShare",
]

# --- Lowercase fragments that mark a column as demographic or label-like ---
# A column is excluded from the backup selection when its lowercased name
# contains any of these fragments.
skip_words = [
    "age", "gender", "name", "email", "phone", "address", "birth",
    "job", "income", "house", "car", "target", "label", "creditworthy",
]

def should_skip_column(column_name):
    """Return True when the column name contains any entry of ``skip_words``.

    The column name is lowercased and each skip word is tested as a plain
    substring, so a word also matches inside a longer token (for example
    "age" is found in "Usage").
    """
    lowered = column_name.lower()
    for word in skip_words:
        if word in lowered:
            return True
    return False

def pick_backup_features(data, how_many=20):
    """Choose fallback features: the numeric columns with the largest variance.

    Args:
        data: DataFrame to pick columns from.
        how_many: Maximum number of column names to return.

    Returns:
        A list of column names, ordered from highest to lowest variance.
        Columns rejected by ``should_skip_column`` are never included.

    Raises:
        ValueError: If no numeric column survives the skip filter.
    """
    # Numeric columns only, minus anything the skip filter rejects
    candidates = []
    for col in data.select_dtypes(include=[np.number]).columns:
        if should_skip_column(col):
            continue
        candidates.append(col)

    if len(candidates) == 0:
        raise ValueError("Can't find any good features to use")

    # Rank by variance, largest first, and keep the leading entries
    ranked = data[candidates].var().sort_values(ascending=False)
    return ranked.index[:how_many].tolist()

# Read the uploaded CSV into a DataFrame
data = pd.read_csv(input_file)

# Preferred features that actually exist in this file, kept in their listed order
features_we_have = [name for name in important_features if name in data.columns]

# With at least 15 preferred features present, use them (capped at 20);
# otherwise fall back to the backup selection from pick_backup_features.
enough_preferred = len(features_we_have) >= 15
final_features = (
    features_we_have[:20] if enough_preferred else pick_backup_features(data, 20)
)

# Print the numbered list of features that will be scored
for i, feature in enumerate(final_features, start=1):
    print(f"{i:2d}. {feature}")

# Work on a separate copy of the selected feature columns
feature_data = data[final_features].copy()

# Impute missing entries column by column with that column's median
feature_data = feature_data.fillna(feature_data.median())

# Per-feature threshold: the median of each (imputed) column
middle_values = feature_data.median()

# Flag every cell: 1 when the value is at or above its column's median, else 0.
# (The variable is called "above_average", but the threshold is the median.)
above_average = feature_data.ge(middle_values).astype(int)

# Per-row tallies: features flagged 1, features flagged 0, and the share flagged 1
n_features = len(final_features)
positive_count = above_average.sum(axis=1)
negative_count = n_features - positive_count
positive_percentage = positive_count / n_features

# Results table: a copy of the full input with the tallies appended
driver_columns = {
    "drivers_pos_count": positive_count,
    "drivers_neg_count": negative_count,
    "drivers_pos_ratio": positive_percentage.round(4),
}
results = data.assign(**driver_columns)

# For every row, build a comma-separated list of the features flagged 1
# (positive) and another of the features flagged 0 (negative), each in
# final_features order.
#
# The flags are pulled into a single NumPy matrix up front so that each row is
# just a boolean mask over the feature names. This avoids materialising a
# Series per row with .iloc and doing one label lookup per feature per row.
feature_names = np.array(final_features, dtype=object)
flag_matrix = above_average[final_features].to_numpy()

positive_feature_lists = [
    ",".join(feature_names[row_mask]) for row_mask in (flag_matrix == 1)
]
negative_feature_lists = [
    ",".join(feature_names[row_mask]) for row_mask in (flag_matrix == 0)
]

results["drivers_positive_features"] = positive_feature_lists
results["drivers_negative_features"] = negative_feature_lists

# Add one scaled ratio column and one binary target column per temperature
for temp in temperatures:
    temp_name = temp_names[temp]

    # Scale the share of positive features by the temperature, to 4 decimals
    adjusted_ratio = positive_percentage.mul(temp).round(4)
    results[f"adj_pos_ratio_{temp_name}"] = adjusted_ratio

    # Target is 1 when the scaled ratio reaches 0.5, otherwise 0
    reaches_threshold = adjusted_ratio.ge(0.5)
    results[f"target_{temp_name}"] = reaches_threshold.astype(int)

# Write one CSV per temperature; each file keeps only its own temperature columns
for temp in temperatures:
    temp_name = temp_names[temp]
    own_columns = {f"target_{temp_name}", f"adj_pos_ratio_{temp_name}"}

    # Temperature-specific columns that belong to the *other* temperatures
    columns_to_remove = [
        col
        for col in results.columns
        if col.startswith(("target_", "adj_pos_ratio_")) and col not in own_columns
    ]

    # Drop those columns and expose this temperature's label as plain "target"
    clean_data = (
        results
        .drop(columns=columns_to_remove)
        .assign(target=results[f"target_{temp_name}"])
    )

    # Write the dataset without the index column
    output_file = output_folder / f"dataset_{temp_name}.csv"
    clean_data.to_csv(output_file, index=False)
    print(f"Saved {output_file}")

# Record which features were scored and the median threshold used for each
info_file = output_folder / "rule_metadata.txt"

# Explicit UTF-8 so the file does not depend on the platform's default encoding
with open(info_file, "w", encoding="utf-8") as f:
    # Report the actual number of features: fewer than 20 are selected when
    # only 15-19 preferred features exist in the data, or when the backup
    # selection finds fewer than 20 usable numeric columns.
    f.write(f"Features used for scoring ({len(final_features)}):\n")
    for feature in final_features:
        f.write(f"- {feature}\n")

    # One "<feature>: <median>" line per feature, to 6 decimal places
    f.write("\nMiddle values used to decide positive/negative:\n")
    for feature in final_features:
        f.write(f"{feature}: {middle_values[feature]:.6f}\n")

print(f"Saved information to {info_file}")



 1. CallsMinutes
 2. Apps_Time_GamingMin
 3. SMSTx_Amt_Mean_30d
 4. SmsSent
 5. CallsNum
 6. Apps_Time_FinanceMin
 7. Apps_TotalInstalled
 8. SM_PostsPerWeek
 9. AirtimeTopups
10. SMSTx_P2P_Count_30d
11. SM_Platforms_Installed
12. SMSTx_Merchant_Count_30d
13. NumFinancialAppsInstalled
14. SlopeCalls
15. SmsVar
16. NumFinancialAppsUsed
17. SMSTx_Amt_CV_30d
18. SocialExposure
19. Apps_ChurnRate_30d
20. SMSTx_NightShare
Saved rule_based_outputs/dataset_T90.csv
Saved rule_based_outputs/dataset_T80.csv
Saved rule_based_outputs/dataset_T70.csv
Saved information to rule_based_outputs/rule_metadata.txt


In [4]:
from google.colab import files

# Send each per-temperature CSV to the browser, in the order of `temperatures`
for tag in (temp_names[t] for t in temperatures):
    files.download(output_folder / f"dataset_{tag}.csv")

# Then send the metadata text file
meta_path = output_folder / "rule_metadata.txt"
files.download(meta_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>