In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Load the data

merged_df = pd.read_csv("cleanedData.csv")

# Filter data for April 2024 and other dates

test_data_with_date = merged_df[merged_df["trxdate"] == '2024-04']

train_data_with_date = merged_df[merged_df["trxdate"] != '2024-04']

# Drop trxdate column

test_data_april = test_data_with_date.drop(columns=["trxdate"])

train_data_april = train_data_with_date.drop(columns=["trxdate"])

# Separate features and target variable

X = train_data_april.drop(columns=["Churn"])

y = train_data_april["Churn"]

# Function to create different scenarios and evaluate model

def evaluate_model_for_ratios_and_depths(churn_ratio, max_depth):

    churn_data = train_data_april[train_data_april["Churn"] == 1]

    non_churn_data = train_data_april[train_data_april["Churn"] == 0]

    # Number of non-churn samples to match the desired ratio

    non_churn_sample_size = int(len(churn_data) * (1 - churn_ratio) / churn_ratio)

    if non_churn_sample_size > len(non_churn_data):

        non_churn_sample_size = len(non_churn_data)

    non_churn_sample = non_churn_data.sample(non_churn_sample_size, random_state=42)

    # Combine churn and non-churn data

    balanced_data = pd.concat([churn_data, non_churn_sample])

    X_balanced = balanced_data.drop(columns=["Churn"])

    y_balanced = balanced_data["Churn"]
 
    # Print the number of churn and non-churn samples

    num_churn_samples = len(churn_data)

    num_non_churn_samples = len(non_churn_sample)

    print(f"Churn Samples: {num_churn_samples}, Non-Churn Samples: {num_non_churn_samples}")

    # Split data into training and validation sets

    X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Train Decision Tree model

    model_tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)

    model_tree.fit(X_train, y_train)

    # Evaluate model on validation set

    y_val_pred = model_tree.predict(X_val)

    val_accuracy = accuracy_score(y_val, y_val_pred)

    val_precision = precision_score(y_val, y_val_pred)

    val_recall = recall_score(y_val, y_val_pred)

    val_f1 = f1_score(y_val, y_val_pred)

    # Evaluate model on test set

    y_test_pred = model_tree.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_test_pred)

    test_precision = precision_score(y_test, y_test_pred)

    test_recall = recall_score(y_test, y_test_pred)

    test_f1 = f1_score(y_test, y_test_pred)

    print(f"Churn Ratio: {churn_ratio * 100}% | Max Depth: {max_depth}")

    print("Validation Metrics:")

    print("Accuracy:", val_accuracy)

    print("Precision:", val_precision)

    print("Recall:", val_recall)

    print("F1 Score:", val_f1)

    print()

    print("Test Metrics:")

    print("Accuracy:", test_accuracy)

    print("Precision:", test_precision)

    print("Recall:", test_recall)

    print("F1 Score:", test_f1)

    print()

    # Predict and evaluate on April 2024 data

    test_data_april_without_churn = test_data_april.drop(columns=["Churn"])

    april_prediction = model_tree.predict(test_data_april_without_churn)

    april_prediction_proba = model_tree.predict_proba(test_data_april_without_churn)[:, 1]

    comparison_df = pd.DataFrame({

        "Customer_id": test_data_april["Customer_id"],

        "True_Churn": test_data_april["Churn"],

        "Predicted_Churn": april_prediction,

        "Predicted_Churn_Probability": april_prediction_proba

    })

    print("Classification Report for April 2024:")

    print(classification_report(test_data_april["Churn"], april_prediction))

    print("Confusion Matrix for April 2024:")

    print(confusion_matrix(test_data_april["Churn"], april_prediction))

    print()


# Evaluate for different churn ratios and max_depth values

ratios = [ 0.5, 0.4, 0.3, 0.2]
depths = [10, 15, 20, 25, 30, 35, 40]

for ratio in ratios:

    for depth in depths:

        evaluate_model_for_ratios_and_depths(ratio, depth)


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
 
# Veriyi yükle
merged_df = pd.read_csv("cleanedData.csv")
 
# Nisan 2024 ve diğer tarihler için veriyi filtrele
test_data_with_date = merged_df[merged_df["trxdate"] == '2024-04']
train_data_with_date = merged_df[merged_df["trxdate"] != '2024-04']
 
# trxdate sütununu kaldır
test_data_april = test_data_with_date.drop(columns=["trxdate"])
train_data_april = train_data_with_date.drop(columns=["trxdate"])
 
# Özellikleri ve hedef değişkeni ayır
X = train_data_april.drop(columns=["Churn"])
y = train_data_april["Churn"]
 
# Decision Tree modelini eğit
model_tree = DecisionTreeClassifier(max_depth=10, random_state=42)
model_tree.fit(X, y)
 
# Test seti üzerinde tahminler yap
test_data_april_without_churn = test_data_april.drop(columns=["Churn"])
april_prediction_proba = model_tree.predict_proba(test_data_april_without_churn)[:, 1]
 
# Tahminleri DataFrame'e ekle
result_df = pd.DataFrame({
    "Customer_id": test_data_april["Customer_id"],
    "ChurnProbability": april_prediction_proba
})
 
# Müşterinin hangi segmente ait olduğunu belirle
def determine_value_segment(row):
    for column in ["ValueSegment_A1", "ValueSegment_A2", "ValueSegment_B", "ValueSegment_C", "ValueSegment_D"]:
        if row[column] == 1:
            return column.split("_")[1]
    return None
 
# Müşterinin hangi segmente ait olduğunu DataFrame'e ekle
test_data_april["ValueSegment"] = test_data_april.apply(determine_value_segment, axis=1)
result_df["ValueSegment"] = test_data_april["ValueSegment"]
 
# Sonuçları yazdır ve kaydet
print(result_df)
result_df.to_csv("churn_predictions_with_ValueSegment.csv", index=False)

        Customer_id  ChurnProbability ValueSegment
14         97425734          0.000000           A1
22         97425700          0.000000            B
25         97425673          0.000000            D
43         97425610          0.000000            C
48         97425607          0.000000            B
...             ...               ...          ...
465143     18081987          0.000000            C
465146     17906588          0.000000            C
465150     17052005          0.145963           A2
465161     17011971          0.000000           A1
465165     16082019          0.420455            D

[51169 rows x 3 columns]
