In [None]:
# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

from sklearn.preprocessing import MinMaxScaler



In [None]:

# Constants & Functions

# Column that dtcModelPrep separates from the frame and uses as the target (y).
XGB_FEATCOLS = 'clust'

# NOTE(review): not referenced by the code visible in this notebook; presumably
# the number of clusters in the input data - confirm before relying on it.
NUM_CLUSTS = 13

# Candidate values sampled by RandomizedSearchCV in dtcModelPrep.
DTC_PARAM_GRID = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
    max_leaf_nodes=[None, 10, 20, 30, 40, 50],
    min_impurity_decrease=[0.0, 0.01, 0.1, 0.2],
)

# Returned by dtcModelPrep when the hyperparameter search is skipped.
# Every value is wrapped in a single-element list (grid-style layout).
XGB_DEFAULT_PARAMS = dict(
    criterion=['gini'],
    splitter=['best'],
    max_depth=[None],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=[None],
    max_leaf_nodes=[None],
    min_impurity_decrease=[0.0],
)

# Same layout as XGB_DEFAULT_PARAMS, but with a random splitter and sqrt
# feature sub-sampling.
XGB_JUN_MDL_PARAMS = dict(
    criterion=['gini'],
    splitter=['random'],
    max_depth=[None],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=['sqrt'],
    max_leaf_nodes=[None],
    min_impurity_decrease=[0.0],
)

# Feature names that getFeatureImp pairs with the model's importances.
# Entries kept as comments are currently excluded from the list.
CORR_COL_ORDER = [
    'ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm',
    'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail',
    'PROMO_GROUPED_Deflation',
    # 'PROMO_GROUPED_Device bundle',
    'PROMO_GROUPED_No Promo',
    'GSMA_OPERATING_SYSTEM_grouped_iOS',
    'PLAN_CYCLE_NUM_grouped',
    'FAILED_PAYMENT_GROUPED',
    'MEMBER_OF_ACTIVE_FAMILY_FLAG',
    # 'ACS_HP_PROP',
    # 'ACS_NOT_HP_ASIAN_ALONE_PROP',
    # 'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
    # 'ACS_NOT_HP_WHITE_ALONE_PROP',
    # 'ACS_PROP_WORKERS_OVER_16',
    # 'ACS_AGE_MEDIAN',
    # 'ACS_INCOME_MEDIAN',
    # 'ACS_APPROX_COMMUTE_MEDIAN',
    # 'NO_SCHOOL_PROP',
    # 'ANY_DEGREE_PROP',
    # 'SINGLE_MOM_PROP',
    # 'NEVER_MARRIED_PROP',
    # 'HH_WO_INT_ACCESS_PROP',
    # 'OCCUP_HOUS_UNIT_WO_CAR_PROP',
    'UPGRADE_DOWNGRADE_DATA_FLAG',
    'UPGRADE_DOWNGRADE_DURATION_FLAG',
    'SUB_ESIM_FLAG',
    # 'HAD_ISSUES_PORTING_IN',
    'EVER_LOGGED_INTO_APP_FLAG',
    'LTE_BAND_71',
    # 'contacted_care_last7d',
    'contacted_care_last30d',
    'SERVICE_ISSUE_NOTE_FLAG',
    # 'SIM_REPLACEMENT_NOTE_FLAG',
    # 'PAYMENT_NOTE_FLAG',
    'EXPECTED_CLV_PS',
    'PORTIN_ISSUE_DESC_NoIssues',
    'PORTIN_ISSUE_DESC_NonPortin',
    # L1_CLUST_1 is deliberately absent from the indicator columns below.
] + [f'L1_CLUST_{clust_id}' for clust_id in (0, 2, 3, 4, 5, 6)]

# Input location: folder and file name of the CSV read into `df`.
JUN_OUTPUT_FOLDER = "C:/Users/davidl/OneDrive - ULTRA MOBILE/Desktop/dli_code/RetSeg/output/114/"
JUN_DF_NAME = "jun_hclust13_10p_mdl_output.csv"

# File name the fitted model is pickled to.
MDL_EXPORT_NAME = "jun_hclust13_10p_mdl.pkl"

def dtcModelPrep(df, paramGrid, search_yn):
    """Split ``df`` into train/test sets and optionally search hyperparameters.

    Parameters
    ----------
    df : DataFrame containing the target column named by ``XGB_FEATCOLS``;
        every other column is used as a feature.
    paramGrid : dict passed to ``RandomizedSearchCV`` as ``param_distributions``.
        Only used when the search is requested.
    search_yn : str; "y"/"Y" runs the randomized search, anything else skips it.

    Returns
    -------
    With search: ``(X_train, X_test, y_train, y_test, best_params, best_score)``.
    Without search: ``(X_train, X_test, y_train, y_test, XGB_DEFAULT_PARAMS)``.
    Note the two branches return tuples of different length, and that
    ``XGB_DEFAULT_PARAMS`` holds its values inside single-element lists.
    """
    features = df.drop(columns=XGB_FEATCOLS)
    target = df[XGB_FEATCOLS]

    # 80/20 hold-out split with a fixed seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # No search requested: hand back the split together with the defaults.
    if search_yn.upper() != "Y":
        return X_train, X_test, y_train, y_test, XGB_DEFAULT_PARAMS

    print("Creating DecisionTreeClassifier..")
    base_tree = DecisionTreeClassifier()

    print("Creating RandomSearchCV..")
    # 100 sampled candidates, 3-fold CV, scored on accuracy, single process.
    search = RandomizedSearchCV(
        estimator=base_tree,
        param_distributions=paramGrid,
        n_iter=100,
        scoring='accuracy',
        cv=3,
        verbose=2,
        n_jobs=1,
        random_state=42,
    )

    print("Fitting RandomSearchCV..")
    search.fit(X_train, y_train)

    best_params = search.best_params_
    best_score = search.best_score_
    print(f"Best Parameters: {best_params}")
    print(f"Best Score: {best_score}")
    return X_train, X_test, y_train, y_test, best_params, best_score

def create_eval_dtcModel(X_train, X_test, y_train, y_test, param_grid, create_pickle):
    """Train a decision tree, export its rules/plot, and print test metrics.

    Parameters
    ----------
    X_train, X_test : feature DataFrames (``X_train.columns`` supplies the
        feature names used in the exported rules and plot).
    y_train, y_test : target values matching the feature frames.
    param_grid : dict with the keys ``criterion``, ``splitter``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf``, ``max_features``,
        ``max_leaf_nodes`` and ``min_impurity_decrease``. Each value may be a
        plain value (as in ``RandomizedSearchCV.best_params_``) or a
        single-element list/tuple (as in ``XGB_DEFAULT_PARAMS``), which is
        unwrapped before being handed to the classifier.
    create_pickle : two-item sequence ``[flag, path]``; when ``flag`` is
        "y"/"Y" the fitted model is pickled to ``path``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.

    Raises
    ------
    KeyError : if ``param_grid`` lacks one of the expected keys.
    ValueError : if a value is a list/tuple that does not hold exactly one
        candidate (i.e. an unresolved search grid was passed).

    Side effects
    ------------
    Writes 'decision_tree_rules.txt' and 'decision_tree_visual.jpg' to the
    current working directory, optionally writes the pickle, and prints the
    rules and the evaluation metrics.
    """
    param_names = (
        'criterion', 'splitter', 'max_depth', 'min_samples_split',
        'min_samples_leaf', 'max_features', 'max_leaf_nodes',
        'min_impurity_decrease',
    )

    # The estimator needs plain values; grid-style dicts wrap each value in a
    # list, so unwrap single candidates and refuse ambiguous multi-value grids.
    params = {}
    for name in param_names:
        value = param_grid[name]
        if isinstance(value, (list, tuple)):
            if len(value) != 1:
                raise ValueError(
                    f"param_grid['{name}'] must be a single value or a "
                    f"one-element list, got {value!r}"
                )
            value = value[0]
        params[name] = value

    print("Training Decision Tree Model..")
    # Train the model
    dt_clf = DecisionTreeClassifier(**params)
    dt_clf.fit(X_train, y_train)

    feature_names = list(X_train.columns)

    # Print the decision tree rules
    tree_rules = export_text(dt_clf, feature_names=feature_names)
    print("Decision Tree Rules:\n")
    print(tree_rules)

    # Save the decision tree rules to a text file
    with open('decision_tree_rules.txt', 'w') as f:
        f.write(tree_rules)

    # Visualize the top levels of the tree and save as a high-resolution JPG.
    # The figure is closed in `finally` so a failed save does not leak it.
    fig = plt.figure(figsize=(20, 10))
    try:
        plot_tree(dt_clf, max_depth=5, feature_names=feature_names, class_names=True, filled=True)
        plt.savefig('decision_tree_visual.jpg', dpi=1200)  # Increase dpi for higher resolution
    finally:
        plt.close(fig)

    # Save the model to a file
    if create_pickle[0].upper() == "Y":
        with open(create_pickle[1], 'wb') as f:
            pickle.dump(dt_clf, f)

    print("Predicting Using Decision Tree Model..")
    # Make predictions
    preds = dt_clf.predict(X_test)

    # Evaluate the model (macro averaging weights every class equally)
    accuracy = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds, average='macro')
    recall = recall_score(y_test, preds, average='macro')
    f1 = f1_score(y_test, preds, average='macro')

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Return Model Object
    return dt_clf
    

def predNewData(dt_pkl, pred_df, create_pred_file):
    """Predict with a fitted model and return a copy of the data with predictions.

    IMPORTANT: Assumes data is already scaled.

    Parameters
    ----------
    dt_pkl : either a fitted model object (anything exposing ``predict``) or a
        string path to a pickled model.
    pred_df : DataFrame of features to predict on. It is NOT modified.
    create_pred_file : two-item sequence ``[flag, path]``; when ``flag`` is
        "y"/"Y" the result is written to ``path`` as CSV without the index.

    Returns
    -------
    A new DataFrame equal to ``pred_df`` plus a ``pred_clust`` column, or ``0``
    (after printing an error) when ``dt_pkl`` is neither a model nor a string.
    """
    # If dt_pkl is a direct model object
    if hasattr(dt_pkl, 'predict'):
        loaded_model = dt_pkl
    # If dt_pkl is the string name of the path to the pickled model
    elif isinstance(dt_pkl, str):
        # SECURITY: unpickling executes arbitrary code; only load files you trust.
        with open(dt_pkl, 'rb') as f:
            loaded_model = pickle.load(f)
    else:
        print("ERROR: MODEL OBJECT UNKNOWN")
        return 0

    print("Predicting On Given DF..")
    # Make predictions
    new_preds = loaded_model.predict(pred_df)

    # Attach predictions to a copy so the caller's frame (often the held-out
    # feature matrix) keeps its original columns and can be reused safely.
    scored_df = pred_df.copy()
    scored_df['pred_clust'] = new_preds

    # Save the predicted DataFrame to a file if requested
    if create_pred_file[0].upper() == "Y":
        scored_df.to_csv(create_pred_file[1], index=False)

    return scored_df

def getFeatureImp(dt_pkl, txt_file_name):
    """Print and save the model's feature importances, highest first.

    Parameters
    ----------
    dt_pkl : either a fitted model object (anything exposing ``predict``) or a
        string path to a pickled model.
    txt_file_name : path of the text file to write, one
        ``"<feature>: <importance>"`` line per feature (4 decimal places).

    Returns
    -------
    The model's ``feature_importances_`` (in the model's own feature order), or
    ``0`` (after printing an error) when ``dt_pkl`` is neither a model nor a
    string.

    Raises
    ------
    ValueError : if the number of feature names does not match the number of
        importances, since pairing them would mislabel the output.
    """
    # If dt_pkl is a direct model object
    if hasattr(dt_pkl, 'predict'):
        loaded_model = dt_pkl
    # If dt_pkl is the string name of the path to the pickled model
    elif isinstance(dt_pkl, str):
        # SECURITY: unpickling executes arbitrary code; only load files you trust.
        with open(dt_pkl, 'rb') as f:
            loaded_model = pickle.load(f)
    else:
        print("ERROR: MODEL OBJECT UNKNOWN")
        return 0

    importances = loaded_model.feature_importances_

    # Prefer the names the model recorded at fit time: they are guaranteed to
    # be in the same order as the importances. Fall back to the hand-maintained
    # CORR_COL_ORDER only when the model does not carry its own names.
    feature_names = getattr(loaded_model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = CORR_COL_ORDER
    feature_names = list(feature_names)

    # zip() would silently truncate on a mismatch and attach wrong labels.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} feature importances"
        )

    # Sort the features by importance
    sorted_feature_importances = sorted(
        zip(feature_names, importances), key=lambda x: x[1], reverse=True
    )
    report_lines = [
        f"{feature}: {importance:.4f}"
        for feature, importance in sorted_feature_importances
    ]

    # Print the feature importances
    for line in report_lines:
        print(line)

    # Writing to a .txt file
    with open(txt_file_name, 'w') as f:
        f.writelines(f"{line}\n" for line in report_lines)

    return importances

In [None]:
# Load the input CSV from the configured folder and file name.
df = pd.read_csv(f"{JUN_OUTPUT_FOLDER}{JUN_DF_NAME}")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Remove columns that must not be used as model features. Done as a single
# non-in-place drop with errors="ignore" so re-running this cell is a no-op
# instead of raising KeyError once the columns are already gone.
# NOTE(review): "Unnamed: 0" is presumably the index column written by a
# previous to_csv - confirm against the file producer.
df = df.drop(columns=["Unnamed: 0", "silhouette_score"], errors="ignore")

In [None]:
# Display the column names that remain after the drops above; every column
# except the target named by XGB_FEATCOLS is used as a feature by dtcModelPrep.
df.columns.values

In [None]:
# Split the data and run the randomized hyperparameter search.
# step1 = (X_train, X_test, y_train, y_test, best_params, best_score)
step1 = dtcModelPrep(df=df, paramGrid=DTC_PARAM_GRID, search_yn="y")

In [None]:
# Train and evaluate with the first five items of step1
# (X_train, X_test, y_train, y_test, params) and pickle the fitted model.
step2 = create_eval_dtcModel(
    *step1[:5],
    create_pickle=["Y", MDL_EXPORT_NAME],
)

In [None]:
# Reload the model from the same relative path it was pickled to above, so this
# cell does not depend on the working directory being one specific user folder.
# A copy of X_test is passed so the held-out features are never modified.
step3 = predNewData(MDL_EXPORT_NAME, step1[1].copy(), ["n", "pred_test.csv"])

In [None]:
# Read the model from the same relative path it was pickled to, rather than a
# hard-coded absolute user directory that may not match the working directory.
step4 = getFeatureImp(MDL_EXPORT_NAME, "feature_importance_written.txt")

## Feature Importance

Interpreting the feature importance output from a DecisionTreeClassifier can provide valuable insights into which features are most influential in making predictions. Here's how you can interpret the results:

Feature Importance Values: Each feature is assigned an importance score between 0 and 1. These scores represent the relative importance of each feature in the decision-making process of the tree. A higher score indicates a more important feature.

Sum of Importances: The sum of all feature importance scores is 1. This means the importance scores are normalized, making it easier to compare the relative importance of different features.

Ranking Features: By sorting the features based on their importance scores, you can identify which features have the most significant impact on the model's predictions. Features with higher importance scores contribute more to the decision-making process.

Understanding the Model: Knowing which features are most important can help you understand how the model is making decisions. For example, a high importance score means that splits on that feature account for a large share of the tree's total impurity reduction (weighted by the number of samples reaching each split), not merely that the feature appears in many decision nodes.

Feature Selection: Feature importance can also be used for feature selection. You might choose to keep only the most important features and discard less important ones to simplify the model and potentially improve its performance.

In the example above, L1_CLUST_6 accounts for 17% of the tree's total impurity reduction, PORTIN_ISSUE_DESC_NonPortin for 12%, EXPECTED_CLV_PS for 0.7%, etc.

In [None]:
# Build a separate comparison frame: predictions plus the actual labels
# (step1[3] is y_test, aligned on the index). assign() returns a new frame, so
# neither step3 nor the held-out feature matrix gains an `act_clust` column.
newdf = step3.assign(act_clust=step1[3])

In [None]:
def count_mismatched_rows(df):
    """Return how many rows of ``df`` have ``pred_clust`` != ``act_clust``.

    ``df`` must contain both a "pred_clust" and an "act_clust" column.
    """
    disagrees = df['pred_clust'].ne(df['act_clust'])
    return len(df.loc[disagrees])
# Report the number of wrong predictions, followed by the total number of
# rows they were counted over.
mismatched_count = count_mismatched_rows(newdf)
print(f"Number of rows where 'pred_clust' and 'act_clust' don't match: {mismatched_count}")
print(len(newdf))