In [None]:
import pandas as pd

# Silhouette Score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from collections import Counter

import matplotlib.pyplot as plt

In [None]:
# CONSTANTS
# Folder prefixes: createGroupByCSV joins them to file names by plain string
# concatenation, so each one has to end with a "/".
PREFIX_FILE_PATH = "C:/Users/davidl/OneDrive - ULTRA MOBILE/Desktop/dli_code/RetSeg/output/114/"
PREFIX_EXPORT_PATH = "C:/Users/davidl/OneDrive - ULTRA MOBILE/Desktop/dli_code/RetSeg/summary/114/"

# Input files (names relative to PREFIX_FILE_PATH).
# Alternative multi-file input, currently disabled:
# DF_CLUST_FILE_LIST = ["jun_hclust8_10p_pp_output.csv", "jun_hclust9_10p_pp_output.csv", "jun_hclust10_10p_pp_output.csv", "jun_hclust11_10p_pp_output.csv",
#                       "jun_hclust12_10p_pp_output.csv", "jun_hclust13_10p_pp_output.csv", "jun_hclust14_10p_pp_output.csv"]
DF_CLUST_FILE_LIST = ["jun_hclust13_detail_temp.csv"]

# One-hot column names that appear in both GRPBY_MEAN_COLS and DUMMY_COLS.
_COVERAGE_CLASS_DUMMIES = [
    'COVERAGE_CLASS_4G_IBC',
    'COVERAGE_CLASS_4G_IBR',
    'COVERAGE_CLASS_4G_inVehicle',
    'COVERAGE_CLASS_4G_Outdoor',
]
_PORTIN_ISSUE_DUMMIES = [
    'PORTIN_ISSUE_DESC_NoIssues',
    'PORTIN_ISSUE_DESC_NonPortin',
]
# L1_CLUST_0 ... L1_CLUST_6
_L1_CLUST_DUMMIES = ['L1_CLUST_' + str(level) for level in range(7)]

# Columns whose per-cluster mean is reported.
GRPBY_MEAN_COLS = (
    [
        'CHURNED_SUB_FLAG',
        'SUB_AUTO_RENEWAL_FLAG',
        'SUB_CREDIT_CARD_FLAG',
        'SUB_ESIM_FLAG',
        'PORTIN_FLAG',
        'HAD_ISSUES_PORTING_IN',
        'LTE_BAND_71',
        'PLAN_CYCLE_NUM',
        'PROMO_FLAG',
        'MEMBER_OF_ACTIVE_FAMILY_FLAG',
        'EVER_LOGGED_INTO_APP_FLAG',
        'UPGRADE_DOWNGRADE_DURATION_FLAG',
        'UPGRADE_DOWNGRADE_DATA_FLAG',
        'TENURE_MONTHS',
        'EXPECTED_CLV_PS',
        'FAILED_PAYMENT_GROUPED',
    ]
    + _COVERAGE_CLASS_DUMMIES
    + [
        'VOLTE_USAGE_LAST_7D',
        'ZERO_USAGE_LAST_30D_FLAG',
        'CNT_NOTES_LAST_30D',
        'SERVICE_ISSUE_NOTE_FLAG',
        'SIM_REPLACEMENT_NOTE_FLAG',
        'PAYMENT_NOTE_FLAG',
        'TOTAL_POP_MEDIAN_AGE',
        'ACS_HP_PROP',
        'ACS_NOT_HP_ASIAN_ALONE_PROP',
        'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
        'ACS_NOT_HP_WHITE_ALONE_PROP',
        'ACS_NOT_HP_OTHER_POP',
        'ACS_PROP_WORKERS_OVER_16',
        'ACS_APPROX_AGE_MEDIAN',
        'ACS_APPROX_INCOME_MEDIAN',
        'ACS_APPROX_COMMUTE_MEDIAN',
        'NO_SCHOOL_PROP',
        'ANY_DEGREE_PROP',
        'SINGLE_MOM_PROP',
        'NEVER_MARRIED_PROP',
        'HH_WO_INT_ACCESS_PROP',
        'OCCUP_HOUS_UNIT_WO_CAR_PROP',
        'ACS_AGE_MEDIAN',
        'ACS_INCOME_MEDIAN',
        'ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm',
        'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail',
        'GSMA_OPERATING_SYSTEM_grouped_Android',
        'GSMA_OPERATING_SYSTEM_grouped_iOS',
        'PLAN_CYCLE_NUM_grouped',
    ]
    + _PORTIN_ISSUE_DUMMIES
    + _L1_CLUST_DUMMIES
)

# Columns whose per-cluster median is reported.
GRPBY_MEDIAN_COLS = ['YEAR_RELEASED_grouped']

# Categorical columns handed to pd.get_dummies for one-hot encoding.
ENCODE_COLS = [
    'ACTIVATING_SALE_GROUP_NAME_grouped',
    'PROMO_GROUPED',
    'GSMA_OPERATING_SYSTEM_grouped',
    'PORTIN_ISSUE_DESC',
    'L1_CLUST',
    'COVERAGE_CLASS_4G',
]

# One-hot result columns that get cast to int8 after encoding.
DUMMY_COLS = (
    [
        'ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm',
        'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail',
        'GSMA_OPERATING_SYSTEM_grouped_iOS',
        'PROMO_GROUPED_Deflation',
        'PROMO_GROUPED_No Promo',
    ]
    + _COVERAGE_CLASS_DUMMIES
    + _PORTIN_ISSUE_DUMMIES
    + _L1_CLUST_DUMMIES
)

In [None]:
# Define function to create a groupby object from df w/clust and export to csv
def createGroupByCSV(FILE_LIST, clust_label_name, GRPBY_MEAN_COLS_LIST, GRPBY_MEDIAN_COLS_LIST,
                     file_prefix=None, export_prefix=None, encode_cols=None, dummy_cols=None):
    """Build a per-cluster summary for each clustered CSV and export it to CSV.

    For every file name in ``FILE_LIST`` the file is read, the categorical
    columns are one-hot encoded with ``pd.get_dummies``, and a table with one
    row per cluster is assembled containing:

    * ``clust_size`` - count of non-null ``SUB_BILLING_ID`` values in the cluster,
    * ``clust_prop`` - ``clust_size`` divided by the sum of all ``clust_size``,
    * the mean of every column in ``GRPBY_MEAN_COLS_LIST``,
    * the median of every column in ``GRPBY_MEDIAN_COLS_LIST``.

    The table is written to ``export_prefix + file_name + "_summary.csv"``
    (the DataFrame index is written too) and also collected in the return value.

    Parameters
    ----------
    FILE_LIST : iterable of str
        File names; each is appended to ``file_prefix`` to form the input path.
    clust_label_name : str
        Name of the column holding the cluster label to group by.
    GRPBY_MEAN_COLS_LIST : list of str
        Columns to average per cluster. Skipped unless it is a non-empty list.
    GRPBY_MEDIAN_COLS_LIST : list of str
        Columns to take the median of per cluster. Skipped unless it is a
        non-empty list.
    file_prefix : str, optional
        Prefix concatenated in front of each input file name. Defaults to the
        notebook constant ``PREFIX_FILE_PATH``.
    export_prefix : str, optional
        Prefix concatenated in front of each output file name. Defaults to the
        notebook constant ``PREFIX_EXPORT_PATH``.
    encode_cols : list of str, optional
        Columns to one-hot encode. Defaults to the notebook constant
        ``ENCODE_COLS``.
    dummy_cols : list of str, optional
        One-hot result columns to cast to ``int8``. Defaults to the notebook
        constant ``DUMMY_COLS``. A name with no matching column after encoding
        (its category level does not occur in the file) is added as an
        all-zero column instead of raising ``KeyError``.

    Returns
    -------
    list of pandas.DataFrame
        One summary DataFrame per input file, in the order of ``FILE_LIST``.

    Raises
    ------
    KeyError
        Raised by pandas when a column in ``encode_cols``, the cluster label,
        ``SUB_BILLING_ID`` or a requested mean/median column is missing.
    """
    # Fall back to the notebook-level constants only when no override is given,
    # so the function also works without them when every argument is passed.
    file_prefix = PREFIX_FILE_PATH if file_prefix is None else file_prefix
    export_prefix = PREFIX_EXPORT_PATH if export_prefix is None else export_prefix
    encode_cols = ENCODE_COLS if encode_cols is None else encode_cols
    dummy_cols = DUMMY_COLS if dummy_cols is None else dummy_cols

    # One summary DataFrame per processed file
    summaries_list = []

    # Loop through file list
    for file_name in FILE_LIST:
        source_path = file_prefix + file_name

        # List progress
        print("Accessing File: " + source_path)
        df = pd.read_csv(source_path)

        # Re-encode categoricals so that group means become category shares
        encoded_df = pd.get_dummies(df, columns = encode_cols)

        # Cast the dummy columns to int8. A category level that never occurs in
        # this file produces no dummy column; its correct value is 0 for every
        # row, so create it rather than failing the whole batch with KeyError.
        absent_dummies = []
        for column in dummy_cols:
            if column not in encoded_df.columns:
                absent_dummies.append(column)
                encoded_df[column] = 0
            encoded_df[column] = encoded_df[column].astype('int8')
        if absent_dummies:
            # Surface this so a misspelled column name does not go unnoticed
            print("Dummy Columns Not Found (filled with 0): " + ", ".join(absent_dummies))

        # Cluster sizes and their share of the total
        cluster_sizes_df = encoded_df.groupby(clust_label_name).agg(clust_size = ('SUB_BILLING_ID','count')).reset_index()
        total_count = cluster_sizes_df['clust_size'].sum()
        cluster_sizes_df['clust_prop'] = cluster_sizes_df['clust_size'] / total_count

        # Begin creating the summary df
        final_df = cluster_sizes_df.copy()

        # Per-cluster means, if a non-empty list was supplied
        if isinstance(GRPBY_MEAN_COLS_LIST, list) and len(GRPBY_MEAN_COLS_LIST) > 0:
            mean_grpby_df = encoded_df.groupby(clust_label_name)[GRPBY_MEAN_COLS_LIST].mean().reset_index()
            final_df = final_df.merge(mean_grpby_df, how = 'left', on = clust_label_name)

        # Per-cluster medians, if a non-empty list was supplied
        if isinstance(GRPBY_MEDIAN_COLS_LIST, list) and len(GRPBY_MEDIAN_COLS_LIST) > 0:
            median_grpby_df = encoded_df.groupby(clust_label_name)[GRPBY_MEDIAN_COLS_LIST].median().reset_index()
            final_df = final_df.merge(median_grpby_df, how = 'left', on = clust_label_name)

        summaries_list.append(final_df)

        # Export the entire summary csv (the name keeps the input file's own extension)
        final_df.to_csv(export_prefix + file_name + "_summary.csv")

        # List progress
        print("Created Summary For File: " + source_path)
        print("")

    return summaries_list
    

In [None]:
# Build and export the per-cluster summaries; as the cell's last expression,
# the returned list of summary DataFrames is also shown as the cell output.
createGroupByCSV(
    FILE_LIST=DF_CLUST_FILE_LIST,
    clust_label_name="clust",
    GRPBY_MEAN_COLS_LIST=GRPBY_MEAN_COLS,
    GRPBY_MEDIAN_COLS_LIST=GRPBY_MEDIAN_COLS,
)