In [None]:
# Packages
import numpy as np
import pandas as pd
import snowflake.connector as snow
from snowflake.connector import pandas_tools
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import MinMaxScaler



In [None]:
# Create List to Hold DFs
# (each scored snapshot DataFrame is appended here and the list is concatenated before upload)
MASTER_PRED_LIST = []
# Names of Snowflake Table to Upload to
SF_TBL_NAME = "DESENSITIZED"
# Import Model to Use for Predictions
# NOTE(review): absolute, user-specific path — this cell only runs on a machine with this exact folder layout.
PREFIX_PATH = "C:/Users/davidl/OneDrive - ULTRA MOBILE/Desktop/dli_code/RetSeg/models/decisiontree_jun_mdl_114/"
MDL_NAME = "jun_hclust13_10p_mdl.pkl"
SCALAR_NAME = "jun_hclust13_scalar.pkl"
clust_mdl_pkl = PREFIX_PATH + MDL_NAME
# pickle.load can execute arbitrary code stored in the file; only load model files from a trusted source.
with open(clust_mdl_pkl, 'rb') as f:
    clust_mdl = pickle.load(f)
# Only the path is built here; the scaler is unpickled later by min_max_scale_import.
# PREFIX_PATH already ends in "/", so the resulting path contains "//" before "scalar".
scalar_mdl_pkl = PREFIX_PATH + "/scalar/" + SCALAR_NAME
#with open(scalar_mdl_pkl, 'rb') as f:
#    scalar_mdl = pickle.load(f)

In [None]:
# CONSTANTS & FUNCTIONS Section

# CONSTANTS
# Columns selected from the preprocessed frame before one-hot encoding
# (runPredModelMaster does subsnap_df[ORIGIN_COLS]); every active entry must
# exist after hierPreprocess. Several entries here (e.g. TENURE_MONTHS,
# PROMO_FLAG, PORTIN_FLAG) are not in ML_COLS_L3, so they are selected and
# then dropped when the frame is restricted to ML_COLS_L3.
# Commented-out entries are currently excluded.
ORIGIN_COLS = ['ACTIVATING_SALE_GROUP_NAME_grouped',
'PROMO_GROUPED',
'GSMA_OPERATING_SYSTEM_grouped',
'ZERO_USAGE_LAST_30D_FLAG',
'SUB_AUTO_RENEWAL_FLAG',
'SUB_CREDIT_CARD_FLAG',
'PLAN_CYCLE_NUM_grouped',
'FAILED_PAYMENT_GROUPED',
'MEMBER_OF_ACTIVE_FAMILY_FLAG',
#'ACS_HP_PROP', 
#'ACS_NOT_HP_ASIAN_ALONE_PROP',
#'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
#'ACS_NOT_HP_WHITE_ALONE_PROP',
#'ACS_PROP_WORKERS_OVER_16',
#'ACS_AGE_MEDIAN',
#'ACS_INCOME_MEDIAN',
#'ACS_APPROX_COMMUTE_MEDIAN',
#'NO_SCHOOL_PROP',
#'ANY_DEGREE_PROP',
#'SINGLE_MOM_PROP',
#'NEVER_MARRIED_PROP',
#'HH_WO_INT_ACCESS_PROP',
#'OCCUP_HOUS_UNIT_WO_CAR_PROP',
'UPGRADE_DOWNGRADE_DATA_FLAG', 
'UPGRADE_DOWNGRADE_DURATION_FLAG',
'SUB_ESIM_FLAG',
'PORTIN_FLAG',
#'HAD_ISSUES_PORTING_IN',
'EVER_LOGGED_INTO_APP_FLAG',
'YEAR_RELEASED_grouped',
'LTE_BAND_71',
#'contacted_care_last7d', 
'contacted_care_last30d',
'SERVICE_ISSUE_NOTE_FLAG', 
#'SIM_REPLACEMENT_NOTE_FLAG',
#'PAYMENT_NOTE_FLAG',
'TENURE_MONTHS',
'EXPECTED_CLV_PS',
'PROMO_FLAG',
'PORTIN_ISSUE_DESC',
'L1_CLUST'
]

# All Columns
# Ordered list of the post-encoding feature columns: it is passed to
# min_max_scale_import as the columns to scale and is then used to select and
# order the frame handed to the model. Names of the form "<col>_<level>" are
# the one-hot columns pd.get_dummies produces for ENCODE_COLS; some levels
# (e.g. L1_CLUST_1, 'PROMO_GROUPED_Device bundle') are deliberately left out.
# NOTE(review): the order presumably has to match what the pickled scaler and
# model were fitted on — confirm before editing.
ML_COLS_L3 = ['ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm',
    'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail', 
    'PROMO_GROUPED_Deflation', 
    #'PROMO_GROUPED_Device bundle', 
    'PROMO_GROUPED_No Promo',
    'GSMA_OPERATING_SYSTEM_grouped_iOS',
    'PLAN_CYCLE_NUM_grouped', 
    'FAILED_PAYMENT_GROUPED',
    'MEMBER_OF_ACTIVE_FAMILY_FLAG', 
    #'ACS_HP_PROP',
    #'ACS_NOT_HP_ASIAN_ALONE_PROP',
    #'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
    #'ACS_NOT_HP_WHITE_ALONE_PROP', 
    #'ACS_PROP_WORKERS_OVER_16',
    #'ACS_AGE_MEDIAN', 
    #'ACS_INCOME_MEDIAN', 
    #'ACS_APPROX_COMMUTE_MEDIAN', 
    #'NO_SCHOOL_PROP',
    #'ANY_DEGREE_PROP',
    #'SINGLE_MOM_PROP',
    #'NEVER_MARRIED_PROP',
    #'HH_WO_INT_ACCESS_PROP',
    #'OCCUP_HOUS_UNIT_WO_CAR_PROP',
    'UPGRADE_DOWNGRADE_DATA_FLAG', 
    'UPGRADE_DOWNGRADE_DURATION_FLAG',
    'SUB_ESIM_FLAG', 
    #'HAD_ISSUES_PORTING_IN', 
    'EVER_LOGGED_INTO_APP_FLAG',
    'LTE_BAND_71',
    #'contacted_care_last7d', 
    'contacted_care_last30d',
    'SERVICE_ISSUE_NOTE_FLAG',
    #'SIM_REPLACEMENT_NOTE_FLAG', 
    #'PAYMENT_NOTE_FLAG',
    'EXPECTED_CLV_PS',
    'PORTIN_ISSUE_DESC_NoIssues',
    'PORTIN_ISSUE_DESC_NonPortin',
    'L1_CLUST_0',
    #'L1_CLUST_1',
    'L1_CLUST_2',
    'L1_CLUST_3',
    'L1_CLUST_4',
    'L1_CLUST_5',
    'L1_CLUST_6'
]

# Columns to Encode
# (passed as `columns=` to pd.get_dummies in runPredModelMaster; each becomes "<col>_<level>" columns)
ENCODE_COLS = ['ACTIVATING_SALE_GROUP_NAME_grouped', 'PROMO_GROUPED', 'GSMA_OPERATING_SYSTEM_grouped', 'PORTIN_ISSUE_DESC', 'L1_CLUST']

# Columns to Stratify On
# NOTE(review): not referenced by any other cell visible in this notebook — presumably a leftover from training.
STRATIFY_COLS = ['BLK_GRP', 'PROMO_FLAG', 'PROMO_GROUPED', 'LTE_BAND_71', 'ACTIVATING_SALE_GROUP_NAME']

# FUNCTIONS
def categorize_sale_group(column):
    """Bucket a raw activating-sale-group value into a coarse channel label.

    The value is converted to a string and searched for the known channel
    names in a fixed priority order ('Direct EComm', 'National Retail',
    'Campus SIMs'); the first one found is returned. Anything else,
    including missing values, maps to 'Other'.
    """
    text = str(column)
    for channel in ('Direct EComm', 'National Retail', 'Campus SIMs'):
        if channel in text:
            return channel
    return 'Other'

def categorize_device_year(column):
    """Bucket a device release year into one of 2019..2024.

    Parameters:
    - column: a single release-year value (number, numeric string, None, NaN, ...).

    Returns:
    - The year itself (as int) when it is 2020-2024.
    - 2022 when the value is missing, unparseable, or exactly 0.
    - 2019 for every other year.
      NOTE(review): this includes years after 2024, which also land in the
      2019 bucket — confirm that is intended.
    """
    try:
        # Go through float first so strings such as '2021.0' parse.
        year = int(float(column))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numbers; OverflowError: +/-infinity. All count as "unknown".
        year = 0

    if 2020 <= year <= 2024:
        return year
    if year == 0:  # accounting for majority of devices being Galaxy A53 5G
        return 2022
    return 2019

def categorize_os(column):
    """Collapse a raw operating-system value into 'Android', 'iOS' or 'Other'.

    The value is stringified and matched by substring. 'Android' has the
    highest priority. 'KaiOS' literally contains the substring 'iOS', so it
    is excluded explicitly and falls into 'Other'.
    """
    text = str(column)
    if 'Android' in text:
        return 'Android'
    # Without the KaiOS guard, 'KaiOS' would be misread as iOS.
    if 'iOS' in text and 'KaiOS' not in text:
        return 'iOS'
    return 'Other'

def categorize_plan_cycle_num(column):
    """Cap a plan-cycle number at 4.

    Values less than or equal to 3 are returned unchanged; everything else
    becomes 4. NaN also becomes 4, because `NaN <= 3` is False.
    """
    return column if column <= 3 else 4

# Fit/Transforms a Scalar object on provided df, and exports a scalar object
def min_max_scale_export(df, columns_to_scale, scalar_path):
    """
    Fit a MinMaxScaler (feature range 0..1) on the given columns, scale them,
    and pickle the fitted scaler.

    Parameters:
    - df: pandas DataFrame. The listed columns are overwritten in place with
      their scaled values.
    - columns_to_scale: List of column names to fit and scale.
    - scalar_path: File path the fitted scaler is pickled to (opened in 'wb'
      mode, so an existing file is overwritten).

    Returns:
    - df[columns_to_scale], i.e. only the scaled columns.
    """
    # Scale to [0, 1] based on the min/max observed in this DataFrame
    fitted_scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = fitted_scaler.fit_transform(df[columns_to_scale])

    # Write the scaled values back onto the caller's DataFrame
    df[columns_to_scale] = scaled_values

    # Persist the fitted scaler so the same transform can be re-applied later
    with open(scalar_path, 'wb') as scaler_file:
        pickle.dump(fitted_scaler, scaler_file)

    return df[columns_to_scale]

# Applies a provided scalar object to the df
def min_max_scale_import(df, columns_to_scale, scalar_path):
    """
    Unpickle a previously saved scaler and apply it to the given columns.

    Parameters:
    - df: pandas DataFrame. The listed columns are overwritten in place with
      their transformed values.
    - columns_to_scale: List of column names to transform. They are passed to
      the scaler in this order.
    - scalar_path: Path to the pickled scaler object. pickle.load can execute
      arbitrary code, so the file must come from a trusted source.

    Returns:
    - df[columns_to_scale], i.e. only the transformed columns.
    """
    # Restore the fitted scaler from disk
    with open(scalar_path, 'rb') as scaler_file:
        saved_scaler = pickle.load(scaler_file)

    # Transform only (no re-fitting) and write back onto the caller's DataFrame
    transformed_values = saved_scaler.transform(df[columns_to_scale])
    df[columns_to_scale] = transformed_values

    return df[columns_to_scale]

# Define Pre-processing Steps
def hierPreprocess(df):
    """
    Impute ACS statistics and derive the grouped/bucketed feature columns.

    The DataFrame is modified in place and the same object is returned.

    Parameters:
    - df: pandas DataFrame containing the raw columns read below (the ACS_*
      and TOTAL_POP_MEDIAN_AGE columns, ACTIVATING_SALE_GROUP_NAME,
      YEAR_RELEASED, GSMA_OPERATING_SYSTEM, PLAN_CYCLE_NUM, CNT_NOTES_LAST_7D,
      CNT_NOTES_LAST_30D, PORTIN_FLAG, HAD_ISSUES_PORTING_IN). A missing
      column raises KeyError.

    Returns:
    - The same DataFrame with these columns added or overwritten:
      ACS_AGE_MEDIAN, ACS_INCOME_MEDIAN, the median-imputed ACS_* columns,
      ACTIVATING_SALE_GROUP_NAME_grouped, YEAR_RELEASED_grouped,
      GSMA_OPERATING_SYSTEM_grouped, PLAN_CYCLE_NUM_grouped,
      contacted_care_last7d, contacted_care_last30d, PORTIN_ISSUE_DESC.
    """
    ## Impute ACS Statistic Measures where NA
    # Age: prefer TOTAL_POP_MEDIAN_AGE, then the approximate median, then the column median
    df['ACS_AGE_MEDIAN'] = df['TOTAL_POP_MEDIAN_AGE'].fillna(df['ACS_APPROX_AGE_MEDIAN'])
    df['ACS_AGE_MEDIAN'] = df['ACS_AGE_MEDIAN'].fillna(df['ACS_AGE_MEDIAN'].median())
    # Income: new column, the source column is left untouched
    df['ACS_INCOME_MEDIAN'] = df['ACS_APPROX_INCOME_MEDIAN'].fillna(df['ACS_APPROX_INCOME_MEDIAN'].median())
    # Commute, ethnicity proportions and worker proportion: fill NA with the column's own median
    for acs_col in (
        'ACS_APPROX_COMMUTE_MEDIAN',
        'ACS_HP_PROP',
        'ACS_NOT_HP_ASIAN_ALONE_PROP',
        'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
        'ACS_NOT_HP_WHITE_ALONE_PROP',
        'ACS_PROP_WORKERS_OVER_16',
    ):
        df[acs_col] = df[acs_col].fillna(df[acs_col].median())

    # Use Categorization/Bucketing Functions
    df['ACTIVATING_SALE_GROUP_NAME_grouped'] = df['ACTIVATING_SALE_GROUP_NAME'].map(categorize_sale_group)
    df['YEAR_RELEASED_grouped'] = df['YEAR_RELEASED'].map(categorize_device_year)
    df['GSMA_OPERATING_SYSTEM_grouped'] = df['GSMA_OPERATING_SYSTEM'].map(categorize_os)
    df['PLAN_CYCLE_NUM_grouped'] = df['PLAN_CYCLE_NUM'].map(categorize_plan_cycle_num)
    df['contacted_care_last7d'] = np.where(df['CNT_NOTES_LAST_7D'] > 0, 1, 0)
    df['contacted_care_last30d'] = np.where(df['CNT_NOTES_LAST_30D'] > 0, 1, 0)

    def _flag_is(col, value):
        # Plain boolean array; missing values count as "not equal", so such
        # rows fall through to 'UnKnown' below.
        return (df[col] == value).fillna(False).to_numpy(dtype=bool)

    # PORTIN_ISSUE_DESC, computed column-wise instead of with a row-wise
    # df.apply: same labels, far faster on large frames, and it also works on
    # an empty frame. Conditions are evaluated in order; the first match wins.
    is_portin = _flag_is('PORTIN_FLAG', 1)
    df['PORTIN_ISSUE_DESC'] = np.select(
        [
            is_portin & _flag_is('HAD_ISSUES_PORTING_IN', 1),
            is_portin & _flag_is('HAD_ISSUES_PORTING_IN', 0),
            _flag_is('PORTIN_FLAG', 0),
        ],
        ['HadIssues', 'NoIssues', 'NonPortin'],
        default='UnKnown',  # In case there are other combinations
    )

    return df

# Columns cast to plain int8 by the coerceDataTypes_* functions.
_COERCE_INT8_COLS = (
    'PROMO_GROUPED_Deflation',
    'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail',
    'PLAN_CYCLE_NUM_grouped',
    'FAILED_PAYMENT_GROUPED',
    'MEMBER_OF_ACTIVE_FAMILY_FLAG',
    'SUB_ESIM_FLAG',
    'PORTIN_ISSUE_DESC_NoIssues',
    'PORTIN_ISSUE_DESC_NonPortin',
    'EVER_LOGGED_INTO_APP_FLAG',
    'LTE_BAND_71',
    'UPGRADE_DOWNGRADE_DATA_FLAG',
    'UPGRADE_DOWNGRADE_DURATION_FLAG',
    'contacted_care_last30d',
    'SERVICE_ISSUE_NOTE_FLAG',
    'ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm',
    'PROMO_GROUPED_No Promo',
    'GSMA_OPERATING_SYSTEM_grouped_iOS',
)

# Columns cast to pandas' nullable Int8 so that missing L1 clusters stay <NA>.
_COERCE_NULLABLE_INT8_COLS = (
    'L1_CLUST_0',
    'L1_CLUST_2',
    'L1_CLUST_3',
    'L1_CLUST_4',
    'L1_CLUST_5',
    'L1_CLUST_6',
)

# Columns cast to the float width chosen by the calling variant.
_COERCE_FLOAT_COLS = ('EXPECTED_CLV_PS',)


def _coerce_model_dtypes(df, float_dtype):
    """Shared implementation of coerceDataTypes_small / coerceDataTypes_med.

    Casts the model feature columns to compact dtypes on the caller's
    DataFrame (in place), printing memory use before and after plus the
    resulting dtypes. `float_dtype` is the dtype name used for the float
    columns. A missing column raises KeyError.
    """
    print("Memory Used Before Coercing: ")
    print(df.memory_usage(deep=True).sum()/1000/1000/1000)

    # NOTE: astype('int8') truncates any fractional part and fails on NaN.
    for col in _COERCE_INT8_COLS:
        df[col] = df[col].astype('int8')
    for col in _COERCE_FLOAT_COLS:
        df[col] = df[col].astype(float_dtype)
    for col in _COERCE_NULLABLE_INT8_COLS:
        df[col] = df[col].astype(pd.Int8Dtype())

    print(df.dtypes)

    print("Memory Used After Coercing")
    print(df.memory_usage(deep=True).sum()/1000/1000/1000)
    print("")

    return df


def coerceDataTypes_small(df):
    """
    Downcast the model feature columns, storing floats as float16.

    Flag and dummy columns become int8, EXPECTED_CLV_PS becomes float16 and
    the L1_CLUST_* dummies become nullable Int8. The DataFrame is modified
    in place and returned. Memory use (in GB) is printed before and after.
    """
    return _coerce_model_dtypes(df, 'float16')


def coerceDataTypes_med(df):
    """
    Downcast the model feature columns, storing floats as float32.

    Identical to coerceDataTypes_small except that EXPECTED_CLV_PS is kept
    at float32. The DataFrame is modified in place and returned.
    """
    return _coerce_model_dtypes(df, 'float32')

In [None]:
# Write to Snowflake
def get_col_types(df):

    '''
        Build the "COL TYPE, COL TYPE, ..." column definition string used in a
        Snowflake CREATE TABLE statement, from a DataFrame's dtypes.
        If needed to be created manually, can also be created in "A NUMERIC, B VARCHAR, C FLOAT, D DATETIME" format

        Column names and dtype names are upper-cased. dtype names are then
        mapped as follows; anything unmatched is passed through unchanged:
            OBJECT / STR / STRING   -> VARCHAR
            BOOL                    -> BOOLEAN
            contains DATE           -> DATETIME
            contains INT            -> NUMERIC
            contains FLOAT          -> FLOAT

        args:
            df: dataframe to evaluate

        returns:
            a single string of comma-separated "NAME TYPE" pairs, in column order
            (empty string for a frame with no columns)
    '''

    pairs = []
    # Iterate names and dtypes directly rather than through
    # df.dtypes.reset_index(): that produced a column called 'index' only
    # when df.columns was unnamed, and raised KeyError otherwise.
    for col_name, dtype in zip(df.columns, df.dtypes):
        sf_type = str(dtype).upper()  # case matching as snowflake needs it in uppers

        if sf_type in ('OBJECT', 'STR', 'STRING'):
            sf_type = 'VARCHAR'
        if sf_type == 'BOOL':
            sf_type = 'BOOLEAN'
        if 'DATE' in sf_type:
            sf_type = 'DATETIME'
        if 'INT' in sf_type:
            sf_type = 'NUMERIC'
        if 'FLOAT' in sf_type:
            sf_type = 'FLOAT'

        pairs.append(str(col_name).upper() + ' ' + sf_type)

    return ', '.join(pairs).strip()

def create_table(table, action, conn, col_type, df, schema="SANDBOX.STRATEGY_FINANCE"):

    '''
        Function to create/replace and append to tables in Snowflake

        args:
            table: name of the table to create/modify
            action: 'create_replace' to (re)create the table and load df, or
                    'append' to load df into the existing table
            conn: open Snowflake connection
            col_type: string with column name associated dtype, each pair separated by a comma; comes from get_col_types() func
            df: dataframe to load; its column names are upper-cased IN PLACE
                so they match the unquoted (upper-case) table columns
            schema: fully qualified schema selected with USE SCHEMA before any
                    other statement

        raises:
            ValueError: if action is not one of the two supported values

        dependencies: function get_col_types(); helper function to get the col and dtypes to create a table

        NOTE: 'create_replace' drops any existing table of the same name.
        NOTE: table, col_type and schema are concatenated into SQL text
              (identifiers cannot be bound as parameters) — only pass trusted values.
    '''

    if action not in ('create_replace', 'append'):
        # Previously an unknown action silently did nothing.
        raise ValueError("action must be 'create_replace' or 'append', got: " + repr(action))

    # One cursor for all statements, always released
    cur = conn.cursor()
    try:
        cur.execute("USE SCHEMA " + schema)

        if action == 'create_replace':
            setup_sql = " CREATE OR REPLACE TABLE \n            " + table + "(" + col_type + ")"

            print(setup_sql)

            cur.execute(setup_sql)

        # prep to ensure proper case (done for both actions so appends match too)
        df.columns = [col.upper() for col in df.columns]

        # write df to table
        pandas_tools.write_pandas(conn, df, table.upper())
    finally:
        cur.close()

In [None]:
# Connection Details
# Opens a Snowflake session using the "externalbrowser" authenticator
# (presumably browser-based SSO — running this cell will prompt for login).
# NOTE(review): `server` is not one of the commonly documented connect()
# arguments — confirm it is actually honoured.
# NOTE(review): nothing in the visible notebook closes `con`/`cur`.
con = snow.connect(
    user="DESENSITIZED",
    server="DESENSITIZED",
    database="DESENSITIZED",
    warehouse="DESENSITIZED",
    authenticator="externalbrowser",
    account="DESENSITIZED"
)

# Module-level cursor used by the data-pull cell(s) below
cur = con.cursor()

In [None]:
def predNewData(dt_pkl, pred_df, create_pred_file):
    """
    Predict a cluster for every row whose L1 cluster dummies are all present.

    IMPORTANT: Assumes data is already scaled.

    Parameters:
    - dt_pkl: either an object with a `predict` method, or a string path to a
      pickled model (pickle.load can run arbitrary code — trusted files only).
    - pred_df: pandas DataFrame of model features. A 'pred_clust' column is
      added to (or reset on) this DataFrame in place.
    - create_pred_file: two-item sequence [flag, path]; when flag upper-cases
      to "Y" the result is also written to `path` as CSV.

    Returns:
    - pred_df with 'pred_clust' filled for scorable rows and NaN for rows where
      any L1_CLUST_* dummy is null.
    - The integer 0 (after printing an error) when dt_pkl is neither a model
      nor a string. Callers that index the result must check for this.
    """
    # If dt_pkl is a direct model object
    if hasattr(dt_pkl, 'predict'):
        loaded_model = dt_pkl
    # If dt_pkl is the string name of the path to the pickled model
    elif isinstance(dt_pkl, str):
        with open(dt_pkl, 'rb') as f:
            loaded_model = pickle.load(f)
    else:
        print("ERROR: MODEL OBJECT UNKNOWN")
        return 0

    print("Predicting On Given DF..")

    # Rows with any null L1 cluster dummy cannot be scored
    l1_dummy_cols = ['L1_CLUST_0', 'L1_CLUST_2', 'L1_CLUST_3', 'L1_CLUST_4', 'L1_CLUST_5', 'L1_CLUST_6']
    scorable = ~pred_df[l1_dummy_cols].isnull().any(axis=1)

    # Initialize the prediction column with NaN
    pred_df['pred_clust'] = np.nan

    # Only call the model when there is at least one scorable row; estimators
    # typically reject a zero-row input.
    if scorable.any():
        features = pred_df.loc[scorable].drop(columns=['pred_clust'])
        pred_df.loc[scorable, 'pred_clust'] = loaded_model.predict(features)

    # Save the predicted DataFrame to a file if requested
    if create_pred_file[0].upper() == "Y":
        pred_df.to_csv(create_pred_file[1], index=False)

    return pred_df

In [None]:
# Define Master Flow for Prediction After Data Import
def runPredModelMaster(df, mdl):
    """
    End-to-end scoring flow for one snapshot: fill NAs, preprocess, encode,
    scale, downcast, predict.

    Parameters:
    - df: raw snapshot DataFrame. It is modified IN PLACE (NA fills, derived
      columns, duplicate rows dropped, 'pred_clust' added) and the same object
      is returned.
    - mdl: model object or pickle path, forwarded to predNewData.

    Returns:
    - The input DataFrame with a nullable-Int8 'pred_clust' column; rows with
      a missing L1_CLUST get <NA>.

    Relies on notebook globals: ORIGIN_COLS, ENCODE_COLS, ML_COLS_L3,
    scalar_mdl_pkl, and the helper functions defined above. Also sets the
    pandas option display.max_rows to 100 as a side effect.
    """
    def _print_na_counts(frame):
        # Per-column missing-value counts, largest first
        pd.set_option('display.max_rows', 100)
        print(frame.isna().sum().sort_values(ascending=False))

    _print_na_counts(df)

    # Handle NAs appropriately (column -> fill value)
    na_fill_values = {
        'VOLTE_USAGE_LAST_7D': 0,
        'ZERO_USAGE_LAST_30D_FLAG': 0,
        'SERVICE_ISSUE_NOTE_FLAG': 0,
        'SIM_REPLACEMENT_NOTE_FLAG': 0,
        'PAYMENT_NOTE_FLAG': 0,
        'LTE_BAND_71': 1,
        'HAD_ISSUES_PORTING_IN': 0,
        'EXPECTED_CLV_PS': 0,
        'UPGRADE_DOWNGRADE_DATA_FLAG': 0,
        'CNT_NOTES_LAST_30D': 0,
        'CNT_NOTES_LAST_7D': 0,
        'PLAN_CYCLE_NUM': 1,
    }
    for col, fill_value in na_fill_values.items():
        df[col] = df[col].fillna(fill_value)
    # L1_CLUST is intentionally NOT filled: rows without it get no prediction.

    # Report again so the effect of the fills is visible
    _print_na_counts(df)

    # Run Preprocessing (returns the same object as df)
    subsnap_df = hierPreprocess(df)

    # Perform Drop Duplicates
    subsnap_df.drop_duplicates(inplace = True)

    # Enforce the incoming L1 Clusters as Integers
    # Coerce the column to int8, leaving NA values as NA
    subsnap_df['L1_CLUST'] = subsnap_df['L1_CLUST'].astype(pd.Int8Dtype())

    # Prep
    model_df = subsnap_df[ORIGIN_COLS]
    # dtype=float keeps the dummy columns numeric on every pandas version, so
    # NaN can be written into them below without an implicit dtype change.
    model_df_dummies = pd.get_dummies(model_df, columns = ENCODE_COLS, dtype = float)

    # A level that does not occur in this batch produces no dummy column at
    # all; add it as all-zero so the scaler/model still see every expected column.
    encoded_prefixes = tuple(col + '_' for col in ENCODE_COLS)
    absent_dummies = [
        col for col in ML_COLS_L3
        if col.startswith(encoded_prefixes) and col not in model_df_dummies.columns
    ]
    if absent_dummies:
        print("WARNING: categories absent from this batch, filled with 0:", absent_dummies)
        for col in absent_dummies:
            model_df_dummies[col] = 0.0

    # Rows where L1_CLUST was NaN must carry NaN (not 0) in every L1_CLUST_*
    # dummy so predNewData can recognise and skip them.
    na_rows = model_df['L1_CLUST'].isna()
    l1_dummy_cols = [col for col in model_df_dummies.columns if col.startswith('L1_CLUST_')]
    model_df_dummies.loc[na_rows, l1_dummy_cols] = np.nan

    print(model_df_dummies.columns.values)

    # NOTE(review): every ML_COLS_L3 column is min-max scaled and then
    # coerceDataTypes_small casts e.g. PLAN_CYCLE_NUM_grouped and
    # FAILED_PAYMENT_GROUPED to int8, which truncates fractional scaled
    # values — confirm the model was trained through the same pipeline.
    model_df_dummies = min_max_scale_import(model_df_dummies, ML_COLS_L3, scalar_mdl_pkl)
    model_df_dummies = coerceDataTypes_small(model_df_dummies)

    # Reorder Columns
    model_df_dummies = model_df_dummies[ML_COLS_L3]

    # Run June Predictive Model, get predicted cluster column (aligned back by index)
    subsnap_df["pred_clust"] = predNewData(dt_pkl = mdl, pred_df = model_df_dummies, create_pred_file = ["n", "pred_clust_test.csv"])['pred_clust']

    # Coerce for reducing format error likelihood
    subsnap_df["pred_clust"] = subsnap_df["pred_clust"].astype(pd.Int8Dtype())

    return subsnap_df

## December

In [None]:
# Query text for the subscriber pull (redacted here); presumably the December snapshot, per the section heading.
dec_subs_sql = '''
DESENSITIZED
'''

# Run the query on the module-level cursor and load the full result set into a DataFrame
cur.execute(dec_subs_sql)
dec_df = cur.fetch_pandas_all()

In [None]:
# Validate Incoming Data
# (row/column counts and the first five rows — review output for PII before sharing the notebook)
print(dec_df.shape)
print(dec_df.head())

In [None]:
# Execute Model Prediction and Create Column
# runPredModelMaster modifies dec_df in place and returns that same object,
# so dec_df_w_pred and dec_df refer to one DataFrame.
# NOTE: re-running this cell appends the frame to MASTER_PRED_LIST again,
# which would duplicate its rows in the concatenated upload.
dec_df_w_pred = runPredModelMaster(dec_df, clust_mdl)
MASTER_PRED_LIST.append(dec_df_w_pred)

## Consolidate

In [None]:
# Full set of columns every scored DataFrame is expected to have: the raw
# snapshot columns plus the columns added during preprocessing and prediction.
# The sanity-check cell compares this to each frame's columns as a set (order is ignored).
columns_assertion_list = ['SUB_BILLING_ID', 'CHURNED_SUB_FLAG', 'SUB_AUTO_RENEWAL_FLAG',
       'SUB_CREDIT_CARD_FLAG', 'SUB_ESIM_FLAG',
       'ACTIVATING_SALE_GROUP_NAME', 'PORTIN_FLAG',
       'HAD_ISSUES_PORTING_IN', 'GSMA_OPERATING_SYSTEM', 'YEAR_RELEASED',
       'LTE_BAND_71', 'PLAN_CYCLE_NUM', 'PROMO_FLAG', 'PROMO_GROUPED',
       'MEMBER_OF_ACTIVE_FAMILY_FLAG', 'EVER_LOGGED_INTO_APP_FLAG',
       'UPGRADE_DOWNGRADE_DURATION_FLAG', 'UPGRADE_DOWNGRADE_DATA_FLAG',
       'TENURE_MONTHS', 'EXPECTED_CLV_PS', 'FAILED_PAYMENT_GROUPED',
       'COVERAGE_CLASS_4G', 'VOLTE_USAGE_LAST_7D',
       'ZERO_USAGE_LAST_30D_FLAG', 'CNT_NOTES_LAST_30D',
       'CNT_NOTES_LAST_7D', 'SERVICE_ISSUE_NOTE_FLAG',
       'SIM_REPLACEMENT_NOTE_FLAG', 'PAYMENT_NOTE_FLAG',
       'TOTAL_POP_MEDIAN_AGE', 'ACS_HP_PROP',
       'ACS_NOT_HP_ASIAN_ALONE_PROP',
       'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
       'ACS_NOT_HP_WHITE_ALONE_PROP', 'ACS_NOT_HP_OTHER_POP',
       'ACS_PROP_WORKERS_OVER_16', 'ACS_APPROX_AGE_MEDIAN',
       'ACS_APPROX_INCOME_MEDIAN', 'ACS_APPROX_COMMUTE_MEDIAN',
       'NO_SCHOOL_PROP', 'ANY_DEGREE_PROP', 'SINGLE_MOM_PROP',
       'NEVER_MARRIED_PROP', 'HH_WO_INT_ACCESS_PROP',
       'OCCUP_HOUS_UNIT_WO_CAR_PROP', 'L1_CLUST', 'ACS_AGE_MEDIAN',
       'ACS_INCOME_MEDIAN', 'ACTIVATING_SALE_GROUP_NAME_grouped',
       'YEAR_RELEASED_grouped', 'GSMA_OPERATING_SYSTEM_grouped',
       'PLAN_CYCLE_NUM_grouped', 'contacted_care_last7d',
       'contacted_care_last30d', 'PORTIN_ISSUE_DESC', 'SNAPSHOT_DATE','pred_clust']

In [None]:
# Sanity Checks
# How many scored frames were collected
print(len(MASTER_PRED_LIST))
# Every frame must expose exactly the expected column set (order ignored);
# one warning line is printed per frame that does not.
expected_column_set = set(columns_assertion_list)
for df in MASTER_PRED_LIST:
    frame_column_set = set(df.columns.values)
    if frame_column_set != expected_column_set:
        print("WARNING: UNMATCHED COLUMNS FOUND")

In [None]:
# Concatenate and Keep Only Sub_Billing_ID, Snapshot_date, Cluster
concat_df = pd.concat(MASTER_PRED_LIST, axis=0, ignore_index=True)
# .copy() makes this an independent frame, so the column edits below do not
# act on a slice of concat_df (which triggers SettingWithCopyWarning).
concat_short_df = concat_df[["SUB_BILLING_ID", "SNAPSHOT_DATE", "pred_clust"]].copy()
concat_short_df.columns = concat_short_df.columns.str.upper()
# Add Current Datetime for Tracking When Uploaded
PD_CURR_DT = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
# Same value on every row, formatted as an ISO-8601 string.
# NOTE(review): pd.Timestamp.now() is local wall-clock time, yet the trailing
# "Z" marks the value as UTC — confirm which one consumers of UPDATE_DT expect.
concat_short_df["UPDATE_DT"] = pd.Timestamp(PD_CURR_DT).strftime('%Y-%m-%dT%H:%M:%SZ')

In [None]:
# Validate
# Bare expression: the notebook renders the frame that is about to be uploaded.
concat_short_df

In [None]:
# Write
# Derive the "COL TYPE, ..." definition from the frame, then (re)create the
# target table and load the frame into it.
# NOTE: 'create_replace' issues CREATE OR REPLACE TABLE, so any existing
# contents of SF_TBL_NAME are discarded.
snowflake_syntax_enforcement = get_col_types(concat_short_df)
create_table(SF_TBL_NAME, 'create_replace',con, snowflake_syntax_enforcement, concat_short_df)