# Feature List Aggregation
This code pulls in the feature lists for all directories and combines them into a single list. This is done for all of the different models that we have, to get a better idea of which features are used in which model. This includes aggregating at the model level (all submodels for different marketing segments or portions) as well as at a full-detail level.

## Version:
0.1 - Created initial script based on previous Feature List Semi Gross Model script

In [None]:
import sys
# !conda install --yes --prefix {sys.prefix}  "pandas >=1.1.0"  "s3fs>=0.4.2" regex boto3

In [None]:
#Import all standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pickle
import seaborn as sns
import boto3 
import s3fs

from s3fs.core import S3FileSystem
#Import new packages
import os
import regex as re

In [None]:
# Use this kernel option so a cell shows the value of multiple statements at
# once, instead of only the value of its last expression
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

## Grab all files and make a dictionary
Create a dictionary that shows the location for each type of model. This gives an easy-to-understand name for each overall model, including separating propensity models from whatever sales category is used. The result is a comprehensive list of all ongoing models.

In [None]:
# Define the S3 resource and the S3 client shared by the rest of the notebook
# NOTE(review): `s3` is not referenced in the cells visible here - confirm it is still needed
s3 = boto3.resource('s3')
s3client = boto3.client('s3')

# Create a generic paginator over list_objects_v2; get_feature_lists_locations
# uses it to walk every object stored under a model folder
paginator = s3client.get_paginator('list_objects_v2')

In [None]:
# Friendly model name -> S3 folder location for that model.
# Every folder lives under the same Production_Models root, so only the part
# below the root is spelled out per model.
_production_models_root = "s3://bluestembrands-bi-obmarketing-zone-prod/Production_Models/"

_model_subfolders = [
    ("Yearly_Response", "yearly_models/propensity_models/"),
    ("Yearly_Semi_Gross_Profit", "yearly_models/semi_gross_profit_models/"),
    ("Yearly_Net_Sales_Models", "yearly_models/net_sales_models/"),
    ("Monthly_Response_Models", "monthly_models/propensity_models/"),
    ("Monthly_Net_Sales_Models", "monthly_models/net_sales_models/"),
    ("Marketing_Discount_Amount", "discount_models/marketing_disc_amt/"),
    ("Marketing_Discount_Users", "discount_models/marketing_disc_user/"),
    ("Other_Discount_Amount", "discount_models/other_disc_amt/"),
    ("Other_Discount_Users", "discount_models/other_disc_user/"),
    ("Catalog_Response", "catalog_models/propensity_models/"),
    ("Catalog_Net_Demand_amt", "catalog_models/net_demand_amt/"),
    ("Catalog_Operating_Profit", "catalog_models/operating_profit/"),
]

# Insertion order follows the list above, so the key order is unchanged
model_location_dict = {
    model_name: f"{_production_models_root}{subfolder}"
    for model_name, subfolder in _model_subfolders
}

model_location_dict

### Define functions
There is one function that finds the feature-list files for all submodels of a model, and one that loads those files and combines them into a master feature list for each type of model.

In [None]:
## Create a function that will create a list of all feature files from an output location:
def get_feature_lists_locations(folder_location):
    """Return the S3 keys of every ``features.csv`` file under a model folder.

    Parameters
    ----------
    folder_location : str
        Folder URI of the form ``s3://<bucket>/<prefix>``.

    Returns
    -------
    list of str
        Object keys (relative to the bucket) that end with ``features.csv``,
        in the order the listing returns them. Empty when nothing matches.

    Raises
    ------
    ValueError
        If ``folder_location`` is not an ``s3://`` URI containing a bucket name.
    """
    scheme = "s3://"
    if not folder_location.startswith(scheme):
        raise ValueError(f"Expected an s3:// folder location, got: {folder_location!r}")

    # Everything up to the first "/" is the bucket, the remainder is the key
    # prefix. Splitting once avoids treating the bucket name as a regex and
    # avoids stripping a repeated bucket name from deeper in the path.
    bucket_name, _, pre_key = folder_location[len(scheme):].partition("/")
    if not bucket_name:
        raise ValueError(f"No bucket name found in folder location: {folder_location!r}")

    #Define empty list
    feature_name_list = []

    #Search through every page of the listing for feature files
    for response in paginator.paginate(Bucket=bucket_name, Prefix=pre_key):
        # S3 leaves 'Contents' out of the response when a page has no objects,
        # so an empty or mistyped folder yields an empty list, not a KeyError
        for object_data in response.get('Contents', []):
            key = object_data['Key']
            if key.endswith('features.csv'):
                feature_name_list.append(key)

    return feature_name_list

In [None]:
def get_features( location_list , folder_location ):
    """Load every feature file of one model and outer-join them on feature name.

    Parameters
    ----------
    location_list : list of str
        Object keys of the feature CSV files, relative to the bucket.
    folder_location : str
        ``s3://`` folder URI; only its bucket name is used here.

    Returns
    -------
    pandas.DataFrame
        One ``Column_Names`` column (the first column of each file, renamed)
        plus a ``<position>_contains`` column per file that is True for the
        rows coming from that file. ``<position>`` is the file's index in
        ``location_list``.
    """
    #Pull the "s3://<bucket>" part out of the folder, then strip the scheme
    bucket_match = re.search("s3://[\\w-_]*", folder_location)
    bucket_name = re.sub("s3://", "", bucket_match[0])

    #Read all files up front, in the order they were listed
    submodel_frames = [
        pd.read_csv(f"s3://{bucket_name}/{file_key}") for file_key in location_list
    ]

    #Empty frame keyed on Column_Names that every file is joined onto
    combined_features = pd.DataFrame(columns=["Column_Names"])

    for position, frame in enumerate(submodel_frames):
        #Whatever the first column is called in the file, treat it as the feature name
        first_column = frame.columns[0]
        frame.rename(columns={first_column: 'Column_Names'}, inplace=True)
        #Flag column named after the file's position so each file stays distinguishable
        frame[f'{position}_contains'] = True
        combined_features = combined_features.merge(frame, how='outer', on="Column_Names")

    return combined_features
    

### Grab the Features for each model


In [None]:
#Model names taken from the keys of model_location_dict, in dictionary order
model_list = [*model_location_dict]

# model_list

In [None]:
#Build a lookup of model name -> feature names (the Column_Names column) for every model
all_model_features_dict = {}

for i, name in enumerate(model_list):
    model_folder = model_location_dict[name]
    #Locate every feature file stored under this model's folder
    temp_location_list = get_feature_lists_locations(folder_location=model_folder)
    #Load those files and join them into a single frame for the model
    temp_features_df = get_features(location_list=temp_location_list, folder_location=model_folder)
    #Keep only the feature names; the per-submodel flag columns are not needed here
    all_model_features_dict[name] = temp_features_df['Column_Names']
    print(f"Finished with {name}: {i+1} of {len(model_list)}")

all_model_features_dict.keys()

In [None]:
## Start from an empty frame keyed on Column_Names; every model is outer-joined onto it
total_features_df = pd.DataFrame(columns=["Column_Names"])

#Add one "<model>_contains" flag column per model, set to True for the features it uses
for model_name, feature_names in all_model_features_dict.items():
    model_flags = pd.DataFrame(feature_names, columns=["Column_Names"])
    model_flags[f'{model_name}_contains'] = True
    total_features_df = total_features_df.merge(model_flags, how='outer', on="Column_Names")

#The outer joins leave gaps where a model does not use a feature; record those as False
total_features_df.fillna(False, inplace = True)

In [None]:
#Get a column that counts how many models use each feature and sort using it.
#Only the "<model>_contains" flag columns are summed: summing the whole frame
#would also include the string Column_Names column (a TypeError on pandas >= 2.0,
#which no longer drops non-numeric columns silently) and would add the previous
#Total_Model_Uses into the count whenever this cell is re-run.
contains_columns = [col for col in total_features_df.columns if str(col).endswith('_contains')]
#astype(bool) keeps the count an integer even if the flags are stored as objects
total_features_df['Total_Model_Uses'] = total_features_df[contains_columns].astype(bool).sum(axis=1)
total_features_df.sort_values(by="Total_Model_Uses", ascending=False, inplace=True)

#View this information
total_features_df

In [None]:
#Read the feature metadata, rename its key column to match the feature table
#("col_name" -> "Column_Names") and drop the replacement_type column
meta_data = (
    pd.read_csv("s3://bluestembrands-bi-obmarketing-zone-prod/Feature_Classification/all_features_metadata.csv")
    .rename(columns={"col_name": "Column_Names"})
    .drop(columns=['replacement_type'])
)

meta_data

In [None]:
#Attach the metadata to the feature usage table. The right join keeps every row
#of total_features_df, so features without a metadata entry are still listed
all_features_meta = pd.merge(meta_data, total_features_df, how='right', on="Column_Names")

all_features_meta

In [None]:
### Output the features as a .csv file
# all_features_meta.to_csv( "s3://bluestembrands-bi-obmarketing-zone-prod/Feature_Classification/all_model_feature_analysis.csv", index = False  )