## Blog Post Functions
This file contains functions created for use in the blog post project.

Functions:
1. vals_by_col
2. date_to_col
3. secondary_unique
4. counts_to_portions
5. expand_categories

In [1]:
# Import standard libraries and modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

## Function 1: vals_by_col(df, cols)
This function returns all unique values for specified columns in a DataFrame.

In [2]:
def vals_by_col(df, cols):
    """
    Collect the distinct values found in each requested column.

    INPUT:
    - df: DataFrame holding the columns to inspect

    - cols: list of column names in df whose distinct values are wanted

    OUTPUT:
    - df_vals: Series indexed by the names in cols; each element is a list of
    that column's distinct values, in the order value_counts() reports them
    """
    # value_counts() already yields one index entry per distinct value, so its
    # index doubles as the list of uniques for the column.
    per_column = {name: list(df[name].value_counts().index) for name in cols}
    return pd.Series(per_column)

## Function 2: date_to_col(df, col)
This function splits a column of date-time strings into its components, returning a DataFrame in which the original column is replaced by separate month, day, year, hour, minute, and second columns.

In [3]:
def date_to_col(df, col):
    """
    Split a column of date-time strings into separate integer columns.

    INPUT:
    - df: DataFrame containing a column with dates of type:
    "mm/dd/yyyy hh:mm:ss xx", where xx is either "AM" or "PM"

    - col: name of the column which contains data of this type to be broken up

    OUTPUT:
    - new_df: copy of df with the original column removed and six new columns
    appended, named col + '_month', '_day', '_year', '_hour', '_minute' and
    '_second'. Hours are converted to the 24-hour clock (0-23). df itself is
    not modified
    """
    parts = {'month': [], 'day': [], 'year': [],
             'hour': [], 'minute': [], 'second': []}

    for entry in df[col]:
        # Fields sit at fixed character offsets in "mm/dd/yyyy hh:mm:ss xx".
        parts['month'].append(int(entry[0:2]))
        parts['day'].append(int(entry[3:5]))
        parts['year'].append(int(entry[6:10]))

        # 12-hour -> 24-hour: 12 AM is hour 0 and 12 PM is hour 12, so fold
        # 12 down to 0 before adding the afternoon offset.
        hour = int(entry[11:13]) % 12
        # Any suffix other than 'AM' is treated as 'PM'.
        if entry[-2:] != 'AM':
            hour += 12
        parts['hour'].append(hour)

        parts['minute'].append(int(entry[14:16]))
        parts['second'].append(int(entry[17:19]))

    # drop() returns a new DataFrame, so the caller's df is left untouched.
    new_df = df.drop(col, axis=1)
    for part_name, values in parts.items():
        new_df[col + '_' + part_name] = values

    return new_df

## Function 3: secondary_unique(df, uniq_col, list_col, zero_missing)
This function counts how many times each unique value in one column appears together with each value of a secondary column (associated appearances), returning the counts either as a list of [value, dictionary] pairs or as a DataFrame.

In [4]:
def secondary_unique(df, uniq_col, list_col, zero_missing='no'):
    """
    Count how often each value of list_col appears alongside each distinct
    value of uniq_col.

    INPUT:
    - df: dataframe which contains the column to be separated into unique
    values and the column of secondary values to count
    - uniq_col: column to be separated into unique values (main describer)
    - list_col: column whose values are counted for every unique value of
    uniq_col (i.e. the secondary describer to be counted)
    - zero_missing: the string 'yes' or 'no'; with 'yes', every secondary value
    that never appears with a given unique value is added with a count of 0

    OUTPUT:
    - listed_uniqs:
        - if zero_missing != 'yes': list of [unique value, {secondary value:
        count}] pairs, ordered as value_counts() orders uniq_col
        - if zero_missing == 'yes': DataFrame of associated appearances with
        one row per secondary value and one column per unique value
    """
    # Distinct primary values, taken straight from the column by label so the
    # lookup also works when uniq_col is not a string (e.g. an integer label).
    uniq_values = list(df[uniq_col].value_counts().index)

    listed_uniqs = []
    for uniq_value in uniq_values:
        matching = df.loc[df[uniq_col] == uniq_value, list_col]
        listed_uniqs.append([uniq_value, dict(matching.value_counts())])

    if zero_missing != 'yes':
        return listed_uniqs

    # Pad every per-value dictionary so all of them share the same keys; this
    # keeps the resulting table free of missing cells.
    secondary_list = df[list_col].value_counts().index
    for _, counts in listed_uniqs:
        for secondary_category in secondary_list:
            counts.setdefault(secondary_category, 0)

    all_data = [pd.Series(counts, name=uniq_value)
                for uniq_value, counts in listed_uniqs]

    # Each Series becomes a row; transpose so unique values are the columns.
    return pd.DataFrame(all_data).transpose()

## Function 4: counts_to_portions(df, variant)
Given a DataFrame with counts as the data points, this function returns a DataFrame in which each count is converted to a proportion of one of the following:
1. DataFrame total
2. column total
3. row total

In [5]:
def counts_to_portions(df, variant='total'):
    """
    Convert a table of counts into a table of proportions.

    INPUT:
    - df: DataFrame containing counts to turn into portions
    - variant: choose between 'total', 'col', or 'row' for how to calculate
    portions; any value other than 'total' or 'col' is treated as 'row'

    OUTPUT:
    - portion_df: new DataFrame of decimals with the same index and columns
    as df, where each count is divided by the grand total ('total'), by its
    column sum ('col'), or by its row sum ('row'). df is not modified
    """
    if variant == 'total':
        # total portions
        return df / df.sum().sum()

    if variant == 'col':
        # column portions: df.sum() is indexed by column name, so the
        # division lines up one divisor per column.
        return df / df.sum()

    # row portions: divide along the index so every row is scaled by its own
    # sum. Doing this in one vectorised step avoids writing floats into an
    # integer-typed copy row by row and stays correct when index labels repeat.
    return df.div(df.sum(axis=1), axis=0)

## Function 5: expand_categories(df, df_encoder, pred_value, prediction_input)
Given a single data point to feed into a model trained on one-hot encoded data (i.e. each categorical value has been expanded into its own binary column), this function expands the categorical fields of the input to match the layout the model expects.

In [6]:
def expand_categories(df, df_encoder, pred_value, prediction_input):
    """
    Turn one raw row of values into the layout of the encoded predictor
    matrix.

    INPUT:
    - df: DataFrame containing all of the data, not yet encoded or split
    - df_encoder: DataFrame created after running through OneHotEncoder (must
    have columns specified through encoder.get_feature_names_out())
    - pred_value: name of the column in df that the model is set up to predict
    - prediction_input: list with all of the fields of a row in df, in column
    order (a dummy value can stand in for pred_value as it is dropped). The
    list is not modified

    OUTPUT:
    - full_input: array holding the non-string fields of the input in their
    original order, followed by one slot per df_encoder column that is 1 where
    the column name equals '<column>_<value>' for a string field and 0
    elsewhere

    RAISES:
    - ValueError: if a string field has no matching column in df_encoder
    """
    # Work on a copy so the caller's list keeps its pred_value entry and can
    # be passed in again.
    row_values = list(prediction_input)
    del row_values[df.columns.get_loc(pred_value)]

    # Column names that line up position-by-position with row_values.
    set_cols = df.columns.drop(pred_value)
    encoder_cols = list(df_encoder.columns)

    input_expanded = np.zeros(df_encoder.shape[1])
    input_concise = []
    for position, value in enumerate(row_values):
        if isinstance(value, str):
            # String fields are categorical: switch on the matching
            # '<column>_<value>' slot.
            expanded_label = set_cols[position] + '_' + value
            try:
                slot = encoder_cols.index(expanded_label)
            except ValueError:
                raise ValueError(
                    f"{expanded_label!r} is not a column of df_encoder"
                ) from None
            input_expanded[slot] = 1
        else:
            input_concise.append(value)

    return np.concatenate((np.array(input_concise), input_expanded))