# Extraction and EDA

This notebook extracts the data and performs analysis to test for feature availability.

In [119]:
import os
import glob
from typing import Dict, Union, List
import logging
from functools import reduce
from copy import deepcopy
import pandas as pd
import numpy as np

# Configure the root logger so that records of level DEBUG and above are
# emitted; the helper functions in this notebook log their progress at INFO.
logging.basicConfig(level=logging.DEBUG)

def data_into_dict(
        filepath: Union[str, List],
) -> Dict[str, pd.DataFrame]:
    """Read CSV files into a dictionary of dataframes.

    Args:
        filepath (Union[str, List]) : a single path, or a list of paths,
            pointing to the CSV files to read
    Returns:
        file_d: dictionary of files as dataframes, keyed by the file name up
            to its first "." (e.g. "a__geo.csv" is stored under "a__geo")
    """
    # A lone path must be wrapped: iterating a bare string would otherwise
    # yield its characters one at a time and try to read each as a file.
    if isinstance(filepath, (str, os.PathLike)):
        filepath = [filepath]
    logging.info(f"Filepaths: {filepath}")
    # create file dictionary
    file_d = dict()
    for f in filepath:
        # get abbreviation for key: base file name truncated at the first "."
        fname_abbr = os.path.basename(f).split(".")[0]
        if fname_abbr in file_d:
            # two files that share an abbreviation map to the same key; the
            # later one wins, so make the overwrite visible in the log
            logging.warning(f"Key {fname_abbr} already read; overwriting with {f}")
        logging.info(f"Reading {f} into key {fname_abbr}")
        # read dataframe into value
        file_d[fname_abbr] = pd.read_csv(f)
    return file_d


# directory that holds the raw extracts
data_path = "../data/data/"
# load the files of each source, selected by filename prefix, into one
# dictionary of dataframes per source (the "a__" group first, then "b__")
a_files, b_files = (
    data_into_dict(glob.glob(os.path.join(data_path, pattern)))
    for pattern in ("a__*", "b__*")
)

# consolidate dataframe groups into merged structure 
def merge_all_frames(
    frames: Dict,
    on: str,
    how: str,
    rename_exclusions: Union[List, None] = None
) -> pd.DataFrame:
    """Merge all frames in a dictionary into a single dataframe.

    Every column except the merge column and the excluded ones is first
    renamed to "<column>_<dict key>", so that identically named columns
    coming from different frames do not conflict in the merged result.
    The frames passed in are not modified.

    Args:
        frames (Dict) : dict of dfs with frame values to merge into single frame
        on (str) : column to merge on
        how (str) : merge type
        rename_exclusions (List) : list of columns to not rename; defaults to
            no exclusions
    Returns:
        frame_merged (pd.DataFrame): merged dataframe
    Raises:
        TypeError: if ``frames`` is empty (nothing to reduce over)
    """
    # None stands in for "no exclusions" to avoid a shared mutable default
    excluded = set(rename_exclusions) if rename_exclusions is not None else set()

    # append df key names to columns to resolve conflicts in col names;
    # DataFrame.rename returns a new frame, so the caller's frames stay intact
    renamed = {
        key: df.rename(
            columns={
                col: f"{col}_{key}"
                for col in df
                if col != on and col not in excluded
            }
        )
        for key, df in frames.items()
    }

    # merge frames pairwise, left to right, in dictionary order
    logging.info(f"Merging values of {renamed.keys()}")
    frame_merged = reduce(
        lambda left, right: pd.merge(left, right, on=[on], how=how),
        renamed.values(),
    )

    logging.info(f"Merged frames into one with columns {frame_merged.columns}")
    return frame_merged

# Outer-merge each group of files on its join column; the set_index /
# reset_index round trip then moves the named id column to the first position.
a_frame = (
    merge_all_frames(
        a_files,
        on="geo_id",
        how="outer",
        rename_exclusions=["vendor_id"],
    )
    .set_index("vendor_id")
    .reset_index()
)
b_frame = (
    merge_all_frames(b_files, on="b_entity_id", how="outer")
    .set_index("b_entity_id")
    .reset_index()
)

INFO:root:Filepaths: ['../data/data/a__geo.csv', '../data/data/a__company.csv']
INFO:root:Reading ../data/data/a__geo.csv into key a__geo
INFO:root:Reading ../data/data/a__company.csv into key a__company
INFO:root:Filepaths: ['../data/data/b__company.csv', '../data/data/b__hierarchy.csv', '../data/data/b__address.csv']
INFO:root:Reading ../data/data/b__company.csv into key b__company
INFO:root:Reading ../data/data/b__hierarchy.csv into key b__hierarchy
INFO:root:Reading ../data/data/b__address.csv into key b__address
  if (await self.run_code(code, result,  async_=asy)):
INFO:root:Merging values of dict_keys(['a__geo', 'a__company'])
INFO:root:Merged frames into one with columns Index(['geo_id', 'zipcode_a__geo', 'is_primary_a__geo', 'latitude_a__geo',
       'longitude_a__geo', 'elevation_a__geo', 'state_a__geo',
       'state_full_name_a__geo', 'area_code_a__geo', 'city_a__geo',
       'city_display_a__geo', 'county_a__geo', 'county_fips_a__geo',
       'state_fips_a__geo', 'timezone

In [111]:
def type_compression(
    frame: pd.DataFrame,
    numeric_threshold: float = 0.80
) -> pd.DataFrame:
    """Compress types of values into a standard

    Each column is inspected as text. If more than ``numeric_threshold`` of
    its non-null values parse as numbers, the whole column is converted to
    float and every value that does not parse (nulls included) becomes -1.
    Otherwise the column is kept as text: lowercased, with non-alphanumeric
    characters removed, and nulls represented by the string "null".
    The input frame is not modified.

    Args:
        frame (pd.DataFrame) : data frame to compress types
        numeric_threshold (float) : proportion of non-null values that must
            be numeric for the column to be converted to float
    Returns:
        frame_comp (pd.DataFrame) : compressed type data
    """
    # work on a copy so the caller's frame (possibly a slice) is untouched
    frame = frame.copy()
    for col in frame.columns:
        # string view of the column, with missing values as the literal "null"
        as_str = frame[col].astype(str).mask(frame[col].isna(), "null")
        non_null = as_str != "null"
        non_null_count = int(non_null.sum())

        # a value counts as numeric only if it really parses as a number;
        # text such as "12345-6789" or "1.2.3" becomes NaN instead of
        # raising during the float conversion
        parsed = pd.to_numeric(as_str, errors="coerce")
        numeric_count = int((parsed.notna() & non_null).sum())

        # get proportion of numeric strings in non-null series; an all-null
        # column has no numeric evidence, so it is treated as text
        isnum_prop = numeric_count / non_null_count if non_null_count else 0.0

        # check proportion and convert based upon value
        if isnum_prop > numeric_threshold:
            # -1 is the placeholder for nulls and non-numeric leftovers
            frame[col] = parsed.fillna(-1).astype("float")
        # if non null values are not primarily numeric, then convert to lowercase
        # and remove non-alphanumeric chars
        else:
            frame[col] = as_str.apply(
                lambda text: "".join(ch for ch in text.lower() if ch.isalnum())
            )
    return frame

    
# def data_cleaning(
#     frame: pd.DataFrame
# ) -> pd.DataFrame :
#     """Clean data from compressed types
    
#     Args: 
#         frame (pd.DataFrame) : to be worked dataframe
#     Returns: 
#         frame_clean (pd.DataFrame) : cleaned frame
#     """



# Try the type compression on a two-column slice of the merged "a" frame.
comp = type_compression(frame=a_frame[['longitude_a__geo', 'vendor_id']])

In [118]:
# Flag, per row, whether the company zipcode consists only of numeric
# characters once every "-" and "." has been stripped from its string form.
isnum = (
    a_frame["zipcode_a__company"]
    .astype(str)
    .apply(lambda text: text.replace("-", "").replace(".", "").isnumeric())
)


0         True
1         True
2         True
3         True
4         True
         ...  
76339     True
76340     True
76341     True
76342     True
76343    False
Name: zipcode_a__company, Length: 76344, dtype: bool

In [122]:
# For the rows flagged in `isnum`, test whether the text after the last "."
# contains nothing but "0" characters.
# NOTE(review): a value without any "." is compared as a whole, so e.g.
# "12345" yields False - confirm this is the intended integer check.
(
    a_frame["zipcode_a__company"][isnum]
    .astype(str)
    .apply(lambda text: set(text.split(".")[-1]).issubset("0"))
)

0        False
1        False
2        False
3        False
4        False
         ...  
76339    False
76340    False
76341    False
76342    False
76343    False
Name: zipcode_a__company, Length: 76344, dtype: bool