<h1>CSV to PostgreSQL</h1>

# Prepare for Parsing

## Import modules and declare globals

In [None]:
#data conditioning
import pandas as pd
import re
import datetime as dt

# data import and file manipulation
import os
import openpyxl

import pyarrow

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# add current timestamp to filename for reference
# (timezone-aware UTC "now"; datetime.utcnow() is deprecated since Python 3.12,
# and the formatted string is identical)
current_time = dt.datetime.now(dt.timezone.utc).strftime('%Y_%m_%d_%H%M%S')

# git repo folder
git_dir = 'd:/git'

# dictionary location (parquet lookup tables used while parsing)
dict_dir = f'{git_dir}/data_parsing/dictionaries'

# data_source_dir is walked recursively to find the source workbooks
data_source_dir = 'D:/data_sets/billing_archive/data'
# export folder will contain all csv exported DataFrames for Ticket Creation
export_dir = 'D:/data_sets/billing_archive/exports'


## Pull source file paths and worksheet tab names to compile for data targeting

In [None]:
# Collect the path of every file found under data_source_dir (searched recursively)
source_files = [
    root + "/" + file
    for root, dirs, files in os.walk(data_source_dir)
    for file in files
]

In [None]:
# bare expression: shows the collected list of source file paths as the cell output
source_files

In [None]:
# Tally how many times each worksheet tab name occurs across the source workbooks
sheet_names = {}
for file in source_files:
    for name in openpyxl.load_workbook(file).sheetnames:
        sheet_names[name] = sheet_names.get(name, 0) + 1

# Data Import and Shaping

## Step 1: Clean and Standardize Dataframe Information

### Import dictionaries for parsing

In [None]:
# Load the billing worksheet dictionary as a list of row dicts
service_names_dict = pd.read_parquet(f'{dict_dir}/billing_worksheet.parquet', engine='auto').to_dict(orient='records')

# Original tab names of every dictionary row whose service is 'rmm'
rmm_source_tabs = [
    entry['orig_name']
    for entry in service_names_dict
    if entry['service'] == 'rmm'
]

In [None]:
# Load the standard client-name table as a list of row dicts (one dict per parquet row)
client_names_path = f"{dict_dir}/standard_client_names.parquet"
client_rename_dict = pd.read_parquet(client_names_path, engine='auto').to_dict(orient='records')

In [None]:
# Load the date-naming conversion table as a list of row dicts (one dict per parquet row)
date_naming_path = f"{dict_dir}/date_naming_conversions.parquet"
months_dict = pd.read_parquet(date_naming_path, engine='auto').to_dict(orient='records')

### Define Functions for Shaping

In [None]:
# remove the offboard marker from CLIENT names; it was once appended to signify that a
# client was offboarded, as an explanation for missing data going forward
def remove_offboard(string):
    """Return `string` with a known "offboarded" suffix removed.

    The patterns are tried in order and only the first one that changes the
    string is applied (every occurrence of that pattern is removed). A string
    containing none of the markers is returned unchanged.
    """
    # raw strings: "\(" in a normal literal is an invalid escape sequence
    offboarded_patterns = (
        r" - Offboarded",
        r" - Off-Boarded",
        r" \(Offboarded\)",
    )
    for pattern in offboarded_patterns:
        result = re.sub(pattern, "", string)
        if result != string:
            return result

    return string

In [None]:
# map a CLIENT name onto its standardized name using the rename dictionary
def rename_client(string):
    """Return the standardized client name for `string`, or `string` itself if nothing matches.

    Entries of `client_rename_dict` are tried in order; the first entry that
    matches decides the result. An entry matches when:
      * its old name equals `string` ignoring case, or
      * its old name is at most 4 characters, substituting it (case-sensitive,
        used as a regex pattern) changes `string`, and `string` is either at
        most 4 characters longer than the old name or contains " - ", or
      * its old name is longer than 4 characters and substituting its
        lowercase form into the lowercase `string` changes it.
    """
    lowered = string.lower()
    for entry in client_rename_dict:
        old_name = entry['[REDACTED]']
        new_name = entry['currentName']

        # exact match, ignoring case
        if old_name.lower() == lowered:
            return new_name

        if len(old_name) <= 4:
            # short names produce too many accidental hits, so only accept them
            # when the string is about the same size or is hyphen-separated
            substituted = re.sub(old_name, new_name, string)
            near_size_or_hyphenated = len(string) <= len(old_name) + 4 or " - " in string
            if substituted.lower() != lowered and near_size_or_hyphenated:
                return new_name
        elif re.sub(old_name.lower(), new_name, lowered) != lowered:
            # longer names: any case-insensitive occurrence is enough
            return new_name

    return string

In [None]:
# strip superfluous words (ie. (offboard)) from cells that would normally only contain digits
def strip_words_from_digit_cols(string):
    """Return the first run of digits found in `string` as an int.

    The value is converted with `str()` first, so numeric cells work too
    (e.g. 12.0 -> 12). A cell containing no digits at all (for example a cell
    holding only "(offboard)") yields 0 instead of raising in `int()`.
    """
    match = re.search(r'\d+', str(string))
    if match is None:
        # nothing numeric in the cell: count it as zero
        return 0

    return int(match.group(0))

In [None]:
# build the aggregation spec used when merging rows that share a CLIENT name
def agg_group(dataframe):
    """Return a dict mapping every column after the first to the string "sum"."""
    return dict.fromkeys(dataframe.columns[1:], "sum")

### Iterate over worksheets

In [None]:
# Build one cleaned DataFrame per source workbook; each result is stored in df_list
# together with the year and file path it came from.
df_list = []

for x in source_files:
    tab_names = openpyxl.load_workbook(x).sheetnames  # all sheet names in the workbook
    year = re.search(r'\d+', x).group(0)  # first run of digits in the file path is used as the year
    print(x)
    # first sheet, in workbook order, that is a known rmm source tab
    # (deterministic when several match; raises IndexError when none does)
    selected_tabs = [tab for tab in tab_names if tab in rmm_source_tabs][0]
    print(selected_tabs)
    df = pd.read_excel(x, sheet_name=selected_tabs, header=1)  # second spreadsheet row holds the headers
    # keep only rows, then columns, that have at least 3 non-NaN values
    df = df.dropna(thresh=3).dropna(thresh=3, axis=1)
    client_col_index = df.columns.get_loc("CLIENT")  # CLIENT heading is the known standard for client names
    df = df.drop(columns=df.columns[:client_col_index])  # drop every column left of CLIENT
    client_col = df.columns[0]  # left-most remaining column, i.e. the client name column
    # assign the result back: a chained `df[col].fillna(..., inplace=True)` does not
    # update df when pandas Copy-on-Write is active
    df[client_col] = df[client_col].fillna('Total')  # rows without a client name are totals
    df[client_col] = df[client_col].apply(remove_offboard)  # remove 'offboard' related wording
    df[client_col] = df[client_col].apply(rename_client)  # standardize client names
    df = df.fillna(0)  # remaining NaN cells in the date columns count as 0
    for col in df.columns[1:]:
        # reduce every date-column cell to its integer value
        df[col] = df[col].apply(strip_words_from_digit_cols)
    # sum all rows that ended up with the same CLIENT name
    df = df.groupby('CLIENT').agg(agg_group(df)).reset_index()

    # drop the spreadsheet's own totals so they can be recalculated using pandas
    df = df[~df['CLIENT'].isin(['TOTAL', 'Total', 'Monthly Delta'])]

    # total sum per column, appended as a 'Total Agents' row
    month_sums = dict(df[df.columns[1:]].sum(axis=0))
    month_sums.update({"CLIENT": "Total Agents"})
    df = pd.concat([df, pd.DataFrame.from_records([month_sums])], ignore_index=True)
    df_list.append({"year": year, "file": x, "dataframe": df})  # keep year and filename with the dataframe

## Step 2: Convert Date Shorthand Column names to Datetime and Transpose to Index

In [None]:
# parse and rename date column names to datetime before transpose to index row markers
def months_rename(df_object):
    """Build a {column name: 'YYYY-MM-DD'} rename mapping for the date columns.

    `df_object` is a dict with a 'dataframe' entry (whose columns after the
    first are month labels) and a 'year' entry. Each label is matched against
    the 'abcr' values of `months_dict`, first by case-insensitive equality and
    then by a case-insensitive regex search; the first matching entry supplies
    'monthNumber' and 'lastDay' for the date.

    Raises:
        ValueError: if a column label matches no entry of `months_dict`.
    """
    date_cols = list(df_object['dataframe'].columns[1:])
    print(date_cols)
    year = int(df_object['year'])

    rename_dict = {}
    for col in date_cols:
        col_lower = col.lower()
        matched = None
        for m in months_dict:
            abbreviation = m['abcr'].lower()
            if abbreviation == col_lower or re.search(abbreviation, col_lower):
                matched = m
                break

        if matched is None:
            # fail with the offending label instead of handing a placeholder
            # month (and an unrelated entry's lastDay) to datetime
            raise ValueError(f"no month abbreviation matches column {col!r}")

        date = dt.datetime(year, int(matched['monthNumber']), int(matched['lastDay']))
        rename_dict[col] = date.strftime("%Y-%m-%d")

    return rename_dict

In [None]:
# Turn each yearly frame into date-indexed rows with one column per client
df_transposed_list = []
for df_obj in df_list:
    renamed = df_obj['dataframe'].rename(columns=months_rename(df_obj))
    transposed = renamed.transpose()
    transposed.columns = transposed.iloc[0]  # first row holds the client names
    transposed = transposed.drop(transposed.index[0], axis=0)
    transposed = transposed.drop("Monthly Delta", axis=1, errors='ignore')
    transposed = transposed.rename({"Total":"Total Agents","TOTAL":"Total Agents"}, axis=1, errors='ignore')
    df_transposed_list.append(transposed)

# stack all transposed frames into one, keeping the date index
df = pd.concat(df_transposed_list, ignore_index=False)

# column order: 'Total Agents' first, remaining columns sorted
columns_list = list(df.columns)
columns_list.remove('Total Agents')
columns_list = ['Total Agents'] + sorted(columns_list)

# clients absent from a given year have no value there; count them as 0
df = df[columns_list].fillna(0)

In [None]:
# Write the combined DataFrame to the export_dir path with '.csv' appended
csv_export_path = f'{export_dir}.csv'
df.to_csv(csv_export_path)

# Display Visuals

### Create Sites DataFrame

In [None]:
# Per-site frame: every column of df except 'Total Agents'
df_sites = df.drop(columns=['Total Agents'])

In [None]:
# Timeseries plot of DataFrame - Total Agents
fig = plt.figure(figsize=(100,40))
sns.lineplot(data=df, x=df.index, y='Total Agents')
# save under its own file name so this figure is not overwritten by
# other plots exported with the same export_dir prefix
fig.savefig(f'{export_dir}_total_agents.png')

In [None]:
import matplotlib.ticker as ticker

# Timeseries plot with one line per site column
columns_plot = df_sites.columns

fig, ax = plt.subplots(figsize=(100,40))
ax.yaxis.set_major_formatter(ticker.EngFormatter())  # engineering-notation y tick labels
for each in columns_plot:
    sns.lineplot(data=df_sites, x=df_sites.index, y=each, label=str(each), errorbar=None)
plt.legend()
# save before showing, and under its own file name so it does not collide
# with other plots exported with the same export_dir prefix
fig.savefig(f'{export_dir}_sites.png')
plt.show()