<h1> Unzip and Rename Folders from SharePoint </h1>

Standardize file naming to prepare for SQL ingestion

# Import Modules and Define Globals

In [None]:
# data import and file manipulation
import os
import zipfile

#data conditioning
import pandas as pd
import re
import datetime as dt

In [None]:
# UTC timestamp (YYYY_MM_DD_HHMMSS) that can be added to filenames for reference.
# datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
# formats to exactly the same string.
current_time = dt.datetime.now(dt.timezone.utc).strftime('%Y_%m_%d_%H%M%S')

# git repo folder
git_folder = 'd:/git/example_infrastructure_data_dev'

# dictionary directory (lives inside the git repo folder)
dictionary_dir = f'{git_folder}/dictionaries'

# source folder of zip files
source_folder = "d:/timeseries_data_sources/shared_client"

# export folder will contain all csv exported DataFrames for Ticket Creation
export_folder = 'd:/exports'

# Pull all Historical Data

## Pull list of all zip files and capture folder prefix name

In [None]:
# Pull all zip filenames by walking every folder under source_folder
# (recursive, going down the tree). Each archive becomes one dict with:
#   'abcreviation' - lowercase letters found right before " - " in the filename
#                    (key spelling is kept because later cells read it)
#   'filename'     - bare file name
#   'fullPath'     - folder of the walk joined with the file name

source_report = []
for root, dirs, files in os.walk(source_folder):
    for file in files:
        # Test the extension itself (case-insensitively); a substring test
        # would also pick up names such as "report.zip.bak".
        if not file.lower().endswith('.zip'):
            continue
        info_dict = {}
        abbreviation_match = re.search(r'([a-z]+)\s\-\s', file)
        if abbreviation_match:
            info_dict['abcreviation'] = abbreviation_match.group(1)
        else:
            # No abbreviation in the name: report it and leave the key unset.
            print(f'cant find match for {file}')
        info_dict['filename'] = file
        info_dict['fullPath'] = os.path.join(root, file)
        source_report.append(info_dict)

## Standardize Client Names

In [None]:
# Load the tab-delimited client name standardization dictionary from dictionary_dir.
df = pd.read_csv(f'{dictionary_dir}/client_name_standardization.dict',delimiter='\t')
# Build the rename lookup row by row; the value of every entry is the row's 'currentName'.
# NOTE(review): the key column/variable is redacted in this copy of the notebook —
# confirm the real column name against the dictionary file before running.
client_rename_dict = {}
for index, row in df.iterrows():
    [REDACTED] = row['[REDACTED]']
    currentName = row['currentName']
    client_rename_dict[[REDACTED]] = currentName

In [None]:
def client_names(c_name):
    """Return the standardized client name for *c_name*.

    Every key of ``client_rename_dict`` is lower-cased and searched as a
    regular expression in the lower-cased input. The value belonging to the
    first key that matches is returned.

    Args:
        c_name: Client abbreviation or name to standardize. Non-string
            values (for example NaN or None) are returned unchanged.

    Returns:
        The mapped name from ``client_rename_dict``, or ``c_name`` itself
        when no key matches.
    """
    if not isinstance(c_name, str):
        return c_name

    # Match against the lower-cased text only. Comparing a lower-cased result
    # with the raw input would report a "match" for any input containing an
    # uppercase letter.
    lowered = c_name.lower()
    for pattern, standardized in client_rename_dict.items():
        try:
            if re.search(pattern.lower(), lowered):
                return standardized
        except (AttributeError, TypeError, re.error) as e:
            # A malformed entry (non-string key, invalid regex) is reported
            # and skipped so that it cannot hide later, valid entries.
            print(e)
    return c_name

In [None]:
# One row per zip archive collected in source_report. The columns are named
# explicitly so they exist even when no archive was found or when no filename
# yielded an abbreviation; otherwise the later column lookup raises KeyError.
df_zips = pd.DataFrame(source_report, columns=['abcreviation', 'filename', 'fullPath'])

In [None]:
# Translate each extracted abbreviation into its standardized client name.
abbreviations = df_zips['abcreviation']
df_zips['clientName'] = abbreviations.apply(client_names)

In [None]:
# Extract every archive into "<export_folder>/<clientName> - Service Reports".
for index, row in df_zips.iterrows():
    # Use the path recorded during the recursive walk. Rebuilding it from
    # source_folder + filename points to the wrong place for archives that
    # were found in a sub-folder.
    source_path = row['fullPath']
    export_path = f"{export_folder}/{row['clientName']} - Service Reports"
    # An already existing target folder is fine; any other failure
    # (permissions, invalid name) should surface instead of being swallowed.
    os.makedirs(export_path, exist_ok=True)
    print(f"Attempting to Unzip [{source_path}] to [{export_path}]")
    with zipfile.ZipFile(source_path, 'r') as archive:
        archive.extractall(export_path)

In [None]:
# Show the table of archives (as the last expression of the cell) for a visual check.
df_zips