
<h1> DattoRMM - Patching Time Series Inventory </h1>

# Import Modules and Prepare Globals

In [None]:
import pandas as pd
import numpy as np
import re
import os
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

# filetype modules
import json
import csv
import xlrd

In [None]:
# add current UTC timestamp to filename for reference
# (timezone-aware call; the naive datetime.utcnow() is deprecated)
current_time = dt.datetime.now(dt.timezone.utc).strftime('%Y_%m_%d_%H%M%S')

# git repo folder
git_folder = 'd:/git/example_infrastructure_data_dev'

# export folder will contain all csv exported DataFrames for Ticket Creation
export_folder = 'd:/exports/'

# define key column to join on
fieldnames_to_compare = 'Device UID'

# root folder that is walked recursively to find the archived report files
source_dir = 'D:/cloud_storage/Think Stack/Infrastructure - Documents/Reports - Archiving and Distribution'

## Import PostgreSQL Modules and Prepare Auth and Query

In [None]:
import psycopg2 as pg
from sqlalchemy import create_engine

In [None]:
# import configparser for env secrets
from configparser import ConfigParser

# parse the env.ini stored under the git repo's config folder
# NOTE(review): ConfigParser.read() does not raise for a missing file, so a
# wrong path would only show up at the section lookup below — confirm the path
config = ConfigParser()
config.read(f'{git_folder}/config/env.ini')
import requests
from requests.structures import CaseInsensitiveDict

# import and assign secrets from env.ini
# (the 'postgresql' section is read when the database URI is built)
postgresql = config['postgresql']

In [None]:
# define if you are pulling raw data ('data_pool') or test data ('seed_data')
database = 'seed_data'

# DEFINE THE DB URI
# The credentials are percent-encoded so that characters such as '@', ':' or
# '/' inside the username or password cannot be mistaken for URI delimiters.
from urllib.parse import quote

db_uri = (
    f"postgresql://{quote(postgresql['username'], safe='')}:"
    f"{quote(postgresql['password'], safe='')}@"
    f"{postgresql['uri']}:{postgresql['port']}/{database}"
)

# DEFINE THE ENGINE (CONNECTION OBJECT)
# echo=True asks SQLAlchemy to log the SQL it emits
engine = create_engine(db_uri, echo=True)

In [None]:
# each entry pairs a target DataFrame name with the PostgreSQL query that fills it
device_details = dict(
    dataframe='df_device_details',
    query='SELECT * FROM datto_rmm.device_details',
)
os_patch_mgmt = dict(
    dataframe='df_os_patch_mgmt',
    query='SELECT * FROM datto_rmm.os_patch_mgmt',
)

# ordered list of the imports defined above
import_dataframes = [device_details, os_patch_mgmt]

# Pull all Historical Data

## Pull list of all csv and xlsx files

In [None]:
# pull all filenames walking through all folders (recursive going down the tree)
# Only names that really END in .csv/.xlsx are kept (case-insensitively); a
# substring test would also accept names such as 'report.csv.bak'.
report_extensions = ('.csv', '.xlsx')
source_report = []
for root, dirs, files in os.walk(source_dir):
    for file in files:
        if not file.lower().endswith(report_extensions):
            continue
        source_report.append({
            'filename': file,
            'fullPath': os.path.join(root, file),
        })

### Create DataFrame of files and locations

In [None]:
# Build the file inventory. Naming the columns explicitly keeps 'filename' and
# 'fullPath' present even when the directory walk found no report files.
df_files = pd.DataFrame(source_report, columns=['filename', 'fullPath'])

### Sort out those files that have words that signify a patch report

In [None]:
# matches the word fragment "patch" / "Patch" anywhere in a filename
patch_file_prog = re.compile(pattern=r'[pP]atch')
# matches the standalone tokens "3rd" or "3PP"
third_prog = re.compile(pattern=r'(\b3rd\b|\b3PP\b)+')

In [None]:
def find_patching_reports(filename):
    """Return *filename* when it names a patch report, else the marker 'Remove'.

    A name qualifies when ``patch_file_prog`` finds a match in it and
    ``third_prog`` finds none.
    """
    has_patch_word = bool(patch_file_prog.findall(filename))
    has_third_party_tag = bool(third_prog.findall(filename))
    if has_patch_word and not has_third_party_tag:
        return filename
    return 'Remove'

In [None]:
# Keep only the rows whose filename is classified as a patch report.
# find_patching_reports returns the name itself for a keeper, so the
# 'filename' column does not need to be overwritten with a sentinel first.
# .copy() makes the result an independent frame, so columns added to it later
# are not written onto a slice of the unfiltered DataFrame.
is_patch_report = df_files['filename'].apply(find_patching_reports) != 'Remove'
df_files = df_files[is_patch_report].copy()

# Prepare and Standardize Data Ingestion Flow

## Break out year and month of the report into separate columns

### Iterate through month names and replace with number if found

In [None]:
# English month name -> zero-padded month number
month_dict = {
    'January': '01', 'February': '02', 'March': '03', 'April': '04',
    'May': '05', 'June': '06', 'July': '07', 'August': '08',
    'September': '09', 'October': '10', 'November': '11', 'December': '12',
}


def transform_month_to_number(month):
    """Replace the first month name found in *month* with its two-digit number.

    Month names are tried in calendar order. Every occurrence of the first
    name that matches is substituted and the result is returned immediately;
    later month names are left as they are. Text that contains no month name
    is returned unchanged.
    """
    for name, number in month_dict.items():
        result = re.sub(name, number, month)
        if result != month:
            # stop at the first month that matched
            return result
    return month

In [None]:
# helper column: the filename with a spelled-out month replaced by its number
adjusted_names = df_files['filename'].apply(transform_month_to_number)
df_files['adjustedFileName'] = adjusted_names

### Functions to break apart month and year

In [None]:
# one whitespace character followed by four digits (the digits are group 1)
report_year_prog = re.compile(pattern=r'\s{1}(\d{4})')
# two digits with one whitespace character on each side (the digits are group 1)
report_month_prog = re.compile(pattern=r'\s{1}(\d{2})\s{1}')

In [None]:
def report_date_year(file):
    """Return the four-digit year found in *file*, or None when there is none.

    The captured digits are returned (group 1 of ``report_year_prog``), not the
    whole match, so the value carries no leading whitespace. When no year can
    be extracted — no match, or *file* is not text — the offending value is
    printed for inspection and None is returned.
    """
    try:
        match = report_year_prog.search(file)
    except TypeError:
        # non-string input (e.g. a missing value) cannot be searched
        match = None
    if match is None:
        print(file)
        return None
    return match.group(1)

def report_date_month(file):
    """Return the two-digit month found in *file*, or None when there is none.

    The captured digits are returned (group 1 of ``report_month_prog``), not
    the whole match, so the value carries no surrounding whitespace. When no
    month can be extracted — no match, or *file* is not text — the offending
    value is printed for inspection and None is returned.
    """
    try:
        match = report_month_prog.search(file)
    except TypeError:
        # non-string input (e.g. a missing value) cannot be searched
        match = None
    if match is None:
        print(file)
        return None
    return match.group(1)

In [None]:
# derive the report year and month from the month-normalised filename,
# then discard the helper column
normalised_names = df_files['adjustedFileName']
df_files['reportYear'] = normalised_names.apply(report_date_year)
df_files['reportMonth'] = normalised_names.apply(report_date_month)
df_files.drop(columns=['adjustedFileName'], inplace=True)

## Standardize Client Names

In [None]:
# leading run of non-hyphen characters that is followed by whitespace
client_name_prog = re.compile(pattern=r'^([^\-]+)\s')
clients_set = set()


def client_names(file):
    """Return the client name captured at the start of *file*.

    Raises IndexError when *file* does not match ``client_name_prog``.
    """
    return client_name_prog.findall(file)[0]

In [None]:
# extract the client name from each report filename
client_name_column = df_files['filename'].apply(client_names)
df_files['clientName'] = client_name_column

In [None]:
# Load the client-rename lookup table and turn it into a dict.
# NOTE(review): the CSV path and the source column/variable name appear here
# as redaction placeholders — restore the real values before running this cell.
df = pd.read_csv(f'.csv')
# maps each row's value from the placeholder column to its 'currentName'
client_rename_dict = {}
for index, row in df.iterrows():
    [REDACTED] = row['[REDACTED]']
    currentName = row['currentName']
    client_rename_dict[[REDACTED]] = currentName

In [None]:
# Standardise client names using the rename table.
# The result is assigned back rather than using replace(..., inplace=True) on
# the selected column: the in-place form acts on an intermediate object and is
# not guaranteed to update df_files.
df_files['clientName'] = df_files['clientName'].replace(client_rename_dict)

## Timestamp Files

In [None]:
def AddTime(source_file):
    """Return timestamp and name metadata for *source_file*.

    Args:
        source_file: Path of an existing file.

    Returns:
        dict with, in this order:
        'Source Creation Date' (from ``os.path.getctime``) and
        'Source Modified Date' (from ``os.path.getmtime``), both rendered in
        UTC as '%Y/%m/%d %H:%M:%S', and 'Source Filename' (the base name).

    Raises:
        OSError: if the file does not exist or cannot be accessed.
    """
    timestamp_format = '%Y/%m/%d %H:%M:%S'

    def to_utc_text(seconds):
        # timezone-aware conversion; utcfromtimestamp() is deprecated
        moment = dt.datetime.fromtimestamp(seconds, tz=dt.timezone.utc)
        return moment.strftime(timestamp_format)

    return {
        'Source Creation Date': to_utc_text(os.path.getctime(source_file)),
        'Source Modified Date': to_utc_text(os.path.getmtime(source_file)),
        'Source Filename': os.path.basename(source_file),
    }

In [None]:
# gather timestamps and the base filename for every report path
source_metadata = df_files['fullPath'].apply(AddTime)
df_files['sourceData'] = source_metadata

In [None]:
def creation_date(sourceData):
    """Return the 'Source Creation Date' entry of *sourceData*."""
    key = 'Source Creation Date'
    return sourceData[key]

def modified_date(sourceData):
    """Return the 'Source Modified Date' entry of *sourceData*."""
    key = 'Source Modified Date'
    return sourceData[key]

def source_filename(sourceData):
    """Return the 'Source Filename' entry of *sourceData*."""
    key = 'Source Filename'
    return sourceData[key]

In [None]:
# unpack the per-file metadata dict into one column per field
for column, extractor in (
    ('sourceCreationDate', creation_date),
    ('sourceModifiedDate', modified_date),
    ('sourceFilename', source_filename),
):
    df_files[column] = df_files['sourceData'].apply(extractor)

In [None]:
# the metadata dict column has been unpacked, so remove it
df_files.drop(columns=['sourceData'], inplace=True)

In [None]:
# display the file inventory for inspection
df_files

In [None]:
# empty dict created ahead of the column scan
# NOTE(review): later cells collect column names into df_cols_set — confirm
# whether this dict is still needed
df_cols_count = {}

In [None]:
# Collect the union of column names across every report file.
# The loop runs over the whole 'fullPath' column; iterating over a single path
# string would walk through its characters instead of over files.
df_cols_set = set()
for path in df_files['fullPath']:
    print(path)
    try:
        df = pd.read_csv(path)
    except Exception:
        # not readable as CSV: fall back to the Excel reader
        print('cannot load csv')
        try:
            df = pd.read_excel(path)
        except Exception as e:
            # neither reader worked: report the error and move to the next file
            print(e)
            continue
    df_cols_set.update(df.columns)

In [None]:
# add the inventory frame's own column names to the collected names
# NOTE(review): df_cols_set must already exist as a set when this cell runs
df_cols_set.update(df_files.columns)

In [None]:
# load the first listed report with the Excel reader
first_report_path = df_files.iloc[0]['fullPath']
df = pd.read_excel(first_report_path)

In [None]:
# display the file inventory for inspection
df_files

In [None]:
# display the collected column names
df_cols_set

## Break out Data by System Creator (Automate, DattoRMM, Sophos, etc)

In [None]:
# export the client-name set as CSV
# NOTE(review): the target is literally '.csv' — the real filename appears to
# be missing; confirm the intended export path
pd.DataFrame(clients_set).to_csv('.csv')

In [None]:
# export the file inventory as CSV
# NOTE(review): the target is literally '.csv' — the real filename appears to
# be missing; confirm the intended export path
df_files.to_csv('.csv')

In [None]:
# list the distinct extracted year values
df_files['reportYear'].unique()

In [None]:
# show the rows whose filename contains the text 'March'
df_files[df_files['filename'].str.contains('March')]


In [None]:
# list the distinct extracted month values
df_files['reportMonth'].unique()

In [None]:
# subset of the inventory whose filename contains the text 'March'
is_march_report = df_files['filename'].str.contains('March')
march = df_files[is_march_report]

In [None]:
# Work on an independent copy and assign the column directly: the chained form
# march.iloc[:]['filename'] = ... writes to a temporary object, so the change
# is not guaranteed to reach `march`.
march = march.copy()
march['filename'] = march['filename'].apply(transform_month_to_number)
march

In [None]:
# display the March subset
march

In [None]:
# Recompute year and month for the March subset.
# assign() builds a new frame, so nothing is written onto a filtered slice of
# df_files (which triggers pandas' chained-assignment warning).
march = march.assign(
    reportYear=march['filename'].apply(report_date_year),
    reportMonth=march['filename'].apply(report_date_month),
)

In [None]:
# display the March subset
march

In [None]:
# rebuild the March subset from the inventory
is_march_report = df_files['filename'].str.contains('March')
march = df_files[is_march_report]

In [None]:
# display the March subset
march

In [None]:
# scratch check of the month substitution
# NOTE(review): str(march['filename']) is the printed form of the whole
# column, not one filename, and the name `help` shadows the builtin — confirm
# this cell is still wanted
help = transform_month_to_number(str(march['filename']))

In [None]:
# display the scratch result
help

In [None]:
# one-off check that replacing 'March' with '03' works on a literal filename
re.sub('March','03','[REDACTED] - March 2022.xlsx')

<h1>Sorting and Excluding Files</h1>

## Read all csv file columns and create two lists of files:
### Those with the chosen merge key column will be kept and the remaining filenames will not be called any further

## Parse Accepted Reports for file description and store as dictionary key pair

In [None]:
# presumably a sample filename for trying out the patterns below — it is not
# referenced by the visible cells
string = 'ExampleFCU - 2020 - 11 - Patching Audit.xlsx'
# a client token (SIG / ExampleFCU / Example FCU) followed later in the name
# by MS, Patch or Patching
ms_patch_prog = re.compile(r'(\bSIG\b|\bExampleFCU\b|\bExample FCU\b){1}.*(?!\b3rd\b)+(\bMS\b|\bPatch\b|\bPatching\b)+')
# the standalone tokens "3rd" or "3PP"
third_prog = re.compile(r'(\b3rd\b|\b3PP\b)+')
# a dot followed by three or four word characters
extension_prog = re.compile(r'(\.\w{3,4})')
# four digits and, later in the text, two digits (read below as year and month)
report_date_prog = re.compile(r'(\d{4}).*(\d{2})')

In [None]:
# Build one descriptor dict per accepted MS patch report.
ms_patch_reports = []
# NOTE(review): source_report_dict is not defined in the visible cells; it is
# presumably a {filename: full path} mapping — confirm before running.
for k, v in source_report_dict.items():
    # keep MS patch reports only; names tagged 3rd/3PP are skipped
    if ms_patch_prog.search(k) is None or third_prog.search(k) is not None:
        continue

    # first (year, month) pair found in the filename
    year, month = report_date_prog.findall(k)[0]

    match = {'filename': k, 'path': v}

    # extension of the filename being processed (the lookup previously used
    # an unrelated leftover variable instead of k)
    extension = extension_prog.findall(k)
    if extension == ['.xlsx']:
        match['extension'] = 'xlsx'
    elif extension == ['.csv']:
        match['extension'] = 'csv'

    # date nested dict
    match['date'] = {'year': str(year), 'month': str(month)}

    ms_patch_reports.append(match)

## For those files that have the key column, set index col and add source info
### 1. Add source file data as columns at end of dataframe (record the file creation, modified, and fullpath name)
### 2. Set index col = fieldnames_to_compare variable list

In [None]:
import os
import datetime

def AddTime(source_file):
    """Return timestamp and name metadata for *source_file*.

    Args:
        source_file: Path of an existing file.

    Returns:
        dict with, in this order:
        'Source Creation Date' (from ``os.path.getctime``) and
        'Source Modified Date' (from ``os.path.getmtime``), both rendered in
        UTC as '%Y/%m/%d %H:%M:%S', and 'Source Filename' (the base name).

    Raises:
        OSError: if the file does not exist or cannot be accessed.
    """
    timestamp_format = '%Y/%m/%d %H:%M:%S'

    def to_utc_text(seconds):
        # timezone-aware conversion; utcfromtimestamp() is deprecated
        moment = datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)
        return moment.strftime(timestamp_format)

    return {
        'Source Creation Date': to_utc_text(os.path.getctime(source_file)),
        'Source Modified Date': to_utc_text(os.path.getmtime(source_file)),
        'Source Filename': os.path.basename(source_file),
    }

In [None]:
def map_source(source_file):
    """Load *source_file* as a CSV and tag every row with its source metadata.

    The CSV is indexed by the ``fieldnames_to_compare`` column, and each
    key/value pair returned by ``AddTime`` becomes a constant column.
    """
    # file properties are read before the CSV is parsed
    metadata = AddTime(source_file)
    frame = pd.read_csv(source_file, index_col=fieldnames_to_compare)
    return frame.assign(**metadata)

In [None]:
# append the rows of df_csv to table 'test_db' in schema 'datto_rmm' through
# the engine, without writing the DataFrame index
# NOTE(review): df_csv is not defined in the visible cells — confirm where it is built
df_csv.to_sql('test_db', con=engine, if_exists='append',index=False,schema='datto_rmm')