# Datto RMM - CSV import - Activity Log - Shape - Export to PostgreSQL

## Import PostgreSQL Modules

In [None]:
# import necessary packages
import psycopg2 as pg
from sqlalchemy import create_engine

# Import the 'config'.py file
import config.config as config

## Import DataFrame Import and Shaping Modules

In [None]:
import modules.field_standards as fs
from modules.source_file_info import AddTime as srctime
import datetime
import re
import os
import csv
import pandas as pd
import numpy as np

## Arguments and Declarations


In [None]:
# Column that identifies a device; used as the key when checking / joining CSVs.
fieldnames_to_compare = 'Device UID'

# Root folder that is scanned (recursively) for the source CSV exports.
# Alternative location kept for reference:
#   'D:\cloud_storage\OneDrive - Think Stack\Reports Automation'
source_dir = 'd:/data_sets/data_pool/datto_rmm_data/audit_logs/'

In [None]:
# Target database: 'data_pool' for raw data, 'seed_data' for test data.
database = 'seed_data'

# When True, a CSV that lacks the key column is excluded from processing.
exclude_on_missing_column = False

# Connection URI, assembled from the credentials held in config/config.py.
db_uri = (
    f'postgresql://{config.USERNAME}:{config.SECRET}'
    f'@{config.URI}:{config.PORT}/{database}'
)

# Connection object shared by the notebook; echo=True asks SQLAlchemy to log
# the statements it emits.
engine = create_engine(db_uri, echo=True)

In [None]:
# Each dictionary pairs a dataframe name with the query that reads the
# matching table from the datto_rmm schema in PostgreSQL.
device_details = {'dataframe': 'device_details', 'query': 'SELECT * FROM datto_rmm.device_details'}
os_patch_mgmt = {'dataframe': 'os_patch_mgmt', 'query': 'SELECT * FROM datto_rmm.os_patch_mgmt'}
user_activity = {'dataframe': 'audit_logs', 'query': 'SELECT * FROM datto_rmm.audit_logs'}

# Only the audit-log table is imported here. `user_activity` is the entry that
# describes datto_rmm.audit_logs; there is no variable named `audit_logs`.
import_dataframes = [user_activity]

In [None]:
# Selected columns from the Master Device View export CSV - these lists are the
# column masks used to trim each dataframe down to its standard set of columns.
# Columns missing from a CSV are added as empty columns and columns that are
# not listed are dropped, so every file in a group ends up with the same shape.

# Column standard for the 'device_details' group (also assigned to the
# 'user_activity' group when files are grouped).
device = [
    'Device UID',
    'Device Hostname',
    'Site UID',
    'Site Name',
    'Device Description',
    'Int IP Address',
    'Ext IP Addr',
    'Create Date',
    'Last Seen',
    'Last Audit Date',
    'Session Name',
    'Privacy Mode',
    'Agent Version',
    'Display Version',
    'Device Model',
    'Operating System',
    'Serial Number',
    'Motherboard',
    'Device CPU',
    'Physical CPU Cores',
    '.NET Version',
    'Memory',
    'MAC Address(es)',
    'User-Defined Field 10',
    'Device Type',
    'Domain',
    'Disk Drive (total/free)',
    'Online Duration (hrs)',
    'Architecture',
    'Display Adapters',
    'BIOS Name',
    'BIOS Release Date',
    'BIOS Version',
    'Last Reboot',
    'Reboot required',
    'NIC Vendor',
    'Manufacturer',
]

# Column standard for the 'os_patch_mgmt' group.
manage = [
    'Device UID',
    'Site Name',
    'Device Hostname',
    'Device Description',
    'Policy',
    'Int IP Address',
    'Ext IP Addr',
    'Last User',
    'Group',
    'Create Date',
    'Last Seen',
    'Last Audit Date',
    'Session Name',
    'Agent Version',
    'Operating System',
    'Service Pack',
    'Serial Number',
    'User-Defined Field 10',
    'Last Run',
    'Schedule',
    'Patch Status',
    'Patches Approved Pending',
    'Patches Installed',
    'Patches Not Approved',
    'Device Type',
    'Domain',
    'Disk Drive (total/free)',
    'Online Duration (hrs)',
    'Last Reboot',
    'Reboot required',
    'Manufacturer'
]




In [None]:

# Bookkeeping for the file-sorting step, both keyed file name -> full path:
#   included_files - CSVs that go on to be shaped
#   excluded_files - CSVs set aside because they could not be merged
included_files, excluded_files = {}, {}

# Captures the leading word(s) before the last underscore of a file name.
# Files are grouped on that prefix so they can be combined on rows rather than
# merged on columns, which would drop rows when the key column does not match.
pattern = re.compile(r'^(\w+)_')

# File-name markers that identify each CSV export type.
devices_tab_export_filename = 'DeviceDetailsExport'
manage_tab_export_filename = 'SystemDeviceSelection'
user_activity_export_filename = 'UserActivity'

# Accumulators filled in by the later grouping and shaping steps.
grouped_export_files_list, shaped_df_object_list = [], []

<h1>Read all files in source_dir and sub directories</h1>
    <h3> Filter by '.csv' </h3>

In [None]:
# Walk source_dir recursively and map every CSV file name to its full path.
# NOTE: keys are bare file names, so a file with the same name in a different
# sub-folder replaces the earlier entry.
source_csv_dict = {}
for root, _dirs, files in os.walk(source_dir):
    for file in files:
        # Test the extension itself: a substring check would also accept
        # names such as 'report.csv.bak'.
        if file.lower().endswith('.csv'):
            source_csv_dict[file] = os.path.join(root, file)


<h1>Sorting and Excluding Files</h1>

## Read all csv file columns and create two lists of files:
### Those with the chosen merge key column will be kept and the remaining filenames will not be called any further

In [None]:
# Sort every discovered CSV (k = file name, v = full path) into included_files
# or excluded_files depending on whether it carries the key column.
for k, v in source_csv_dict.items():
    if not exclude_on_missing_column:
        # Key-column filtering is switched off: accept every file.
        included_files[k] = v
        continue

    # Only the header row is needed to look for the key column, so the data
    # rows are not loaded.
    header = pd.read_csv(v, nrows=0)
    if fieldnames_to_compare in header.columns:
        included_files[k] = v
    else:
        print(f'Missing Key: {fieldnames_to_compare} to Join in {k}')
        excluded_files[k] = v

## Parse accepted CSVs for file description and store as dictionary key-value pairs

In [None]:
# Splits the start of a file name into three groups:
#   1. the leading run of letters
#   2. an optional '_' or '-' separator (empty string when absent)
#   3. the run of letters that follows the separator
# Every part may be empty, so the pattern always matches at the start.
pattern = re.compile(r'^([a-zA-Z]*)([_-]?)([a-zA-Z]*)')

In [None]:
# Assign each accepted CSV (k = file name, v = full path) to an export group
# based on the words found at the start of its file name. Every entry records
# the path, the group name and the column standard used to shape the file.
for k, v in included_files.items():
    matches = pattern.search(k)
    if matches[1] == devices_tab_export_filename:
        grouped_export_files_list.append({'filename': v, 'groupname': 'device_details', 'columns': device})
    elif matches[3] == manage_tab_export_filename:
        # The manage-tab marker is compared against the word after the separator.
        grouped_export_files_list.append({'filename': v, 'groupname': 'os_patch_mgmt', 'columns': manage})
    elif matches[1] == user_activity_export_filename:
        print(f'''['{v}'] matches: ['{user_activity_export_filename}'] on ['{matches[1]}']''')
        # NOTE(review): user-activity files reuse the `device` column list;
        # confirm whether these exports need a column standard of their own.
        grouped_export_files_list.append({'filename': v, 'groupname': 'user_activity', 'columns': device})

print('=' * 50)

## More Shaping on User Activity

In [None]:
# Echo the path, group name and column standard of every grouped file.
# filename / groupname / grp_columns stay bound to the last entry after the
# loop and are read again by the shaping cell that follows.
for file in grouped_export_files_list:
    filename, groupname, grp_columns = file['filename'], file['groupname'], file['columns']
    print(filename, groupname, grp_columns, sep='\n')

In [None]:
# Shape the CSV in place: add or trim columns to bring the file to its standard.
# NOTE: this overwrites `filename` on disk. `filename` and `grp_columns` are
# the values left bound by the last iteration of the preceding loop.

# Read every row before writing: opening the file in a write mode truncates
# it, so the data must already be in memory when the file is reopened.
with open(filename, 'r', newline='') as file_reading:
    shaped_rows = list(csv.DictReader(file_reading))

with open(filename, 'w', newline='') as file_shaping:
    # extrasaction='ignore' drops columns that are not part of the standard;
    # columns missing from a row are written as empty values.
    csv_writer = csv.DictWriter(file_shaping, fieldnames=grp_columns, extrasaction='ignore', delimiter=',')
    csv_writer.writeheader()
    csv_writer.writerows(shaped_rows)

## For those files that have the key column, shape and add source info

### 1. Add in any missing columns against the standard so data columns line up on import
### 2. Trim extra df columns to match column standards
### 3. Replace any known type mismatch values before setting datetime
### 4. Add source file data as columns at end of dataframe (record the file creation, modified, and fullpath name)
### 5. Parse known date columns to datetime so the types are correct in db import
### 6. EXPORT to PostgreSQL

In [None]:
# Shape every grouped CSV to its column standard, attach the source-file
# details, convert the date columns and write the result out.


def _strip_if_str(value):
    """Return *value* without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


def date_parser(c):
    """Convert *c* to datetime; values that cannot be parsed become NaT."""
    return pd.to_datetime(c, errors='coerce')


# Group names that mark a user-activity (audit log) export. The grouping step
# assigns 'user_activity'; 'UserActivity' is accepted as well for
# compatibility with the file-name marker.
user_activity_groups = ('user_activity', 'UserActivity')

# Date columns to convert, per kind of export.
details_manage__dates = [
    'Create Date',
    'Last Seen',
    'Last Reboot',
    'Source Creation Date',
    'Source Modified Date'
]

audit_log_dates = [
    'Date/Time'
]

# filename prefix timestamp format
time_format = '%Y_%m_%d_%H%M%S'

for file in grouped_export_files_list:
    filename = file['filename']
    groupname = file['groupname']
    grp_columns = file['columns']
    print(filename)

    # pull source time from file properties
    # (srctime is a project helper; its result is used as a mapping of
    # column name -> value and is expected to hold 'Source Modified Date')
    source_info = srctime(filename)

    is_user_activity = groupname in user_activity_groups

    # Strings read as missing values. 'Currently Online' is only treated as
    # missing for the device / patch-management exports.
    if is_user_activity:
        na_values = ['null', '(null)']
    else:
        na_values = ['Currently Online', 'null', '(null)']

    # Import CSV to Pandas
    print(f"reading file ['{filename}'] and ['{groupname}']!")
    df = pd.read_csv(filename, na_values=na_values, skipinitialspace=True)

    # if column is missing in dataframe add it before upload to prevent
    # mismatch or multi indexed columns
    for c in grp_columns:
        if c not in df:
            df[c] = np.nan

    # DTYPES

    # prepare DTYPE values to match predicted values - get column initial dtypes
    column_dtypes = dict(df.dtypes)

    for k, v in column_dtypes.items():

        # strip whitespace from the strings of object columns; the result is
        # assigned back, and non-string cells are left untouched
        if v == 'object':
            df[k] = df[k].map(_strip_if_str)

        # Condition boolean values for postgreSQL: (True,False,NULL) Only!
        elif v == 'bool':
            df[k] = df[k].mask(df[k] == '', pd.NA)

    # cells holding nothing but whitespace count as missing
    df = df.replace(r'^\s+$', np.nan, regex=True)

    # Replacement values for type mismatch. replace() returns a new frame, so
    # the result has to be assigned for the replacement to take effect.
    replace_dict = {
        'Currently Online': source_info['Source Modified Date'],
        '': pd.NA
    }
    df = df.replace(replace_dict)

    # drop non-standard columns
    df = df.drop(columns=[col for col in df.columns if col not in grp_columns])

    # add source info to new columns k with values v
    for k, v in source_info.items():
        df[k] = v

    # choose the date columns by group, keeping only those that exist so a
    # column removed by the column standard cannot raise a KeyError
    parse_dates = audit_log_dates if is_user_activity else details_manage__dates
    parse_dates = [col for col in parse_dates if col in df.columns]

    # convert dates to datetime
    if parse_dates:
        df[parse_dates] = df[parse_dates].apply(date_parser)

    # section can be uncommented for seed data creation for import on a new table to set column names and types
    # NOTE(review): every iteration writes to the same '.csv' path, so only the
    # last file survives; `time_format` looks intended for a per-file name
    # prefix - confirm the wanted output name before changing it.
    df.to_csv('.csv')

    # export to postgresql
    #df.to_sql(groupname, con=engine, if_exists='append', index=False, index_label=None, schema='datto_rmm')
    print('=' * 100)
    print('')
    print('=' * 100)