In [None]:
# -----------------------------------------------------------------------------------------------
# Title: CSV - Concat + Join Multiple CSV Column to Column Alignment
# Author: Gabe McWilliams
# Purpose: When importing data from multiple CSVs, the DataFrames do not share the same columns, which makes the daily delta inaccurate
# Date of Creation: 2022/05/03
# Version 1.1
# -----------------------------------------------------------------------------------------------

# =============================================================================
# 2022/05/08 - Working to add functionality that will sort all files by modified date
# =============================================================================

## Import Modules

In [None]:
import modules.field_standards as fs
from modules.source_file_info import AddTime as srctime
import datetime
import re
import os
import csv
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cufflinks as cf
import chart_studio.plotly as py
import seaborn as sns
import plotly.express as px
import plotly.offline as offline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
%matplotlib inline

## Arguments and Declarations


In [None]:
# Selected columns from the Master Device View export CSV.
# This list is the column mask: after the outer merge on the key column, any
# column of the merged frame that is not named here is dropped.
std_columns = [
    'Device UID',
    'Site Name',
    'Site UID',
    'Device Hostname',
    'Create Date',
    'Last Seen',
    'Last Audit Date',
    'Policy',
    'Patches Approved Pending',
    'Patches Not Approved',
    'Patches Installed',
    'Patch Status',
    'Schedule',
    'Last Run',
    'Operating System',
    'Device CPU',
    'Physical CPU Cores',
    '.NET Version',
    'Memory', 
    'Device Type',
    'Domain',
    'Disk Drive (total/free)',
    'Online Duration (hrs)',
    'Architecture',
    'Last Reboot',
    'Reboot required',
    'Int IP Address', 
    'User-Defined Field 10', 
    'MAC Address(es)',
    'Software Status',
    'Group',
    'Antivirus Product',
    'Antivirus Status',
    'Source Modified Date',
    'Source Creation Date',
    'Source Filename'
 ]

In [None]:
# Sanity check on the column mask: count how often each name occurs and
# display only the names that appear more than once (empty output = no duplicates)
std_col_ser = pd.Series(std_columns).value_counts()
std_col_ser.loc[std_col_ser.gt(1)]

In [None]:
# Timestamp layout (strftime format) used as a file name prefix
time_format = '%Y_%m_%d_%H%M%S'

# Name of the key column the CSV files are joined on
fieldnames_to_compare = 'Device UID'

# Root folder that is walked recursively to find the raw CSV exports
source_dir = 'd:/data_sets/raw/'

# Date parsing options: date_parser converts a column to datetime and turns
# unparseable values into NaT; parse_dates lists the columns it is applied to
date_parser = lambda c: pd.to_datetime(c, errors='coerce')
parse_dates =  ['Create Date', 'Last Seen','Last Reboot']

# Strings meant to be treated as missing values
# NOTE(review): not passed to any pd.read_csv call in the visible cells - confirm whether it should be
na_values = ['Currently Online','null', '(null)']

# Files that contain the join key column, stored as {file name: full path}
included_files = {}

# Files that lack the join key column and therefore cannot be merged, stored as {file name: full path}
excluded_files = {}

# Regex meant to group files that are stacked on rows rather than merged on columns,
# so rows are not dropped for lack of a key match when files are combined in random order.
# NOTE: `pattern` is reassigned by a later cell before it is first used.
pattern = re.compile(r'^(\w+)_')

# Placeholder for the final dataframe before training and visualization
# (replaced by the merge cell further down)
df_clean = pd.DataFrame(columns=std_columns)

# CSV file types: for each export type, the file name prefix to match,
# the list of matching files and the list of dataframes loaded from them
devices_tab_export_filename = 'DeviceDetailsExport'
devices_tab_export_files = []
df_devices_list = []




manage_tab_export_filename = 'SystemDeviceSelection'
manage_tab_export_files = []
df_manage_list = []

<h1>Read all files in source_dir and sub directories</h1>
    <h3> Filter by '.csv' </h3>

In [None]:
# Walk source_dir recursively and record every CSV file as {file name: full path}.
# NOTE: keys are bare file names, so two files with the same name in different
# sub-folders collapse into a single entry (the last one walked wins).
source_csv_dict = {}
for root, dirs, files in os.walk(source_dir):
    for file in files:
        # Test the extension itself: a substring test would also accept names
        # such as 'export.csv.bak'. lower() additionally accepts '.CSV'.
        if file.lower().endswith('.csv'):
            source_csv_dict[file] = os.path.join(root, file)

print('All CSV Files found before futher vetting and filtering')
print('='*50)
for file in source_csv_dict:
    print(file)

<h1>Sorting and Excluding Files</h1>

## Read all csv file columns and create two lists of files:
### Those with the chosen merge key column will be kept and the remaining filenames will not be called any further

In [None]:
# Read every discovered CSV and sort it by whether it carries the join key column:
# files with the key go to included_files, all others to excluded_files.
# k is the file name, v the full path.
for k, v in source_csv_dict.items():
    df = pd.read_csv(v)
    if fieldnames_to_compare not in df.columns:
        # report the file being inspected (the old message used an undefined
        # name `filename` and raised NameError on the first file without the key)
        print(f'Missing Key: {fieldnames_to_compare} to Join in {k}')
        excluded_files.update({k: v})
    else:
        included_files.update({k: v})

In [None]:
# Report both groups with the same layout: heading, thin rule, one file name
# per line, thick rule
for heading, file_group in (
    ('Files with CORRECT join key column:', included_files),
    ('Files MISSING join key column:', excluded_files),
):
    print(heading)
    print('-'*50)
    for file in file_group:
        print(file)
    print('='*50)

## Parse accepted CSVs for the file description and store it as dictionary key-value pairs

In [None]:
# Splits a file name into: group 1 = leading run of letters, group 2 = optional
# '_' or '-' separator, group 3 = the run of letters that follows.
# The '' in the middle ends one string literal and starts the next (implicit
# concatenation), so the compiled alternation is (\_|\-|) with an empty last alternative.
# This rebinds the `pattern` name assigned in the declarations cell.
pattern = re.compile(r'^([a-zA-Z]{0,})(\_|\-|''){0,1}([a-zA-Z]{0,})')

In [None]:
# Assign every accepted file to its export type based on the file name:
# group 1 of the match is compared with the device-details prefix, group 3 with
# the manage-tab prefix. Files matching neither are left out of both lists.
for k, v in included_files.items():
    matches = pattern.search(k)
    if matches[1] == devices_tab_export_filename:
        print(f'''['{v}'] matches: ['{devices_tab_export_filename}'] on ['{matches[1]}']''')
        # tag with the device-details group (was mistakenly tagged with the manage-tab name)
        devices_tab_export_files.append({'filename': v, 'groupname': devices_tab_export_filename})
    elif matches[3] == manage_tab_export_filename:
        print(f'''['{v}'] matches: ['{manage_tab_export_filename}'] on ['{matches[3]}']''')
        manage_tab_export_files.append({'filename': v, 'groupname': manage_tab_export_filename})

print('='*50)

## For those files that have the key column, set index col and add source info
### 1. Add source file data as columns at end of dataframe (record the file creation, modified, and fullpath name)
### 2. Set index col = fieldnames_to_compare variable list

In [None]:
def map_source(source_file):
    """Load one CSV indexed by the join key and tag it with its source-file info.

    Parameters
    ----------
    source_file : path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame indexed by the `fieldnames_to_compare` column, with one
    extra column per item returned by `srctime(source_file)`; each of those
    columns holds the item's value on every row.
    """
    # gather the file's source info first, then read the data
    file_properties = srctime(source_file)
    frame = pd.read_csv(source_file, index_col=fieldnames_to_compare)

    # one new column per source-info item
    for column_name, column_value in file_properties.items():
        frame[column_name] = column_value
    return frame

## Attempt 2 at updating data correctly on import.  Attempting to use iloc for each df row

In [None]:
# Empty DataFrame that carries only the standard column names, to be filled later
# NOTE(review): df_master is not referenced again in the visible cells
df_master = pd.DataFrame(columns=std_columns)

In [None]:
# NOTE(review): '.csv' looks like a placeholder or redacted file name - point this
# at a real export file (presumably a DeviceDetailsExport CSV) before running
test_details = pd.read_csv('.csv')

In [None]:
# NOTE(review): '.csv' looks like a placeholder or redacted file name - point this
# at a real export file (presumably a SystemDeviceSelection CSV) before running
test_manage = pd.read_csv('.csv')

In [None]:
# Print every row of the device-details test frame.
# The old loop read `df_test1.row`: `df_test1` is never defined in this notebook
# and a DataFrame has no `.row` attribute; iterrows() yields (index, row) pairs.
# NOTE(review): test_details is assumed to be the intended frame - confirm.
for row_label, row in test_details.iterrows():
    print(row)

In [None]:
# Load every device-details export with its source info attached.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot load the same file twice.
# NOTE: the 'Attempt 1' cell further down loads the same files again.
df_devices_list = [map_source(file['filename']) for file in devices_tab_export_files]
    

## Attempt 1 at updating data correctly on import.  Still having issues with micromanaging file order on join

In [None]:
# Load every grouped export file, with its source info attached, into one list
# of dataframes per export type.
# Both lists are rebuilt from scratch rather than appended to: appending made
# every re-run of this cell (and the earlier cell that already loads the
# device-details files) add each file again, duplicating its rows in the
# concatenated frames.
df_devices_list = [map_source(file['filename']) for file in devices_tab_export_files]
df_manage_list = [map_source(file['filename']) for file in manage_tab_export_files]

In [None]:
# Stack the per-file frames of each export type row-wise into a single frame per type
df_devices = pd.concat(objs=df_devices_list, axis='index')
df_manage = pd.concat(objs=df_manage_list, axis='index')

In [None]:
# Column names, non-null counts and dtypes of the stacked manage-tab frame
df_manage.info()

In [None]:
# Display the stacked manage-tab frame
df_manage

In [None]:
# Column names, non-null counts and dtypes of the stacked device-details frame
df_devices.info()

<h1>Join, Concat, and Merge</h1>

## Files of the same export type should be concatenated by row (stacked)

## Merge all dataframes (csv's) into an empty dataframe that contains all columns without data

In [None]:
# Outer-merge every dataframe in df_list on the key column, starting from the
# first element and folding each of the remaining frames into it.
# NOTE(review): df_list is only assigned in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError; presumably it should hold the
# stacked frames, e.g. [df_devices, df_manage] - confirm.
df_clean = df_list[0]

for df_object in df_list[1:]:
    # merge() returns a new frame and leaves its inputs untouched, so the result
    # must be assigned back (it used to be discarded, leaving df_clean == df_list[0])
    df_clean = df_clean.merge(df_object, on=fieldnames_to_compare, how='outer', suffixes=('', '_drop'))
    # Drop the right-hand copies of overlapping columns straight away; keeping
    # them would make the next merge produce a second, clashing '<name>_drop' column.
    # NOTE(review): rows that exist only in the right frame lose their values
    # for the overlapping columns - confirm this is acceptable.
    df_clean = df_clean.drop(columns=[col for col in df_clean.columns if col.endswith('_drop')])

In [None]:
# Drop the duplicate columns that the merge renamed with the '_drop' suffix.
# Match on the suffix only: a substring test would also delete any real column
# whose name merely contains 'drop'.
df_clean = df_clean.drop(columns=[col for col in df_clean.columns if col.endswith('_drop')])

## Trim any columns not in the standard columns list 'std_columns'

In [None]:
# Number of columns before trimming to the standard column list
len(df_clean.columns)

In [None]:
# Remove, in place, every column whose name is not part of the standard column list
non_standard_columns = [name for name in df_clean.columns if name not in std_columns]
df_clean.drop(columns=non_standard_columns, inplace=True)

In [None]:
# Number of columns left after trimming to the standard column list
len(df_clean.columns)

In [None]:
# Find column names that occur more than once and keep only the first occurrence
# of each. (The old code passed the list of duplicated names to rename(), which
# expects a mapping or function and so raised TypeError - and would not have
# dropped anything.)
dup_mask = df_clean.columns.duplicated()          # True for every repeat after the first
dup_cols = list(df_clean.columns[dup_mask].unique())
if dup_cols:
    df_clean = df_clean.loc[:, ~dup_mask]
df_clean

## Use the list 'parse_dates' as the datetime column targets and 'date_parser' as the function applied to each target column to convert its values to datetime

In [None]:
# Convert the date columns to datetime: date_parser is applied to each column
# listed in parse_dates and the converted columns are written back
converted_dates = df_clean[parse_dates].apply(date_parser)
df_clean[parse_dates] = converted_dates

## Add 'Offline 30 days' and 'Extended Reboot' Columns as datetime delta calculations from day this report is run

In [None]:
# Flag devices by age of their last contact / last reboot, measured from the
# moment this report is run.
# (The previous fillna(..., inplace=True) acted on a temporary copy made by
# df_clean[parse_dates] and never changed df_clean, so it has been removed.
# Missing timestamps stay NaT; comparisons with NaT are False, so devices
# without a timestamp are simply not flagged.)
stale_cutoff = datetime.datetime.now() - pd.Timedelta(days=30)

# Filter - Devices Offline 30 Days: last seen BEFORE the cutoff
# (the old '>' comparison flagged the devices seen within the last 30 days instead)
df_clean['Offline 30 Days'] = df_clean['Last Seen'] < stale_cutoff

# Filter - Last Reboot Extended Duration: last reboot BEFORE the cutoff
df_clean['Last Reboot Extended'] = df_clean['Last Reboot'] < stale_cutoff

## Apply heatmap to review any NaN or NaT (null) values before they can be dropped

In [None]:
# Heatmap of the null mask: one cell per value, highlighted where the value is missing
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(data=df_clean.isnull(), yticklabels=False, cbar=False, cmap='plasma', ax=ax)

<h1>Start ML Trials</h1>

In [None]:
## Create MS Patching pairplot DataFrame
# Work on a copy: a plain assignment only creates a second name for df_clean,
# so the in-place column drops in the following cells would remove the columns
# from df_clean as well.
df_patch_pair = df_clean.copy()

## Convert 'Category' columns into numbers to get value relationships

In [None]:
# Replace the categorical 'Patch Status' column with one indicator column per status.
d_patch_status = pd.get_dummies(df_clean['Patch Status'], prefix='d', prefix_sep='_')
# axis=1 places the indicator columns next to the existing ones; the default
# (axis=0) stacked them underneath as extra rows. drop() without inplace keeps
# the frame that df_patch_pair was created from untouched.
df_patch_pair = pd.concat([df_patch_pair.drop(columns='Patch Status'), d_patch_status], axis=1)

In [None]:
# Replace the boolean 'Offline 30 Days' column with a single indicator column
# (drop_first removes the indicator of the first category).
d_offline_30days = pd.get_dummies(df_clean['Offline 30 Days'], drop_first=True, prefix='d', prefix_sep='_')
d_offline_30days = d_offline_30days.rename(columns={'d_True': 'd_Offline_30 Days'})
# axis=1 adds the indicator as a column; the default (axis=0) appended it as extra rows
df_patch_pair = pd.concat([df_patch_pair.drop(columns='Offline 30 Days'), d_offline_30days], axis=1)

In [None]:
# Replace the boolean 'Last Reboot Extended' column with a single indicator column
# (drop_first removes the indicator of the first category).
d_last_reboot_ext = pd.get_dummies(df_clean['Last Reboot Extended'], drop_first=True, prefix='d', prefix_sep='_')
d_last_reboot_ext = d_last_reboot_ext.rename(columns={'d_True': 'd_Last Reboot Extended'})
# axis=1 adds the indicator as a column; the default (axis=0) appended it as extra rows
df_patch_pair = pd.concat([df_patch_pair.drop(columns='Last Reboot Extended'), d_last_reboot_ext], axis=1)

In [None]:
# Replace the categorical 'Antivirus Status' column with indicator columns
# (drop_first removes the indicator of the first category); the
# 'Running & up-to-date' indicator is renamed to 'd_AV Status Ok'.
d_av_status = pd.get_dummies(df_clean['Antivirus Status'], drop_first=True, prefix='d', prefix_sep='_')
d_av_status = d_av_status.rename(columns={'d_Running & up-to-date': 'd_AV Status Ok'})
# axis=1 adds the indicators as columns; the default (axis=0) appended them as extra rows
df_patch_pair = pd.concat([df_patch_pair.drop(columns='Antivirus Status'), d_av_status], axis=1)

In [None]:
# Replace the 'Reboot required' column with a single indicator column
# (drop_first removes the indicator of the first category).
d_reboot_required = pd.get_dummies(df_clean['Reboot required'], drop_first=True, prefix='d', prefix_sep='_')
d_reboot_required = d_reboot_required.rename(columns={'d_True': 'd_Reboot required'})
# axis=1 adds the indicator as a column; the default (axis=0) appended it as extra rows
df_patch_pair = pd.concat([df_patch_pair.drop(columns='Reboot required'), d_reboot_required], axis=1)
d_reboot_required

In [None]:
# Plot the pairwise relationships between the columns of the patching frame
sns.pairplot(data=df_patch_pair)

In [None]:
# List the column names of the patching frame, one per line
for column_name in df_patch_pair.columns.tolist():
    print(column_name)

In [None]:
# Line plot of pending approved patches against the last reboot time, saved to disk
# NOTE(review): '.png' has no base name - looks like a placeholder output path
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(30, 15))
sns.lineplot(data=df_patch_pair, x='Last Reboot', y='Patches Approved Pending', lw=.5, ax=ax)
fig.savefig('.png')

In [None]:
# How often each 'Online Duration (hrs)' value occurs
df_patch_pair['Online Duration (hrs)'].value_counts()

In [None]:
# Distinct site names present in the patching frame
df_patch_pair['Site Name'].unique()

In [None]:
# Bar plot of online duration per site
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(30, 15))
sns.barplot(data=df_patch_pair, x='Site Name', y='Online Duration (hrs)', ax=ax)

In [None]:
# Legacy merge-and-export step: read every shaped CSV, outer-merge the frames on
# the key column, write the result to merged_dir and optionally delete the inputs.
# NOTE(review): all_shaped_csv, shaped_dir and merged_dir are not assigned in any
# visible cell of this notebook - they must be defined before this cell can run.

var_dict = {}    # label ('df0', 'df1', ...) -> full path of each shaped file
df_list = []     # one dataframe per shaped file, in the same order
df_names = []    # full path of every file that was read

# build the label -> path dictionary before the import loop
for i, filename in enumerate(all_shaped_csv):
    var_dict['df' + str(i)] = shaped_dir + filename

# import every file and collect the resulting dataframes
for label, path in var_dict.items():
    frame = pd.read_csv(path)
    df_names.append(path)
    df_list.append(frame)

if df_list:
    # Fold every remaining frame into the first one.
    # - merge() returns a new frame, so the result has to be kept (it used to be
    #   discarded, which exported the first file unchanged).
    # - the key is passed as the plain column name: list('Device UID') would
    #   split the string into single characters.
    # - overlapping columns keep the left-hand values, matching the '_drop'
    #   convention of the main merge cell; the right-hand copies are removed
    #   immediately so the next merge cannot create clashing names.
    df_merged = df_list[0]
    for df_object in df_list[1:]:
        df_merged = df_merged.merge(df_object, on=fieldnames_to_compare, how='outer', suffixes=('', '_drop'))
        df_merged = df_merged.drop(columns=[col for col in df_merged.columns if col.endswith('_drop')])

    # UTC timestamp for the output name, using the notebook-wide time_format
    # (timezone-aware now() replaces the deprecated utcnow(); same formatted text)
    current_time = datetime.datetime.now(datetime.timezone.utc).strftime(time_format)

    # add 'merged_' to the start of the file name and export
    df_merged.to_csv(merged_dir + 'merged_' + current_time + ".csv", index=False)

    # clean up intermediate data (only reached after a successful export)
    cleanup = True
    if cleanup:
        for s in all_shaped_csv:
            path = shaped_dir + s
            print(f'Removing file {path}')
            os.remove(path)
else:
    # nothing was read, so there is nothing to merge, export or delete
    print('No shaped CSV files found to merge')







In [None]:
# Display the dataframe most recently bound to `df`
df

In [None]:
# Read every discovered CSV and add its source-file info as extra columns,
# printing each key/value that is attached.
# `all_source_csv` only ever existed in commented-out code, so this iterates the
# {file name: full path} mapping built by the directory walk. Its values are
# already full paths, so source_dir is not prepended again.
for source_path in source_csv_dict.values():
    source_info = srctime(source_path)
    df = pd.read_csv(source_path)
    for k, v in source_info.items():
        print(f'The key is {k}')
        print(f'The value is {v}')
        # the same value is written to every row of the new column
        df[k] = v

In [None]:
# Display the last dataframe produced by the loop above
df