In [1]:
import json
import re

In [2]:
# Log files parsed by the cells below.
file1 = 'rocopylog.txt'  # read for the Source/Dest header and the metrics table
file2 = 'rocopylog_invalid_source.txt'  # read for the ERROR message

# Read one of the files above into a list of lines

In [3]:
def open_file(file):
    """Read a UTF-8 encoded text file and return its lines.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read.

    Returns
    -------
    list of str
        One entry per line of the file; line endings are kept.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        # Iterating a text handle yields the same lines as readlines().
        return list(handle)

# Pattern to Identify Header

In [4]:
# Matches a header line of the form "<spaces>Source : <path>" or
# "<spaces>Dest : <path>".  `type` captures which of the two it is and `dir`
# the path that follows.
#
# The path is captured lazily up to an optional trailing backslash plus any
# trailing whitespace at the end of the line.  Ending the group with `\b`
# instead is lossy: `\b` only recognises `\w` characters, so a path whose last
# character is a combining mark (e.g. a Tamil virama or Devanagari vowel sign)
# or punctuation such as ")" would be cut short.
# re.MULTILINE lets `$` match at every line end, so the pattern works both on
# single lines and on a multi-line string.
# NOTE(review): the literal `C` restricts matches to paths that start with "C".
PATTERN_SOURCE_DESTN = re.compile(
    r'\s+(?P<type>Source|Dest) : (?P<dir>C.*?)\\?\s*$',
    re.MULTILINE,
)

In [5]:
def identify_header(lines, pattern):
    """Collect the `type` and `dir` groups of every header match.

    Parameters
    ----------
    lines : iterable of str
        Text lines to scan; each line is searched independently.
    pattern : compiled regex or str
        Pattern passed to ``re.finditer``; it must define the named groups
        ``type`` and ``dir``.

    Returns
    -------
    dict
        ``{'type': [...], 'dir': [...]}`` with one entry per match, in the
        order the matches were found.
    """
    kinds, directories = [], []
    for text in lines:
        # finditer is used, so a single line may contribute several matches.
        for hit in re.finditer(pattern, text):
            kinds.append(hit.group('type'))
            directories.append(hit.group('dir'))
    return {'type': kinds, 'dir': directories}

In [6]:
# Read the first log file and extract its Source/Dest header entries.
headers = identify_header(lines=open_file(file=file1), pattern=PATTERN_SOURCE_DESTN)

In [7]:
# Show every captured field next to its list of values.
for key in headers:
    val = headers[key]
    print(key, val)

type ['Source', 'Dest']
dir ['C:\\RegularExpressionsWithDotNet\\robocopytest\\source\\தமிழ்\\हिन्दी\\English', 'C:\\RegularExpressionsWithDotNet\\robocopytest\\destn']


# Pattern to Capture Error Message

In [8]:
# Matches "<YYYY/MM/DD HH:MM:SS> ERROR <message>".  `ts` captures the
# timestamp and `error` the text after "ERROR ".
#
# The message is captured lazily up to an optional trailing backslash plus any
# trailing whitespace at the end of the line.  Ending the group with `\b`
# instead is lossy: `\b` only recognises `\w` characters, so a message whose
# last character is a combining mark (e.g. a Tamil virama or Devanagari vowel
# sign) or punctuation such as "." or ")" would be cut short.
# re.MULTILINE lets `$` match at every line end, so the pattern works both on
# single lines and on a multi-line string.
PATTERN_ERROR_MSG = re.compile(
    r'(?P<ts>\b\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) ERROR '
    r'(?P<error>.*?)\\?\s*$',
    re.MULTILINE,
)

In [9]:
def capture_error_msg(lines, pattern):
    """Collect the `ts` and `error` groups of every error-line match.

    Parameters
    ----------
    lines : iterable of str
        Text lines to scan; each line is searched independently.
    pattern : compiled regex or str
        Pattern passed to ``re.finditer``; it must define the named groups
        ``ts`` and ``error``.

    Returns
    -------
    dict
        ``{'ts': [...], 'error': [...]}`` with one entry per match, in the
        order the matches were found.
    """
    fields = ('ts', 'error')
    collected = {field: [] for field in fields}
    for text in lines:
        for hit in re.finditer(pattern, text):
            for field in fields:
                collected[field].append(hit.group(field))
    return collected

In [10]:
# Read the second log file and extract its timestamped ERROR entries.
errors = capture_error_msg(lines=open_file(file=file2), pattern=PATTERN_ERROR_MSG)

In [11]:
# Show every captured field next to its list of values.
for key in errors:
    val = errors[key]
    print(key, val)

ts ['2016/05/28 19:24:30']
error ['2 (0x00000002) Accessing Source Directory C:\\RegularExpressionsWithDotNet\\robocopytest\\source2']


# Pattern to Capture Metrics Table

In [12]:
# Matches one row of the summary table: a "Dirs", "Files" or "Bytes" label
# followed by " :" and six whitespace-separated integer cells, captured as
# total / copied / skipped / mismatch / failed / extras.
# NOTE(review): every cell must be a bare integer (`\d+`); a row whose cells
# carry a decimal point or a unit suffix would not match - confirm the logs
# always report plain counts.
PATTERN_METRICS_TBL = re.compile(
    r'\s+(?P<type>Dirs|Files|Bytes) :\s+'
    r'(?P<total>\d+)\s+'
    r'(?P<copied>\d+)\s+'
    r'(?P<skipped>\d+)\s+'
    r'(?P<mismatch>\d+)\s+'
    r'(?P<failed>\d+)\s+'
    r'(?P<extras>\d+)'
)

In [13]:
def capture_metrics_tbl(lines, pattern):
    """Collect the summary-table columns of every matching row.

    Parameters
    ----------
    lines : iterable of str
        Text lines to scan; each line is searched independently.
    pattern : compiled regex or str
        Pattern passed to ``re.finditer``; it must define the named groups
        ``type``, ``total``, ``copied``, ``skipped``, ``mismatch``,
        ``failed`` and ``extras``.

    Returns
    -------
    dict
        One list per column name (in the order listed above), each holding
        the captured text of that group for every match.  Values are the
        matched strings; no numeric conversion is performed.
    """
    columns = ('type', 'total', 'copied', 'skipped',
               'mismatch', 'failed', 'extras')
    table = {column: [] for column in columns}
    for row in lines:
        for found in re.finditer(pattern, row):
            for column in columns:
                table[column].append(found.group(column))
    return table

In [14]:
# Read the first log file again and extract its Dirs/Files/Bytes summary rows.
metrics = capture_metrics_tbl(lines=open_file(file=file1), pattern=PATTERN_METRICS_TBL)

In [15]:
# Show every captured column next to its list of values.
for key in metrics:
    val = metrics[key]
    print(key, val)

type ['Dirs', 'Files', 'Bytes']
total ['7', '29', '133567']
copied ['6', '29', '133567']
skipped ['1', '0', '0']
mismatch ['0', '0', '0']
failed ['0', '0', '0']
extras ['0', '0', '0']


# Convert dictionaries to JSON format

In [16]:
# ensure_ascii=True is the json.dumps default, spelled out here: non-ASCII
# characters are emitted as \uXXXX escapes in the printed text.
print(json.dumps(headers, ensure_ascii=True))

{"type": ["Source", "Dest"], "dir": ["C:\\RegularExpressionsWithDotNet\\robocopytest\\source\\\u0ba4\u0bae\u0bbf\u0bb4\u0bcd\\\u0939\u093f\u0928\u094d\u0926\u0940\\English", "C:\\RegularExpressionsWithDotNet\\robocopytest\\destn"]}


In [17]:
# ensure_ascii=True is the json.dumps default, spelled out here: non-ASCII
# characters are emitted as \uXXXX escapes in the printed text.
print(json.dumps(errors, ensure_ascii=True))

{"ts": ["2016/05/28 19:24:30"], "error": ["2 (0x00000002) Accessing Source Directory C:\\RegularExpressionsWithDotNet\\robocopytest\\source2"]}


In [18]:
# ensure_ascii=True is the json.dumps default, spelled out here: non-ASCII
# characters are emitted as \uXXXX escapes in the printed text.
print(json.dumps(metrics, ensure_ascii=True))

{"type": ["Dirs", "Files", "Bytes"], "total": ["7", "29", "133567"], "copied": ["6", "29", "133567"], "skipped": ["1", "0", "0"], "mismatch": ["0", "0", "0"], "failed": ["0", "0", "0"], "extras": ["0", "0", "0"]}


In [19]:
# Write the header dictionary to headers.json as UTF-8.  ensure_ascii=False
# keeps non-ASCII characters as-is rather than \uXXXX escapes; indent=1
# pretty-prints with one space per nesting level.
with open('headers.json', mode='w', encoding='utf-8') as wr:
    json.dump(headers, wr, ensure_ascii=False, indent=1)

In [20]:
# Write the error dictionary to errors.json as UTF-8.  ensure_ascii=False
# keeps non-ASCII characters as-is rather than \uXXXX escapes; indent=1
# pretty-prints with one space per nesting level.
with open('errors.json', mode='w', encoding='utf-8') as wr:
    json.dump(errors, wr, ensure_ascii=False, indent=1)

In [21]:
# Write the metrics dictionary to metrics.json as UTF-8.  ensure_ascii=False
# keeps non-ASCII characters as-is rather than \uXXXX escapes; indent=1
# pretty-prints with one space per nesting level.
with open('metrics.json', mode='w', encoding='utf-8') as wr:
    json.dump(metrics, wr, ensure_ascii=False, indent=1)