In [None]:
import os
import datetime
import json
import pymongo
import decimal
import urllib.parse
import numpy as np
import pandas as pd
import time

def float_range(start, stop, step):
	"""Yield floats from ``start`` up to, but not including, ``stop``.

	The running value is accumulated as a ``decimal.Decimal`` and only converted
	to ``float`` when it is yielded, so the additions themselves are carried out
	in decimal arithmetic rather than binary floating point.

	Args:
		start: first value to yield (anything ``decimal.Decimal`` accepts, e.g. int or float).
		stop: exclusive upper bound; compared against the running Decimal value.
		step: increment between consecutive values; must be greater than zero.

	Yields:
		float: ``start``, ``start + step``, ``start + 2 * step``, ... while the value is below ``stop``.

	Raises:
		ValueError: if ``step`` is zero or negative, which could never reach ``stop``.
	"""
	# convert the step once instead of on every iteration
	increment = decimal.Decimal(step)
	if increment <= 0:
		raise ValueError(f"step must be greater than zero, got {step!r}")
	# convert the start as well: a float cannot be added to a Decimal directly,
	# so a float start would otherwise fail on the first increment
	current = decimal.Decimal(start)
	while current < stop:
		yield float(current)
		current += increment

def generate_timestamps(time_offset, segment_start_datetime):
	"""Return the integer timestamp of a single sample.

	The sample time is ``segment_start_datetime`` shifted by ``time_offset``
	seconds. Its POSIX timestamp (seconds) is multiplied by the module-level
	``timestamp_conversion_factor`` and truncated with ``int()``.

	Args:
		time_offset: offset of the sample from the segment start, in seconds.
		segment_start_datetime: ``datetime.datetime`` at which the segment started.

	Returns:
		int: the scaled timestamp of the sample.
	"""
	# NOTE(review): the multiplication is done in floating point, so with a large
	# conversion factor the lowest digits of the result are not exact - confirm
	# this is acceptable for the consumers of these timestamps.
	# NOTE(review): datetime.timestamp() interprets a naive datetime as local
	# time - confirm that is the intended interpretation of the recorded times.
	shifted_time = segment_start_datetime + datetime.timedelta(seconds=time_offset)
	return int(shifted_time.timestamp() * timestamp_conversion_factor)

def index_channel_values_by_timestamp(timestamps, channel_values_dict):
	"""Regroup per-channel value lists into one dict per timestamp.

	Args:
		timestamps: iterable of timestamps; the i-th timestamp belongs to the
			i-th value of every channel.
		channel_values_dict: mapping of channel name -> sequence of values.

	Returns:
		dict: timestamp -> {channel name: value at that timestamp's position}.
		If a timestamp occurs more than once, the values of its last
		occurrence are kept.

	Raises:
		IndexError: if a channel holds fewer values than there are timestamps.
	"""
	return {
		stamp: {name: series[position] for name, series in channel_values_dict.items()}
		for position, stamp in enumerate(timestamps)
	}

def check_datetime(datetime_to_check, correct_datetime, current_file):
	"""Make the date and hour of a datetime agree with a reference datetime.

	Only the calendar date and the hour are compared. Whichever of the two
	differs is overwritten with the reference value; minutes, seconds and
	smaller fields of ``datetime_to_check`` are kept. Every correction is
	reported on stdout together with the old and new value.

	Args:
		datetime_to_check: datetime read from a data file.
		correct_datetime: reference datetime whose date and hour are trusted.
		current_file: file name used in the printed messages.

	Returns:
		``datetime_to_check`` itself when date and hour already agree,
		otherwise a corrected copy.
	"""
	same_date = datetime_to_check.date() == correct_datetime.date()
	same_hour = datetime_to_check.hour == correct_datetime.hour
	if same_date and same_hour:
		return datetime_to_check

	# collect only the fields that actually need replacing
	overrides = {}
	if not same_date:
		overrides["year"] = correct_datetime.year
		overrides["month"] = correct_datetime.month
		overrides["day"] = correct_datetime.day
	if not same_hour:
		overrides["hour"] = correct_datetime.hour

	if same_date:
		mismatch = "Hours"
	elif same_hour:
		mismatch = "Dates"
	else:
		mismatch = "Dates and hours"
	print(f"\n{mismatch} do not match in {current_file}")

	corrected = datetime_to_check.replace(**overrides)
	print(f"Previous segment start time: {datetime_to_check}")
	print(f"New segment start time: {corrected}")
	return corrected

# factor applied to POSIX timestamps (seconds) before they are truncated to
# integers; 10**9 turns seconds into nanoseconds
timestamp_conversion_factor = 10**9

# unit symbol as written in the header line of an environmental file -> unit label stored per channel
unit_conversions = {"[V]": "V", "[°C]": "C"}

# unit label -> channel type written into the exported documents
channel_types = {"V": "voltage", "C": "temperature", "g": "acceleration"}

# NOTE(review): the original comment here read "ADU is mislabelled as ADU in the
# emsdata.mat", which names the same label twice - confirm which label was meant
# names of the environmental channels that are imported
column_headings = ['AT', 'H', 'TE', 'ADU', 'TSPU1', 'TSPU2', 'TSAU1', 'TSAU2', 'TSAU3', 'TSPK1', 'TSPK2', 'TSAK1', 'TBC1', 'TBC2', 'TSWS1', 'TSWN1', 'TWS1', 'TWC1', 'TWN1', 'TP1', 'TDS1', 'TS1', 'TSWS2', 'TSWN2', 'TWC2', 'TWN2', 'TP2', 'TDT2', 'TDS2', 'TS2', 'TSWS3', 'TSWN3', 'TWS3', 'TWC3', 'TWN3', 'TDT3']

# column position within an environmental file -> channel name; positions that are not listed are skipped
env_column_mappings = {2: 'AT', 4: 'H', 5: 'TE', 6: 'ADU', 8: 'TSPU1', 9: 'TSPU2', 11: 'TSAU1', 12: 'TSAU2', 13: 'TSAU3', 14: 'TSPK1', 15: 'TSPK2', 17: 'TSAK1', 20: 'TBC1', 21: 'TBC2', 22: 'TSWS1', 23: 'TSWN1', 24: 'TWS1', 25: 'TWC1', 26: 'TWN1', 27: 'TP1', 29: 'TDS1', 30: 'TS1', 31: 'TSWS2', 32: 'TSWN2', 34: 'TWC2', 35: 'TWN2', 36: 'TP2', 37: 'TDT2', 38: 'TDS2', 39: 'TS2', 40: 'TSWS3', 41: 'TSWN3', 42: 'TWS3', 43: 'TWC3', 44: 'TWN3', 46: 'TDT3'}

# reference datetime that the week/day/hour offsets taken from a folder name are added to
first_day = datetime.datetime(1997,11,10,14)
# day letter used in a folder name -> offset in days relative to first_day
day_offset = {"A": -2, "B": -1, "C": 0, "D": 1, "E": 2, "F": 3, "G": 4}

# total used as the denominator of the progress percentage
total_number_of_data_dirs = 5653

# number of data folders handled so far in this run
folder_count = 0

# running mean of the import time per folder, and the number of the sample being averaged
average_time = 0
n = 1

# include option to time individual sections of the code
timing = False

# set to a folder name to resume the import from that folder (None = start from the first folder)
current_folder = None

# set to a sub directory name to resume the import from that sub directory (None = start from the first)
current_sub_dir = None

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which would garble non-ASCII credentials)
with open("./Z24-config.json", encoding="utf-8") as file:
	config = json.load(file)
credentials = config["credentials"]
main_dir = config["mainDirectory"]
export_dir = config["exportDirectory"]

# sub directories of main_dir that hold the data folders, in import order
sub_dirs = ["Z24ems1", "Z24ems2", "Z24ems3"]

# include option to resume from a particular sub directory, e.g. if importing crashed partway through
if current_sub_dir is not None:
	index = sub_dirs.index(current_sub_dir)
	sub_dirs = sub_dirs[index:]

# current_folder is cleared as soon as it has been applied below, so remember up
# front whether this run is a resume: a resumed run cannot report overall progress
resuming = current_folder is not None or current_sub_dir is not None

# build the connection string once; user name and password are escaped so that
# special characters cannot break the URI
mongo_uri = "mongodb://{username}:{password}@{host}:{port}/{authdb}".format(
		username=urllib.parse.quote_plus(credentials["username"]), password=urllib.parse.quote_plus(credentials["password"]),
		host=credentials["host"], port=credentials["port"], authdb=credentials["authdb"]
		)

# share one client for the whole import instead of opening a new, never-closed
# connection for every data folder; the client is closed when the block is left
with pymongo.MongoClient(mongo_uri, serverSelectionTimeoutMS = 2000) as client:
	collection = client[credentials["database"]][credentials["collection"]]

	for sub_dir in sub_dirs:
		if timing: start = time.time()
		# find data directories in the current sub directory
		data_dirs = next(os.walk(f"{main_dir}/{sub_dir}"))[1]
		# include option to start from a particular folder, e.g. if importing crashed partway through
		if current_folder is not None:
			index = data_dirs.index(current_folder)
			data_dirs = data_dirs[index:]
			# only the first sub directory of a resumed run is truncated
			current_folder = None
		if timing: print(f"Data folders in current sub-driectory discovered in {round(time.time() - start, 5)} seconds")

		for data_dir in data_dirs:
			folder_count += 1
			overall_start = time.time()
			if timing:
				start = time.time()
				print(f"Currently importing: {data_dir}")
			elif resuming:
				print(f"Currently importing: {sub_dir}/{data_dir} - Average time to import: {round(average_time, 5)} seconds", end="\r")
			else:
				length = 20
				percent = 100 * (folder_count / float(total_number_of_data_dirs))
				filled_length = int(length * folder_count // total_number_of_data_dirs)
				bar = "▮" * filled_length + "-" * (length - filled_length)
				print(f"Currently importing: {sub_dir}/{data_dir} - Progress: |{bar}| {percent:.2f}% - Average time to import: {round(average_time, 5)} seconds", end="\r")

			# find the week, day, hour info for the data directory in question
			week = data_dir[0:2]
			day = data_dir[2]
			hour = data_dir[3:]
			# calculate the day and time that the data was collected on
			days_into_project = datetime.timedelta(hours = int(hour) - 14, days = day_offset[day], weeks = int(week) - 1)
			datetime_from_folder_name = first_day + days_into_project

			# find data files within the current data directory
			data_files = next(os.walk(f"{main_dir}/{sub_dir}/{data_dir}"))[2]
			channel_units_dict = {}

			# find only the .aaa files and exclude the car log as its name is over 11 characters long
			acceleration_files = [file for file in data_files if file.endswith(".aaa") and len(file) == 11]
			acc_channel_values = {}
			# reset per folder so that a folder without acceleration files cannot
			# reuse the timestamps of the previously imported folder
			acc_sample_timestamps = []
			for j, acc_file in enumerate(acceleration_files):
				with open(f"{main_dir}/{sub_dir}/{data_dir}/{acc_file}") as file:
					lines = file.readlines()
				channel_number = acc_file[5:7]
				channel_units_dict[channel_number] = "g"
				# the sample values are read from lines 3 to 65538 (65536 lines)
				acc_channel_values[channel_number] = [float(line) for line in lines[3:65539]]
				time_interval = float(lines[2])
				segment_start_str = lines[65546].replace("Segment #1 Start :","").replace("\n", "")
				segment_start_datetime = check_datetime(datetime.datetime.strptime(segment_start_str, "%a %b %d %H:%M:%S %Y"), datetime_from_folder_name, acc_file)

				if j == 0:
					# assuming that all samples are taken at the same time across the channels,
					# the timestamps are generated from the first file only
					acc_sample_timestamps = [generate_timestamps(time_offset, segment_start_datetime) for time_offset in float_range(0, 65535 * time_interval, time_interval)]

			acc_channel_values_by_timestamp = index_channel_values_by_timestamp(acc_sample_timestamps, acc_channel_values)

			# find the environmental files in the data directory
			environmental_files = [file for file in data_files if file.endswith("env")]
			reformatted_env_sample_timestamps = []
			# start from empty value lists in every folder; the lists are shared by all
			# environmental files of the folder so their values are appended in file order
			env_channel_values = {c_n : [] for c_n in column_headings}
			for k, env_file in enumerate(environmental_files):
				with open(f"{main_dir}/{sub_dir}/{data_dir}/{env_file}", encoding="iso-8859-1") as file:
					for i, line in enumerate(file):
						if i == 0:
							# header line: names and unit symbols alternate, so every second token is a unit
							env_channel_units = [unit_conversions[symbol] for symbol in line.split()[1::2]]
							for j, entry in enumerate(env_channel_units):
								if j in env_column_mappings: channel_units_dict[env_column_mappings[j]] = entry
						elif 11 > i >= 1:
							# lines 1 to 10 hold one scan each
							raw_data = line.split()
							for j, entry in enumerate(raw_data):
								if j in env_column_mappings: env_channel_values[env_column_mappings[j]].append(float(entry))
						elif line.startswith("EnvScan started : "):
							envscan_start_str = line.replace("EnvScan started : ","").replace("\n", "")
							envscan_start_datetime = check_datetime(datetime.datetime.strptime(envscan_start_str, "%a %b %d %H:%M:%S %Y"), datetime_from_folder_name, env_file)
						elif line.startswith(" Acquisition time :"):
							envscan_duration_str = line.replace(" Acquisition time :", "").replace("number of scans : 10", "")
							# the acquisition time is divided evenly between the 10 scans
							envscan_duration_timedelta = datetime.timedelta(seconds = float(envscan_duration_str) / 10)

				env_sample_times = [envscan_start_datetime + envscan_duration_timedelta * scan_index for scan_index in range(10)]
				env_sample_timestamps = [datetime.datetime.timestamp(sample_time) for sample_time in env_sample_times]
				reformatted_env_sample_timestamps += [int(sample_timestamp * timestamp_conversion_factor) for sample_timestamp in env_sample_timestamps]

			env_channel_values_by_timestamp = index_channel_values_by_timestamp(reformatted_env_sample_timestamps, env_channel_values)

			# on an identical timestamp the acceleration entry replaces the environmental one
			all_channel_values_by_timestamp = {**env_channel_values_by_timestamp, **acc_channel_values_by_timestamp}

			if timing: print(f"Channel data read from directory in {round(time.time()-start, 5)} seconds.")

			if timing: start = time.time()

			# build one document per timestamp
			documents = []
			for timestamp, channel_values in all_channel_values_by_timestamp.items():
				channels = []
				for channel_name, channel_value in channel_values.items():
					channel_object = {
									"name": f"channel-{channel_name}",
									"type": channel_types[channel_units_dict[channel_name]],
									"unit": channel_units_dict[channel_name],
									"value": channel_value
								}
					channels.append(channel_object)

				output_json = {
							"version": "1.1.0",
							"name": "z24-measurements",
							"population": "realbridges",
							"timestamp": timestamp,
							"channels": channels
						}

				documents.append(output_json)

			if timing: print(f"json documents generated in {round(time.time()-start, 5)} seconds.")

			if timing: start = time.time()

			# Insert JSON; insert_many rejects an empty list, so skip folders without any samples
			if documents:
				collection.insert_many(documents)
				if timing: print(f"Data successfully imported in {round(time.time()-start, 5)} seconds")
			else:
				print(f"\nNo channel data found in {sub_dir}/{data_dir}; nothing imported")

			if timing: print(f"Data folder imported in {round(time.time()-overall_start, 5)} seconds")

			# incremental update of the mean import time per folder
			average_time = average_time*((n-1)/n) + (time.time() - overall_start)/n
			n += 1