In [8]:
import os
import datetime
import json
import pymongo
import decimal
import urllib.parse
import numpy as np
import pandas as pd

# Switch for the per-stage wall-clock timing printouts.
timing = True

# The time module is only needed when the timing printouts are enabled.
if timing:
	import time

def float_range(start, stop, step):
	"""Yield floats from start (inclusive) towards stop (exclusive) in increments of step.

	The running value is accumulated as a decimal.Decimal and converted to
	float only at the moment it is yielded.

	Args:
		start: First value to yield; int, float or Decimal.
		stop: Exclusive upper bound the running value is compared against.
		step: Increment; any value accepted by decimal.Decimal().

	Yields:
		float: start, start + step, start + 2 * step, ... for as long as the
		running value is smaller than stop.

	Raises:
		ValueError: If step is not positive while start is below stop, because
			the loop could then never terminate.
	"""
	increment = decimal.Decimal(step)
	# Convert start as well: adding a Decimal to a float raises TypeError, so a
	# float start would otherwise fail on the first increment.
	current = decimal.Decimal(start)
	if increment <= 0 and current < stop:
		raise ValueError("step must be positive when start is below stop")
	while current < stop:
		yield float(current)
		current += increment

def generate_timestamps(time_offset, segment_start_date_time):
	"""Return the integer timestamp of a sample taken time_offset seconds after the segment start.

	Args:
		time_offset: Offset of the sample from the segment start, in seconds.
		segment_start_date_time: datetime.datetime at which the segment started.

	Returns:
		int: The POSIX timestamp of the sample multiplied by the module-level
		timestamp_conversion_factor and truncated to an integer.
	"""
	moment = segment_start_date_time + datetime.timedelta(seconds = time_offset)
	epoch_seconds = datetime.datetime.timestamp(moment)
	scaled = epoch_seconds * timestamp_conversion_factor
	return int(scaled)

# Factor applied to POSIX timestamps in seconds before truncating them to
# integers (10**9, i.e. seconds scaled to nanoseconds).
timestamp_conversion_factor = 1_000_000_000

# Unit symbols as written in the environmental file headers -> unit label
# stored with each channel.
unit_conversions = {
	"[V]": "V",
	"[°C]": "C",
}

# Unit label -> channel type name written into the output documents.
channel_types = {
	"V": "voltage",
	"C": "temperature",
	"g": "acceleration",
}

# Environmental channel names, in the order the values of a data line are
# assigned to them.
unique_env_channel_names = [
	'WS', 'WD', 'AT', 'R', 'H', 'TE', 'ADU', 'ADK',
	'TSPU1', 'TSPU2', 'TSPU3', 'TSAU1', 'TSAU2', 'TSAU3',
	'TSPK1', 'TSPK2', 'TSPK3', 'TSAK1', 'TSAK2', 'TSAK3',
	'TBC1', 'TBC2',
	'TSWS1', 'TSWN1', 'TWS1', 'TWC1', 'TWN1', 'TP1', 'TDT1', 'TDS1', 'TS1',
	'TSWS2', 'TSWN2', 'TWS2', 'TWC2', 'TWN2', 'TP2', 'TDT2', 'TDS2', 'TS2',
	'TWS3', 'TWN3', 'TWC3', 'TP3', 'TDT3', 'TS3',
]

# Reference date and time to which the week, day-letter and hour offsets taken
# from a data directory name are added.
first_day = datetime.datetime(year=1997, month=11, day=10, hour=14)
# Day letter used in data directory names -> offset in days.
day_offset = dict(zip("ABCDEFG", range(-2, 5)))

# Load the run configuration. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("./Z24-config.json", encoding="utf-8") as config_file:
	config = json.load(config_file)

# The file handle is no longer needed once the JSON has been parsed.
# credentials: settings used to build the MongoDB connection and to select the
# target database and collection.
credentials = config["credentials"]
# main_dir: root directory that contains the sub directories listed below.
main_dir = config["mainDirectory"]
# export_dir: read from the configuration; not referenced in the visible code.
export_dir = config["exportDirectory"]

# Sub directories of main_dir that are scanned for data directories.
sub_dirs = ["/Z24ems1", "/Z24ems2", "/Z24ems3"]

# Assemble the MongoDB connection string once; user name and password are
# percent-encoded so that reserved characters cannot corrupt the URI.
mongo_uri = "mongodb://{username}:{password}@{host}:{port}/{authdb}".format(
	username=urllib.parse.quote_plus(credentials["username"]),
	password=urllib.parse.quote_plus(credentials["password"]),
	host=credentials["host"], port=credentials["port"], authdb=credentials["authdb"]
)

# A single client serves every data directory. The with-statement closes it
# when the run ends, including when an exception or KeyboardInterrupt aborts it.
with pymongo.MongoClient(mongo_uri, serverSelectionTimeoutMS = 2000) as client:
	collection = client[credentials["database"]][credentials["collection"]]

	for sub_dir in sub_dirs:
		# immediate child directories of this sub directory
		data_dirs = next(os.walk(f"{main_dir}{sub_dir}"))[1]

		for data_dir in data_dirs:
			data_path = f"{main_dir}{sub_dir}/{data_dir}"
			# the directory name is split into week (2 characters), day letter
			# (1 character) and hour (remaining characters)
			week = data_dir[0:2]
			day = data_dir[2]
			hour = data_dir[3:]
			# date and time derived from the directory name; the result is not used
			# further in this cell, but the computation fails early on a directory
			# name that does not follow the scheme
			days_into_project = datetime.timedelta(hours = int(hour) - 14, days = day_offset[day], weeks = int(week) - 1)
			new_date = first_day + days_into_project
			# plain files directly inside the data directory
			data_files = next(os.walk(data_path))[2]
			channel_units_dict = {}
			if timing: start = time.time()

			# find only the .aaa files and exclude the car log as its name is over 11 characters long
			acceleration_files = [file for file in data_files if file.endswith(".aaa") and len(file) == 11]
			print(acceleration_files)
			acc_channel_values = {}
			# reset per directory so timestamps of a previous directory can never be reused
			acc_sample_timestamps = []
			for j, acc_file in enumerate(acceleration_files):
				with open(f"{data_path}/{acc_file}") as file:
					lines = file.readlines()
				# characters 5-6 of the file name identify the channel
				channel_number = acc_file[5:7]
				channel_units_dict[channel_number] = "g"
				# sample values are read from lines 3 .. 65538 of the file
				acc_channel_values[channel_number] = [float(line) for line in lines[3:65539]]
				# line 2 holds the sampling interval, line 65546 the segment start time
				time_interval = float(lines[2])
				segment_start_str = lines[65546].replace("Segment #1 Start :","").replace("\n", "")
				segment_start_date_time = datetime.datetime.strptime(segment_start_str, "%a %b %d %H:%M:%S %Y")
				if j == 0:
					# assuming that all channels are sampled at the same instants, the
					# timestamps are generated once from the first file.
					# Exactly one timestamp per sample is produced: the count comes from
					# the data, not from where a floating point comparison happens to stop.
					sample_count = len(acc_channel_values[channel_number])
					acc_sample_timestamps = [
						generate_timestamps(time_offset, segment_start_date_time)
						for _, time_offset in zip(range(sample_count), float_range(0, float("inf"), time_interval))
					]

			acc_channels_dataframe = pd.DataFrame(data=acc_channel_values, index=acc_sample_timestamps)

			# find the environmental files in the data directory
			environmental_files = [file for file in data_files if file.endswith("env")]
			print(environmental_files)
			# reset per directory so values of a previous directory can never be reused
			reformatted_env_sample_timestamps = []
			env_channel_values = {c_n : [] for c_n in unique_env_channel_names}
			channel_names = []
			env_channel_units = []
			for k, env_file in enumerate(environmental_files):
				with open(f"{data_path}/{env_file}", encoding="iso-8859-1") as file:
					for i, line in enumerate(file):
						if i == 0:
							# header line: alternating channel name / unit symbol tokens
							header_tokens = line.split()
							channel_names = header_tokens[::2]
							env_channel_units = [unit_conversions[symbol] for symbol in header_tokens[1::2]]
						elif 1 <= i < 11:
							# lines 1-10 carry one scan each
							raw_data = line.split()
							# two entries are dropped so that the remaining ones line up with
							# unique_env_channel_names
							# NOTE(review): presumably repeated channels - confirm against the file format
							del raw_data[45]
							del raw_data[-6]
							for entry, channel_name in zip(raw_data, unique_env_channel_names):
								env_channel_values[channel_name].append(float(entry))
						elif line.startswith("EnvScan started : "):
							segment_start_str = line.replace("EnvScan started : ","").replace("\n", "")
							segment_start_date_time = datetime.datetime.strptime(segment_start_str, "%a %b %d %H:%M:%S %Y")
						elif line.startswith(" Acquisition time :"):
							segment_duration_str = line.replace(" Acquisition time :", "").replace("number of scans : 10", "")
							# the acquisition time is divided evenly over the 10 scans
							segment_duration_timedelta = datetime.timedelta(seconds = float(segment_duration_str) / 10)

				# one timestamp per scan, evenly spaced from the scan start
				env_sample_times = [segment_start_date_time + segment_duration_timedelta * scan_index for scan_index in range(10)]
				env_sample_timestamps = [datetime.datetime.timestamp(sample_time) for sample_time in env_sample_times]
				reformatted_env_sample_timestamps.extend(
					int(sample_timestamp * timestamp_conversion_factor) for sample_timestamp in env_sample_timestamps
				)
			print("Channel data read from file")
			if timing:
				end = time.time()
				print(f"in {end-start} seconds")

			# add units for environmental channels to the per-directory units dict
			for channel_name, units in zip(channel_names, env_channel_units):
				channel_units_dict[channel_name] = units

			if timing: start = time.time()
			env_channels_dataframe = pd.DataFrame(data=env_channel_values, index=reformatted_env_sample_timestamps)

			# column-wise concatenation: rows are aligned on the timestamp index, a
			# channel without a sample at a given timestamp holds NaN there
			all_channels_dataframe = pd.concat([acc_channels_dataframe, env_channels_dataframe], axis=1)
			all_channels_values = {**acc_channel_values, **env_channel_values}

			print("Dataframe created")
			if timing:
				end = time.time()
				print(f"in {end-start} seconds")

			if timing: start = time.time()
			# one document per timestamp, listing every channel that has a value there
			documents = []
			for index, row in all_channels_dataframe.iterrows():
				channels = []
				for channel_name in all_channels_values:
					if not np.isnan(row[channel_name]):
						channel_object = {
										"name": f"channel-{channel_name}",
										"type": channel_types[channel_units_dict[channel_name]],
										"unit": channel_units_dict[channel_name],
										"value": row[channel_name]
									}
						channels.append(channel_object)

				output_json = {
							"version": "1.1.0",
							"name": "z24-measurements",
							"population": "realbridges",
							"timestamp": index,
							"channels": channels
						}

				documents.append(output_json)

			print("json documents generated")
			if timing:
				end = time.time()
				print(f"in {end-start} seconds")
			if timing: start = time.time()
			# insert_many rejects an empty list, so a directory without data is skipped
			if documents:
				collection.insert_many(documents)
				print("Data successfully imported")
			else:
				print("No documents generated, nothing imported")
			if timing:
				end = time.time()
				print(f"in {end-start} seconds")

['01C1403.aaa', '01C1405.aaa', '01C1406.aaa', '01C1407.aaa', '01C1410.aaa', '01C1412.aaa', '01C1414.aaa', '01C1416.aaa']
['01C14POS.env', '01C14PRE.env']
Channel data read from file
in 0.5644640922546387 seconds
Dataframe created
in 0.025961875915527344 seconds
json documents generated
in 16.291426181793213 seconds
Data successfully imported
in 3.2897520065307617 seconds
['01C1503.aaa', '01C1505.aaa', '01C1506.aaa', '01C1507.aaa', '01C1510.aaa', '01C1512.aaa', '01C1514.aaa', '01C1516.aaa']
['01C15POS.env', '01C15PRE.env']
Channel data read from file
in 0.5217165946960449 seconds
Dataframe created
in 0.020916223526000977 seconds


KeyboardInterrupt: 