In [2]:
import os
import datetime
import json
import pymongo
import urllib.parse
import numpy as np
import pandas as pd

# multiplier that turns a POSIX timestamp in seconds into integer nanoseconds
timestamp_conversion_factor = 1_000_000_000

# unit symbols as they appear in the environmental file header -> plain unit names
unit_conversions = {
	"[V]": "V",
	"[°C]": "C",
}

# plain unit name -> channel type written to the exported documents
channel_types = dict(V="voltage", C="temperature", g="acceleration")

# environmental channel names, in the order their values are read from a data line
unique_env_channel_names = [
	'WS', 'WD', 'AT', 'R', 'H', 'TE', 'ADU', 'ADK',
	'TSPU1', 'TSPU2', 'TSPU3', 'TSAU1', 'TSAU2', 'TSAU3',
	'TSPK1', 'TSPK2', 'TSPK3', 'TSAK1', 'TSAK2', 'TSAK3',
	'TBC1', 'TBC2',
	'TSWS1', 'TSWN1', 'TWS1', 'TWC1', 'TWN1', 'TP1', 'TDT1', 'TDS1', 'TS1',
	'TSWS2', 'TSWN2', 'TWS2', 'TWC2', 'TWN2', 'TP2', 'TDT2', 'TDS2', 'TS2',
	'TWS3', 'TWN3', 'TWC3', 'TP3', 'TDT3', 'TS3',
]

# reference date/time that week 1, day "C", hour 14 maps onto
first_day = datetime.datetime(year=1997, month=11, day=10, hour=14)
# day letter used in the data directory names -> offset in days within the week
day_offset = dict(zip("ABCDEFG", range(-2, 5)))

# Load the run configuration. JSON text is UTF-8, so the encoding is stated
# explicitly instead of depending on the platform's default text encoding.
with open("./Z24-config.json", encoding="utf-8") as config_file:
	config = json.load(config_file)

# the file handle is only needed for parsing; the lookups happen after it is closed
credentials = config["credentials"]       # MongoDB connection settings
main_dir = config["mainDirectory"]        # root directory of the raw data
export_dir = config["exportDirectory"]

# sub directories of the main directory, written with their leading slash so
# they can be appended to main_dir directly
sub_dirs = [
	"/Z24ems1",
	"/Z24ems2",
	"/Z24ems3",
]

# os.walk yields (dirpath, dirnames, filenames); its first result describes the
# top directory itself, so element 1 lists the data directories it contains
top_level_entry = next(os.walk(f"{main_dir}{sub_dirs[0]}"))
data_dirs = top_level_entry[1]

# a data directory name is <2-character week><1-character day letter><hour>
week, day, hour = data_dirs[0][:2], data_dirs[0][2], data_dirs[0][3:]

# offset of this measurement from first_day; first_day already sits at 14:00
# in week 1, hence the "- 14" on the hour and the "- 1" on the week
days_into_project = datetime.timedelta(
	weeks=int(week) - 1,
	days=day_offset[day],
	hours=int(hour) - 14,
)
new_date = first_day + days_into_project
# find data files within the first data directory in the first sub directory
data_dir_path = f"{main_dir}{sub_dirs[0]}/{data_dirs[0]}"
data_files = next(os.walk(data_dir_path))[2]
channel_units_dict = {}
# find only the .aaa files and exclude the car log as its name is over 11 characters long
acceleration_files = [file for file in data_files if file.endswith(".aaa") and len(file) == 11]
print(acceleration_files)
acc_channel_values = {}
# start with an empty index so the DataFrame below can still be built (empty)
# when no acceleration file is found, instead of failing with a NameError
reformatted_acc_sample_timestamps = []
for j, acc_file in enumerate(acceleration_files):
	with open(f"{data_dir_path}/{acc_file}") as file:
		# characters 5-6 of the file name are used as the channel number
		channel_number = acc_file[5:7]
		channel_units_dict[channel_number] = "g"
		# every file gets a fresh value list, so no membership check is needed below
		channel_samples = acc_channel_values[channel_number] = []
		sample_time_offsets = []
		for i, line in enumerate(file):
			# lines 3..65538 hold the samples (lower the upper bound, e.g. to 10, when debugging)
			if 65539 > i >= 3:
				# NOTE(review): the first sample gets offset 1 * time_interval, not 0 - confirm this is intended
				sample_number = i - 2
				sample_time_offsets.append(datetime.timedelta(seconds=sample_number * time_interval))
				channel_samples.append(float(line))
			elif i == 2:
				# line 2 holds the sampling interval in seconds; it is read before any sample line
				time_interval = float(line)
			elif line.startswith("Segment #1 Start"):
				segment_start_str = line.replace("Segment #1 Start :","").replace("\n", "")
				segment_start_date_time = datetime.datetime.strptime(segment_start_str, "%a %b %d %H:%M:%S %Y")
	if j == 0:
		# assuming that all samples are taken at the same time across the channels,
		# the timestamps of the first file are used as the index for every channel
		acc_sample_times = [segment_start_date_time + time_offset for time_offset in sample_time_offsets]
		acc_sample_timestamps = [sample_time.timestamp() for sample_time in acc_sample_times]
		reformatted_acc_sample_timestamps = [int(sample_timestamp * timestamp_conversion_factor) for sample_timestamp in acc_sample_timestamps]

# one column per acceleration channel, indexed by the integer timestamps
acc_channels_dataframe = pd.DataFrame(data=acc_channel_values, index=reformatted_acc_sample_timestamps)
# print(acc_channels_dataframe)

# find the environmental files in the data directory
environmental_files = [file for file in data_files if file.endswith("env")]
print(environmental_files)
reformatted_env_sample_timestamps = []
# one value list per known channel; created before the loop (it does not depend
# on any file content) so it also exists when no environmental file is found
env_channel_values = {c_n : [] for c_n in unique_env_channel_names}
# header information of the most recently read file; empty if there is none
channel_names = []
env_channel_units = []
for env_file in environmental_files:
	with open(f"{main_dir}{sub_dirs[0]}/{data_dirs[0]}/{env_file}", encoding="iso-8859-1") as file:
		for i, line in enumerate(file):
			if i == 0:
				# the header alternates channel name and unit symbol
				header_fields = line.split()
				channel_names = header_fields[::2]
				env_channel_units = [unit_conversions[symbol] for symbol in header_fields[1::2]]
			elif 11 > i >= 1:
				# lines 1..10 are the ten scans
				raw_data = line.split()
				# two columns are dropped before pairing the entries with unique_env_channel_names
				# NOTE(review): presumably these are duplicated channels - confirm against the file format
				del raw_data[45]
				del raw_data[-6]
				for entry, channel_name in zip(raw_data, unique_env_channel_names):
					env_channel_values[channel_name].append(float(entry))
			elif line.startswith("EnvScan started : "):
				segment_start_str = line.replace("EnvScan started : ","").replace("\n", "")
				segment_start_date_time = datetime.datetime.strptime(segment_start_str, "%a %b %d %H:%M:%S %Y")
			elif line.startswith(" Acquisition time :"):
				segment_duration_str = line.replace(" Acquisition time :", "").replace("number of scans : 10", "")
				# the acquisition time covers all ten scans, so a tenth of it separates two scans
				segment_duration_timedelta = datetime.timedelta(seconds = float(segment_duration_str) / 10)

	# spread the ten scans of this file evenly, starting at the scan start time
	env_sample_times = [segment_start_date_time + segment_duration_timedelta * scan_index for scan_index in range(10)]
	env_sample_timestamps = [sample_time.timestamp() for sample_time in env_sample_times]
	reformatted_env_sample_timestamps.extend(int(sample_timestamp * timestamp_conversion_factor) for sample_timestamp in env_sample_timestamps)

# add units for environmental channels to global dict
for channel_name, units in zip(channel_names, env_channel_units):
	channel_units_dict[channel_name] = units

# one column per environmental channel, indexed by the integer timestamps
env_channels_dataframe = pd.DataFrame(data=env_channel_values, index=reformatted_env_sample_timestamps)

# align acceleration and environmental channels on their timestamps; a channel
# that has no sample at a given timestamp is NaN in that row
all_channels_dataframe = pd.concat([acc_channels_dataframe, env_channels_dataframe], axis=1)
all_channels_values = {**acc_channel_values, **env_channel_values}

# Connect to Server once and reuse the client for every insert; creating a new
# client per row would open (and never close) a connection pool per document
client = pymongo.MongoClient("mongodb://{username}:{password}@{host}:{port}/{authdb}".format(
	username=urllib.parse.quote_plus(credentials["username"]), password=urllib.parse.quote_plus(credentials["password"]),
	host=credentials["host"], port=credentials["port"], authdb=credentials["authdb"]
), serverSelectionTimeoutMS = 2000)
try:
	collection = client[credentials["database"]][credentials["collection"]]

	for index, row in all_channels_dataframe.iterrows():
		channels = []
		for channel_name in all_channels_values:
			value = row[channel_name]
			# skip channels that have no sample at this timestamp
			if np.isnan(value):
				continue
			unit = channel_units_dict[channel_name]
			channels.append({
				"name": f"channel-{channel_name}",
				"type": channel_types[unit],
				"unit": unit,
				# plain Python float, so no NumPy scalar type reaches the BSON encoder
				"value": float(value)
			})

		output_json = {
			"version": "1.1.0",
			"name": "z24-measurements",
			"population": "realbridges",
			"timestamp": int(index),
			"channels": channels
		}

		# with open (f"{export_dir}/Z24-{index}.json", "w") as outfile:
		# 	json.dump(output_json, outfile, indent=4)

		# Insert JSON; the document is passed directly so the imported json
		# module is not overwritten by a variable of the same name
		collection.insert_one(output_json)
finally:
	# release the connection pool even if an insert fails
	client.close()

TypeError: load() missing 1 required positional argument: 'fp'