# Machine Learning 2020 - Homework 1

## Data Transformation

In [86]:
import json
from typing import List, TextIO
from datetime import datetime


def output(f: TextIO, lines: List[bytes]):
    data = [{} for hour in range(24)]
    
    for line in lines:
        columns: List[bytes] = line.strip().split(b",")
            
        column_date: str = columns[0].decode()
        column_metric: str = columns[2].decode()
        column_values: List[str] = [v.decode() for v in columns[3:]]
        assert len(column_values) == 24
        
        for hour, item in enumerate(data):
            item["timestamp"] = int(datetime.strptime(f"{column_date} {hour}", "%Y/%m/%d %H").timestamp())
            
        for hour, column_value in enumerate(column_values):
            try:
                value = float(column_value)
            except ValueError:
                assert column_value == "NR"
                value = None
            data[hour][column_metric] = value
            
    for item in sorted(data, key=lambda x: x["timestamp"]):
        f.write(json.dumps(item, separators=(",", ":")) + "\n")


def transform(train_data_path: str, transformed_data_path: str):
    with open(train_data_path, "rb") as input_file, open(transformed_data_path, "w") as output_file:
        lines = []
        for i, line in enumerate(input_file):
            # Skip header.
            if i == 0:
                continue
            
            if i % 18 != 0:
                lines.append(line)
            else:
                output(output_file, lines)
                lines = []

In [89]:
transform("train.csv", "/tmp/train_transformed.csv")

In [91]:
!cat /tmp/train_transformed.csv | head -n 3

{"timestamp":1388505600,"AMB_TEMP":14.0,"CH4":1.8,"CO":0.51,"NMHC":0.2,"NO":0.9,"NO2":16.0,"NOx":17.0,"O3":16.0,"PM10":56.0,"PM2.5":26.0,"RAINFALL":null,"RH":77.0,"SO2":1.8,"THC":2.0,"WD_HR":37.0,"WIND_DIREC":35.0,"WIND_SPEED":1.4}
{"timestamp":1388509200,"AMB_TEMP":14.0,"CH4":1.8,"CO":0.41,"NMHC":0.15,"NO":0.6,"NO2":9.2,"NOx":9.8,"O3":30.0,"PM10":50.0,"PM2.5":39.0,"RAINFALL":null,"RH":68.0,"SO2":2.0,"THC":2.0,"WD_HR":80.0,"WIND_DIREC":79.0,"WIND_SPEED":1.8}
{"timestamp":1388512800,"AMB_TEMP":14.0,"CH4":1.8,"CO":0.39,"NMHC":0.13,"NO":0.5,"NO2":8.2,"NOx":8.7,"O3":27.0,"PM10":48.0,"PM2.5":36.0,"RAINFALL":null,"RH":67.0,"SO2":1.7,"THC":2.0,"WD_HR":57.0,"WIND_DIREC":2.4,"WIND_SPEED":1.0}
cat: stdout: Broken pipe
