# Machine Learning 2020 - Homework 1

## Data Transformation

In [23]:
from typing import List, TextIO
from datetime import datetime


available_metrics = [
    'timestamp', 'AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10', 'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC', 'WIND_SPEED'
]


def _append_transformed_data(f: TextIO, lines: List[bytes], with_headers: bool):
    data = [{} for hour in range(24)]
    
    for line in lines:
        columns: List[bytes] = line.strip().split(b",")
            
        column_date: str = columns[0].decode()
        column_metric: str = columns[2].decode()
        column_values: List[str] = [v.decode() for v in columns[3:]]
        assert len(column_values) == 24
        
        for hour, item in enumerate(data):
            item["timestamp"] = int(datetime.strptime(f"{column_date} {hour}", "%Y/%m/%d %H").timestamp())
            
        for hour, column_value in enumerate(column_values):
            try:
                value = float(column_value)
            except ValueError:
                assert column_value == "NR"
                value = 0
            data[hour][column_metric] = value
    
    metrics = list(data[0].keys())
    if with_headers:
        f.write(",".join(metrics) + "\n")
    for item in sorted(data, key=lambda x: x["timestamp"]):
        features = [str(item[metric]) for metric in metrics]
        f.write(",".join(features) + "\n")


def transform(*, input_path: str, output_path: str):
    with open(input_path, "rb") as input_file, open(output_path, "w") as output_file:
        lines = []
        for i, line in enumerate(input_file):
            # Skip header.
            if i == 0:
                continue
            
            if i % 18 != 0:
                lines.append(line)
            else:
                _append_transformed_data(output_file, lines, i == 18)
                lines = []

In [24]:
transform(input_path="train.csv", output_path="/tmp/train_transformed.csv")

In [25]:
!cat /tmp/train_transformed.csv | head -n 3

timestamp,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED
1388505600,14.0,1.8,0.51,0.2,0.9,16.0,17.0,16.0,56.0,26.0,0,77.0,1.8,2.0,37.0,35.0,1.4
1388509200,14.0,1.8,0.41,0.15,0.6,9.2,9.8,30.0,50.0,39.0,0,68.0,2.0,2.0,80.0,79.0,1.8
cat: stdout: Broken pipe


## Processing

In [26]:
import numpy as np


named_features = np.genfromtxt("/tmp/train_transformed.csv", delimiter=",", names=True)
named_features

array([(1.3885056e+09, 14., 1.8, 0.51, 0.2 , 0.9, 16. , 17. , 16., 56., 26., 0., 77., 1.8, 2. ,  37.,  35. , 1.4),
       (1.3885092e+09, 14., 1.8, 0.41, 0.15, 0.6,  9.2,  9.8, 30., 50., 39., 0., 68., 2. , 2. ,  80.,  79. , 1.8),
       (1.3885128e+09, 14., 1.8, 0.39, 0.13, 0.5,  8.2,  8.7, 27., 48., 36., 0., 67., 1.7, 2. ,  57.,   2.4, 1. ),
       ...,
       (1.4190804e+09, 13., 1.8, 0.51, 0.16, 1.5, 13. , 15. , 13., 50., 17., 0., 82., 2.3, 1.9, 114., 118. , 1.5),
       (1.4190840e+09, 13., 1.8, 0.57, 0.19, 1.1, 13. , 14. , 13., 32., 24., 0., 84., 2.3, 2. , 108., 100. , 2. ),
       (1.4190876e+09, 13., 1.8, 0.56, 0.19, 1.3, 14. , 15. , 13., 22., 29., 0., 84., 2.3, 2. , 109., 105. , 2. )],
      dtype=[('timestamp', '<f8'), ('AMB_TEMP', '<f8'), ('CH4', '<f8'), ('CO', '<f8'), ('NMHC', '<f8'), ('NO', '<f8'), ('NO2', '<f8'), ('NOx', '<f8'), ('O3', '<f8'), ('PM10', '<f8'), ('PM25', '<f8'), ('RAINFALL', '<f8'), ('RH', '<f8'), ('SO2', '<f8'), ('THC', '<f8'), ('WD_HR', '<f8'), ('WIND_DIRE

In [27]:
features = named_features.view((np.float64, len(named_features.dtype.names)))
features.shape

(5760, 18)

In [28]:
offsets = 0

cont_features = features
for offset in range(1, offsets + 1):
    cont_features = np.concatenate(
        (
            cont_features,
            np.concatenate(
                (
                    features[offset:],
                    np.array([[np.nan] * len(named_features.dtype.names)] * offset),
                ),
                axis=0,
            ),
        ),
        axis=1
    )
    
cont_features = cont_features[:-(offsets + 1)]
labels = named_features["PM25"][:,np.newaxis][offsets + 1:]
print(cont_features.shape)
print(labels.shape)

(5759, 18)
(5759, 1)
