# Data Reorganization

## Training Data Transformation 1

In [129]:
from datetime import datetime
from typing import List


def _partial_transpose(output_path: str, lines: List[bytes], is_initial_turn: bool) -> None:
    """Transpose one 18-line group of the raw CSV into 24 hourly rows.

    Each input line has the form ``date,<ignored>,metric,v0,...,v23``: one
    metric with its 24 hourly values. The output has one row per hour, with
    one column per metric plus a ``timestamp`` column, columns sorted by name.

    Args:
        output_path: CSV file to write to.
        lines: exactly 18 raw byte lines (one per metric).
        is_initial_turn: when True the file is (re)created and a header row is
            written first; otherwise the rows are appended.

    Raises:
        AssertionError: if ``lines`` does not hold 18 entries or a line does
            not carry 24 hourly values.
        ValueError: if a line has too few comma-separated parts or its date
            does not match ``%Y/%m/%d``.

    All lines are parsed before the output file is opened, so a malformed
    group can no longer truncate or partially extend existing output.
    """
    assert len(lines) == 18
    rows: List[dict] = [{} for _ in range(24)]
    stamped_date = None  # date the rows' timestamps were last computed for

    for line in lines:
        raw_date, _, raw_metric, raw_values = line.strip().split(b",", 3)

        date: str = raw_date.decode()
        metric: str = raw_metric.decode()
        values: List[str] = [raw_value.decode() for raw_value in raw_values.split(b",")]
        assert len(values) == 24

        # The timestamps only depend on the date, so recompute them only when
        # the date differs from the previous line instead of once per metric.
        if date != stamped_date:
            for hour, row in enumerate(rows):
                # NOTE: strptime yields a naive datetime, so .timestamp()
                # interprets it in the machine's local time zone.
                moment = datetime.strptime(f"{date} {hour}", "%Y/%m/%d %H")
                row["timestamp"] = str(int(moment.timestamp()))
            stamped_date = date

        for row, value in zip(rows, values):
            # "NR" marks a missing reading in the raw file; store it as zero.
            row[metric] = "0" if value == "NR" else value

    columns: List[str] = sorted(rows[0].keys())

    with open(output_path, "w" if is_initial_turn else "a") as f:
        if is_initial_turn:
            f.write(",".join(columns) + "\n")
        f.writelines(",".join(row[column] for column in columns) + "\n" for row in rows)


def reorganize_training_data(input_path: str, output_path: str) -> None:
    """Rewrite the raw training CSV as one row per hour.

    The first line of ``input_path`` is treated as a header and skipped. The
    remaining lines are consumed in groups of 18 and each group is handed to
    ``_partial_transpose``, which creates ``output_path`` for the first group
    and appends to it for the following ones.

    Args:
        input_path: raw CSV file, read as bytes.
        output_path: destination CSV file.

    A trailing group with fewer than 18 lines cannot be transposed; it is
    still left out of the output, but a warning now reports it instead of the
    data disappearing silently.
    """
    import warnings

    group_size = 18  # must match the group length _partial_transpose asserts

    with open(input_path, "rb") as f:
        next(f, None)  # Skip the header line

        pending: List[bytes] = []
        is_initial_turn = True

        for line in f:
            pending.append(line)

            if len(pending) == group_size:
                _partial_transpose(output_path, pending, is_initial_turn)
                is_initial_turn = False
                pending = []

    leftover = [line for line in pending if line.strip()]
    if leftover:
        warnings.warn(
            f"{input_path}: ignored {len(leftover)} trailing line(s) that do not "
            f"form a complete group of {group_size}"
        )


In [130]:
# Transpose ./train.csv into one row per hour and write it to ./train-transformed1.csv.
reorganize_training_data("./train.csv", "./train-transformed1.csv")

In [131]:
!cat ./train-transformed1.csv | head -n 5

AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR,timestamp
14,1.8,0.51,0.2,0.9,16,17,16,56,26,0,77,1.8,2,37,35,1.4,0.5,1388505600
14,1.8,0.41,0.15,0.6,9.2,9.8,30,50,39,0,68,2,2,80,79,1.8,0.9,1388509200
14,1.8,0.39,0.13,0.5,8.2,8.7,27,48,36,0,67,1.7,2,57,2.4,1,0.6,1388512800
13,1.8,0.37,0.12,1.7,6.9,8.6,23,35,35,0,74,1.6,1.9,76,55,0.6,0.3,1388516400
cat: stdout: Broken pipe


## Training Data Transformation 2

In [132]:
import numpy as np

In [133]:
# names=True makes genfromtxt read the column names from the header row and
# return a structured array. The header column "PM2.5" shows up as the field
# "PM25" in the resulting dtype, so later code must use the name without the dot.
original_training_data = np.genfromtxt("./train-transformed1.csv", delimiter=",", names=True)
original_training_data

array([(14., 1.8, 0.51, 0.2 , 0.9, 16. , 17. , 16., 56., 26., 0., 77., 1.8, 2. ,  37.,  35. , 1.4, 0.5, 1.3885056e+09),
       (14., 1.8, 0.41, 0.15, 0.6,  9.2,  9.8, 30., 50., 39., 0., 68., 2. , 2. ,  80.,  79. , 1.8, 0.9, 1.3885092e+09),
       (14., 1.8, 0.39, 0.13, 0.5,  8.2,  8.7, 27., 48., 36., 0., 67., 1.7, 2. ,  57.,   2.4, 1. , 0.6, 1.3885128e+09),
       ...,
       (13., 1.8, 0.51, 0.16, 1.5, 13. , 15. , 13., 50., 17., 0., 82., 2.3, 1.9, 114., 118. , 1.5, 1.6, 1.4190804e+09),
       (13., 1.8, 0.57, 0.19, 1.1, 13. , 14. , 13., 32., 24., 0., 84., 2.3, 2. , 108., 100. , 2. , 1.8, 1.4190840e+09),
       (13., 1.8, 0.56, 0.19, 1.3, 14. , 15. , 13., 22., 29., 0., 84., 2.3, 2. , 109., 105. , 2. , 2. , 1.4190876e+09)],
      dtype=[('AMB_TEMP', '<f8'), ('CH4', '<f8'), ('CO', '<f8'), ('NMHC', '<f8'), ('NO', '<f8'), ('NO2', '<f8'), ('NOx', '<f8'), ('O3', '<f8'), ('PM10', '<f8'), ('PM25', '<f8'), ('RAINFALL', '<f8'), ('RH', '<f8'), ('SO2', '<f8'), ('THC', '<f8'), ('WD_HR', '<f8'), ('W

In [103]:
# The structured array is 1-D: one element per data row of the transformed CSV.
original_training_data.shape

(5760,)

In [188]:
# Show every column name available in the loaded array, for reference.
print(",".join(original_training_data.dtype.names))

# Columns used as model inputs. "timestamp" is deliberately left out.
fields = [
    "AMB_TEMP", "CH4", "CO", "NMHC", "NO", "NO2",
    "NOx", "O3", "PM10", "PM25", "RAINFALL", "RH",
    "SO2", "THC", "WD_HR", "WIND_DIREC", "WIND_SPEED", "WS_HR",
]

AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM25,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR,timestamp


In [256]:
def to_data(original_data, fields):
    """Collect the named fields of ``original_data`` into a plain 2-D array.

    Args:
        original_data: structured array (or any mapping from field name to a
            1-D array) holding one value per row for every field.
        fields: non-empty sequence of field names; their order becomes the
            column order of the result.

    Returns:
        Array of shape ``(rows, len(fields))`` with one column per field.

    Raises:
        IndexError: if ``fields`` is empty.
    """
    if len(fields) == 0:
        raise IndexError("fields must name at least one column")

    # Stack all columns in one call instead of re-copying the growing matrix
    # once per field with repeated np.concatenate.
    return np.stack([original_data[field] for field in fields], axis=1)

training_data = to_data(original_training_data, fields)

# Per-feature mean and standard deviation of the training set, used to
# standardise the features below.
norm_mean = np.mean(training_data, axis=0)
norm_std = np.std(training_data, axis=0)
# A constant feature has zero standard deviation; dividing by it would turn the
# whole column into NaN. Scale such columns by 1 so they become all zeros.
norm_std = np.where(norm_std == 0, 1.0, norm_std)

training_data = (training_data - norm_mean) / norm_std
training_data.shape

(5760, 18)

In [271]:
def to_X(hours, data, shrink: bool = True):
    """Build a design matrix of sliding windows over the rows of ``data``.

    Every output row is a leading 1 (bias term) followed by ``hours``
    consecutive rows of ``data`` laid side by side, earliest row first.

    Args:
        hours: window length in rows; must be at least 1.
        data: 2-D array with one row per time step.
        shrink: when True the final window is dropped; when False every
            window is kept.

    Returns:
        2-D array with ``1 + hours * data.shape[1]`` columns.
    """
    assert 1 <= hours

    # Number of complete windows that fit; never negative.
    window_count = max(data.shape[0] - hours + 1, 0)
    windows = np.concatenate(
        [data[offset:offset + window_count] for offset in range(hours)],
        axis=1,
    )

    if shrink:
        windows = windows[:-1]

    bias = np.ones((windows.shape[0], 1))
    return np.concatenate((bias, windows), axis=1)
    
# Number of consecutive hourly rows that make up one input window.
hours = 5
X = to_X(hours, training_data)

# Target for each window: the PM25 value of the row right after the window,
# standardised with its own mean and standard deviation.
y = original_training_data["PM25"][hours:]
y_mean, y_std = np.mean(y, axis=0), np.std(y, axis=0)
y = (y - y_mean) / y_std

# n samples, k features (the leading bias column of X is not counted in k).
n = X.shape[0]
k = X.shape[1] - 1

In [259]:
# Sanity check: X and y must have the same number of rows.
print(f"X = {X.shape}")
print(f"y = {y.shape}")
print(f"(n, k) = {(n, k)}")

X = (5755, 91)
y = (5755,)
(n, k) = (5755, 90)


# Training

In [269]:
# Plain batch gradient descent on the squared error, starting from all-zero weights.
bound = 10000                  # number of gradient steps
learning_rate = pow(0.1, 5.9)  # fixed step size

w_init = np.zeros(k + 1)

w = w_init
for b in range(bound):
    residual = y - np.dot(X, w)
    gradient = np.dot(X.T, -2 * residual)
    w = w - learning_rate * gradient

    # Report the loss for the last few iterations only.
    if bound - b < 10:
        squared_error = np.power(y - np.dot(w, X.T), 2)
        # Square root of the summed squared error (not divided by the sample count).
        loss = np.power(np.sum(squared_error), 0.5)
        print(loss)

print(w)

26.908395013537785
26.90839387101642
26.908392728732643
26.908391586686335
26.908390444877387
26.908389303305686
26.908388161971114
26.90838702087356
26.908385880012908
[ 4.78986410e-04 -2.00409483e-02 -5.25649509e-03  2.67039657e-03
  1.61605455e-02 -4.75526381e-05 -3.67574463e-02  1.81290002e-02
 -1.61448857e-02  6.56629616e-03 -5.69949620e-02 -6.10500806e-03
 -3.14106009e-02 -3.96711045e-03 -5.61489569e-03  2.45217705e-03
  3.92320863e-03 -5.16764080e-03 -6.47697968e-03  2.23977919e-03
  3.04246122e-03 -9.22599105e-04 -1.94274515e-02 -1.35821100e-02
  1.40568038e-03  6.01078276e-03 -2.68291485e-02 -2.75904301e-02
  3.71625226e-01  6.47698290e-03  3.85910704e-02  5.62277493e-03
  1.43394944e-02  1.08453790e-02  1.61016332e-03 -7.97043186e-03
  1.86265208e-02 -4.11108802e-03 -1.16704155e-02 -8.17733166e-03
 -1.62026470e-03  6.60860394e-03 -2.19083379e-02 -4.71226532e-03
 -3.04594658e-02  2.88916252e-02 -4.44104038e-01  1.81868684e-03
 -6.97173251e-02 -1.06538157e-02  1.00497004e-02 -1

# Testing Data Transformation

In [193]:
!cat ./test.csv | head -n 21

id_0,AMB_TEMP,21,21,20,20,19,19,19,18,17
id_0,CH4,1.7,1.7,1.7,1.7,1.7,1.7,1.7,1.7,1.8
id_0,CO,0.39,0.36,0.36,0.4,0.53,0.55,0.34,0.31,0.23
id_0,NMHC,0.16,0.24,0.22,0.27,0.27,0.26,0.27,0.29,0.1
id_0,NO,1.3,1.3,1.3,1.3,1.4,1.6,1.2,1.1,0.9
id_0,NO2,17,14,13,14,18,21,8.9,9.4,5
id_0,NOx,18,16,14,15,20,23,10,10,5.8
id_0,O3,32,31,31,26,16,12,27,20,26
id_0,PM10,62,50,44,39,38,32,48,36,25
id_0,PM2.5,33,39,39,25,18,18,17,9,4
id_0,RAINFALL,NR,NR,NR,NR,NR,NR,NR,NR,NR
id_0,RH,83,85,87,87,86,85,78,81,80
id_0,SO2,2,1.8,1.8,1.8,2.1,2.6,2,2.3,2.4
id_0,THC,1.8,1.9,1.9,2,2,2,2,2,1.9
id_0,WD_HR,58,53,67,59,59,73,79,82,104
id_0,WIND_DIREC,57,44,73,44,56,115,45,107,103
id_0,WIND_SPEED,1.4,1.3,1.5,1.4,1.6,1.6,1.2,1.8,2.3
id_0,WS_HR,1,0.9,0.9,0.9,1.2,0.7,1,0.6,1.8
id_1,AMB_TEMP,14,13,13,13,13,13,13,12,13
id_1,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.7,1.7,1.8
id_1,CO,0.33,0.33,0.33,0.35,0.34,0.33,0.32,0.34,0.61
cat: stdout: Broken pipe


In [284]:
from numpy.lib import recfunctions as np_rfn


def predict(data):
    """Predict a non-negative value from one test sample.

    ``data`` is a structured array holding the columns listed in ``fields``.
    The features are standardised with the training statistics
    (``norm_mean`` / ``norm_std``), the last ``hours``-row window is fed to the
    trained weights ``w``, and the result is mapped back to the original scale
    with ``y_std`` / ``y_mean`` and clipped at zero.
    """
    features = (to_data(data, fields) - norm_mean) / norm_std
    # shrink=False keeps every window; only the most recent one is used.
    latest_window = to_X(hours, features, False)[-1]
    standardized = np.dot(latest_window, w)
    return max(standardized * y_std + y_mean, 0)


# Read test.csv line by line. Consecutive lines sharing an id are the metrics of
# one sample; they are merged into a single structured array (one field per
# metric) and the sample is predicted as soon as the id changes.
with open("./test.csv", "r") as f, open("./prediction.csv", "w") as out:
    current_test_id = ""
    data = None

    out.write("id,value\n")

    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no sample data and would break the split below.
            continue
        test_id, field, raw_values = line.split(",", 2)

        # Drop the dot so "PM2.5" matches the "PM25" name used in `fields`.
        field = field.replace(".", "")
        dt = np.dtype([(field, np.float64)])
        # "NR" marks a missing reading; treat it as zero.
        values = [(float(v) if v != "NR" else 0) for v in raw_values.split(",")]

        if current_test_id != test_id:
            # A new id starts: flush the sample collected so far, if any.
            if data is not None:
                p = predict(data)
                out.write(f"{current_test_id},{p}\n")

            current_test_id = test_id
            data = np.array(values, dtype=dt)
        else:
            data = np_rfn.merge_arrays([data, np.array(values, dtype=dt)], flatten=True)

    # Flush the last sample; skipped when the file held no samples at all,
    # which previously crashed inside predict(None).
    if data is not None:
        p = predict(data)
        out.write(f"{current_test_id},{p}\n")
        

In [285]:
!cat ./prediction.csv | tail -n 6

id_234,24.731843979880026
id_235,41.25969992988027
id_236,68.50777451687931
id_237,42.2697038729074
id_238,12.862995317603668
id_239,16.859228759444928
