In [1]:
import os
import math
import csv
import sys
import datetime
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
feature_num = 18
feature_list = ['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx',
                'O3','PM10', 'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC',
                'WD_HR', 'WIND_DIREC', 'WIND_SPEED', 'WS_HR']
train_data_path = './train.csv'
test_data_path = './test.csv'
output_answer_path = './answer.csv'

class Data(object):
    
    def __init__(self, location, date, feature_dict):
        
        self.location = location
        self.date = date
        self.feature_dict = feature_dict
        self.str2num()
        self.PM2point5 = self.feature_dict['PM2.5']
        
    def str2num(self):   
        for feature in feature_list:
            if feature == 'RAINFALL':
                temp = self.feature_dict[feature]
                for i in range(len(temp)):
                    if self.feature_dict[feature][i] == 'NR':
                        self.feature_dict[feature][i] = 1.
                    else:
                        self.feature_dict[feature][i] = 1.01
                self.feature_dict[feature] = np.array(self.feature_dict[feature])
                continue
            temp = self.feature_dict[feature]
            for i in range(len(temp)):
                self.feature_dict[feature][i] = float(temp[i])
            self.feature_dict[feature] = np.array(self.feature_dict[feature])
            
    def feature2matrix(self):   
        x = np.ndarray(shape=(18, 24), dtype=float)
        cntr = 0
        for feature in feature_list:
            x[cntr] = self.feature_dict[feature]
            cntr += 1
            
        return np.asmatrix(x)
    
    def test_feature2matrix(self):      
        x = np.ndarray(shape=(18, 9), dtype=float)
        cntr = 0
        for feature in feature_list:
            x[cntr] = self.feature_dict[feature]
            cntr += 1
            
        return np.asmatrix(x)
    
    def get_matrix(self):
        self.x = self.feature2matrix()
        return self.x
    
    def test_get_matrix(self):
        self.x = self.test_feature2matrix()
        return self.x
    
    def get_PM2point5(self):
        return self.PM2point5

In [3]:
def PM2point5_dataset(train_data_path):
    train_data, feature_mean_dict, feature_std_dict = feature_scaling(read_train_data(train_data_path))
    x_matrix = None
    y_matrix = None
    cntr = 0
    for data in train_data:
        if cntr == 0:
            x_matrix = data.get_matrix()
            y_matrix = data.get_PM2point5()
            cntr += 1
            continue
        x_temp = data.get_matrix()
        y_temp = data.get_PM2point5()
        x_matrix = np.hstack((x_matrix, x_temp))
        y_matrix = np.hstack((y_matrix, y_temp))
        cntr += 1
        
    return x_matrix, y_matrix, feature_mean_dict, feature_std_dict

def read_train_data(train_data_path):
    train_data = []
    feature_value = []
    line_count = 0
    
    with open(train_data_path, encoding='big5') as train_data_csv:
        next(train_data_csv)
        reader = csv.reader(train_data_csv, delimiter=',')
        for line in reader:
            line_count += 1
            time = line[0].split('/')
            date = datetime.datetime(int(time[0]), int(time[1]), int(time[2]))
            location = line[1].strip()
            value = []
            for ele in line[3:]:
                value.append(ele)     
            feature_value.append(value)
            if line_count % 18 == 0:
                feature_dict = dict(zip(feature_list, feature_value))
                train_data.append(Data(location, date, feature_dict))
                feature_value = []
                
    return train_data

def read_test_data(test_data_path, feature_mean_dict, feature_std_dict):
    test_data = []
    feature_value = []
    line_count = 0
    with open(test_data_path, encoding='big5') as test_data_csv:
        reader = csv.reader(test_data_csv, delimiter=',')
        for line in reader:
            line_count += 1
            value = []
            for ele in line[2:]:
                value.append(ele)     
            feature_value.append(value)
            if line_count % 18 == 0:
                feature_dict = dict(zip(feature_list, feature_value))
                test_data.append(Data('', '0-0-0', feature_dict))
                feature_value = [] 
                
    for data in test_data:
        for feature in feature_list:
            data.feature_dict[feature] = (data.feature_dict[feature] - feature_mean_dict[feature]) / feature_std_dict[feature]
                
    return test_data

def feature_scaling(train_data):  
    feature_value = [np.array([])]*24
    feature_dict = dict(zip(feature_list, feature_value))
    feature_mean = []*24
    feature_mean_dict = dict(zip(feature_list, feature_value))
    feature_std = []*24
    feature_std_dict = dict(zip(feature_list, feature_value)) 
    
    for data in train_data:    
        for feature in feature_list:
            feature_dict[feature] = np.concatenate((feature_dict[feature], data.feature_dict[feature]))   
    for feature in feature_list:    
        if feature == 'RAINFALL':
            feature_mean_dict[feature] = feature_dict[feature].mean()
            feature_std_dict[feature] = feature_dict[feature].std()
            continue
        feature_mean_dict[feature] = feature_dict[feature].mean()
        feature_std_dict[feature] = feature_dict[feature].std()
    
    feature_dict = dict(zip(feature_list, feature_value))
    for data in train_data:
        for feature in feature_list:
            data.feature_dict[feature] = (data.feature_dict[feature] - feature_mean_dict[feature]) / feature_std_dict[feature]

    return train_data, feature_mean_dict, feature_std_dict

In [12]:
def linear_regression_one(X, Y, hr, learning_rate):
    iteration = 5000
    W_b = np.random.rand(feature_num*hr+1) 
    y_len = X.shape[1] - hr
    #print(count_loss(W_b, X, Y, hr))
    for epoch in range(iteration):
        if epoch % 10 == 0:
            print('epoch:', (epoch+1), 'RMSD:', count_loss(W_b, X, Y, hr))
            if (epoch+1) % 500 == 0:
                print(list(W_b))
        temp_W_b = W_b
        for i in range(len(W_b)):
            delta = 0
            for cntr in range(y_len):
                if (cntr + hr) % 480 == 0 and cntr is not 0:
                    cntr += 10
                x = np.concatenate((np.array(X[:,cntr:cntr+hr]).reshape(-1), [1]), axis=0)
                delta += (np.dot(W_b, x) - Y[cntr+hr]) * x[i]
            temp_W_b[i] = W_b[i] - learning_rate * (1 / y_len) * delta
        W_b = temp_W_b
        if epoch % 1000 == 0:
            learning_rate *= 0.1
            
    return W_b

def count_loss(W_b, X, Y, hr):
    loss = 0.
    y_len = X.shape[1] - hr
    RMSE = 0.
    for cntr in range(y_len):
        if (cntr + hr) % 480 == 0 and cntr is not 0:   
            cntr += 10
        x = np.concatenate((np.array(X[:,cntr:cntr+hr]).reshape(-1), [1]), axis=0)
        loss += ((np.dot(W_b, x)) - Y[cntr+hr])**2
        
    return math.sqrt(loss / (y_len))

def test(test_data, W_b):
    answer = []
    for data in test_data:
        x_matrix = data.test_get_matrix()
        x_ = np.concatenate((np.array(x_matrix).reshape(-1), [1]), axis=0)
        answer.append(np.dot(W_b, x_))
    return answer

def output_answer(output_answer_path, answer):
    os.remove(output_answer_path)
    with open(output_answer_path, 'a') as output_csv:
        output_csv.write('id,value\r\n')
        for i in range(len(answer)):
            line = 'id%d,%f', (int(i+1), an)
            output_csv.write()

In [13]:
learning_rate = 0.1
hour = 9
X, Y, feature_mean_dict, feature_std_dict = PM2point5_dataset(train_data_path)
X_ = read_test_data(test_data_path, feature_mean_dict, feature_std_dict)
W = linear_regression_one(X, Y, hour, learning_rate)
#W_b = [3.25108426e-01,-5.03398455e-01,-2.12642109e-01,6.68537231e-02,-2.72730167e-01,-6.66302095e-02,1.85388996e-01,6.48821276e-02,2.36598006e-01,-1.62021897e-01,3.20845117e-02,-2.04757765e-02,-2.08234699e-01,-1.31801358e-01,-1.89750931e-02,-6.15993875e-03,1.42570000e-02,2.10914480e-01,6.51178481e-02,-1.28857019e-02,7.44868267e-02,-1.22344642e-01,1.39808143e-01,-6.96903363e-02,-1.31240411e-01,9.38781660e-02,4.65731079e-01,-1.32497524e-01,2.91526300e-02,-7.61704817e-02,-1.06280952e-02,7.46664448e-02,-1.25409109e-01,3.25778129e-02,-1.70529749e-01,5.65749710e-02,5.67906514e-02,-2.09573119e-03,3.17513702e-01,-9.05480532e-02,-4.04705686e-02,-1.48390506e-02,1.11932375e-01,1.77284568e-01,-2.35969237e-01,-9.26590928e-02,-2.75119576e-01,-1.50227308e-01,-1.04311253e-01,-9.58637077e-02,-3.66456875e-03,-3.59957824e-01,-9.87228097e-02,1.39436250e+00,1.64920032e-01,-3.72694802e-02,-1.42860084e-01,1.00718618e-01,-4.69053027e-02,-3.62730866e-01,3.38637036e-02,9.77014653e-02,2.99827237e-01,7.06336195e-02,3.30077782e-01,-3.66359787e-01,-2.31700455e-01,-3.82260860e-01,-6.07540990e-01,-2.98363262e-01,2.17782023e-02,1.94458256e+00,-7.47271713e-02,2.21238650e-01,5.94980091e-02,1.68817360e-02,-3.19062333e-01,3.90150501e-01,-4.08573717e-01,-2.42064567e-01,2.06082202e+00,9.98575688e-02,1.23712109e-01,4.17649306e-01,-8.44007259e-02,7.61522167e-01,9.49880667e-01,-3.06834440e+00,2.42141250e+00,1.14885628e+01,-1.70068751e-02,-7.11624703e-02,2.47554673e-02,-8.46883923e-02,1.39972404e-01,9.95604226e-02,-9.61615908e-02,-1.85214841e-01,-2.45231917e-01,1.03753172e-01,1.95248674e-01,5.43848102e-02,-7.91315924e-01,-7.37131303e-02,1.70538896e-01,-4.90753973e-01,8.08942935e-02,4.80254732e-01,-3.14545607e-01,3.83013731e-01,-1.25946383e-01,-4.65914765e-02,-1.29954564e-02,1.26344279e-01,-2.56842769e-01,1.14801201e-01,4.89312914e-01,2.03971846e-01,-6.37071614e-02,6.90494170e-02,2.56330121e-02,1.91484885e-01,2.09936155e-01,-1.53807136e-01,-1.07224465e-01,3.15637654e-01,-3.72047807e-02,2.23722578e-01,-3.49814358e-02,2.55922124e-01,1.98226605e-02,1.03467887e-01,-2.19492449e-01,1.16948261e-01,5.05647635e-02,-1.31826912e-01,-6.56214189e-02,3.82886663e-02,-2.05315436e-01,1.51769955e-01,4.41706076e-02,-1.07322022e-01,-3.12404791e-01,8.99251586e-02,-1.68664504e-01,-6.53218148e-02,1.89773492e-01,4.32174107e-02,-1.14284463e-01,-1.55312526e-01,-1.16590130e-01,-4.79492754e-03,-4.46518539e-02,-3.57445454e-02,1.73002224e-01,-1.62926464e-01,-2.02160808e-01,1.05756336e-01,2.85947730e-01,-2.14610238e-02,-1.58448061e-01,1.48824633e-02,2.13961501e+01]

epoch: 1 RMSD: 31.020275307047683


UnboundLocalError: local variable 'i' referenced before assignment

In [None]:
answer = test(X_, W_b)
output_answer(output_answer_path, answer)