In [1]:
import csv
import os
import sys
import datetime
import math
import numpy as np

In [2]:
feature_num = 18
feature_list = ['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx',
                'O3','PM10', 'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC',
                'WD_HR', 'WIND_DIREC', 'WIND_SPEED', 'WS_HR']
train_data_path = './train.csv'
test_data_path = './test.csv'
output_answer_path = './answer.csv'

class Data(object):
    
    def __init__(self, location, date, feature_dict):
        
        self.location = location
        self.date = date
        self.feature_dict = feature_dict
        self.str2num()
        self.PM2point5 = self.feature_dict['PM2.5']
        
    def str2num(self):   
        for feature in feature_list:
            if feature == 'RAINFALL':
                temp = self.feature_dict[feature]
                for i in range(len(temp)):
                    if self.feature_dict[feature][i] == 'NR':
                        self.feature_dict[feature][i] = 0
                    else:
                        self.feature_dict[feature][i] = 0
                self.feature_dict[feature] = np.array(self.feature_dict[feature])
                continue
            temp = self.feature_dict[feature]
            for i in range(len(temp)):
                self.feature_dict[feature][i] = float(temp[i])
            self.feature_dict[feature] = np.array(self.feature_dict[feature])
            
    def feature2matrix(self):   
        x = np.ndarray(shape=(18, 24), dtype=float)
        cntr = 0
        for feature in feature_list:
            x[cntr] = self.feature_dict[feature]
            cntr += 1
            
        return np.asmatrix(x)
    
    def test_feature2matrix(self):      
        x = np.ndarray(shape=(18, 9), dtype=float)
        cntr = 0
        for feature in feature_list:
            x[cntr] = self.feature_dict[feature]
            cntr += 1
            
        return np.asmatrix(x)
    
    def get_matrix(self):
        self.x = self.feature2matrix()
        return self.x
    
    def test_get_matrix(self):
        self.x = self.test_feature2matrix()
        return self.x
    
    def get_PM2point5(self):
        return self.PM2point5
    
def PM2point5_dataset(train_data_path, fs):
    train_data, feature_mean_dict, feature_std_dict = feature_scaling(read_train_data(train_data_path), fs)
    x_matrix = None
    y_matrix = None
    cntr = 0
    for data in train_data:
        if cntr == 0:
            x_matrix = data.get_matrix()
            y_matrix = data.get_PM2point5()
            cntr += 1
            continue
        x_temp = data.get_matrix()
        y_temp = data.get_PM2point5()
        x_matrix = np.hstack((x_matrix, x_temp))
        y_matrix = np.hstack((y_matrix, y_temp))
        cntr += 1
    
    return x_matrix, y_matrix, feature_mean_dict, feature_std_dict

def read_train_data(train_data_path):
    train_data = []
    feature_value = []
    line_count = 0
    
    with open(train_data_path, encoding='big5') as train_data_csv:
        next(train_data_csv)
        reader = csv.reader(train_data_csv, delimiter=',')
        for line in reader:
            line_count += 1
            time = line[0].split('/')
            date = datetime.datetime(int(time[0]), int(time[1]), int(time[2]))
            location = line[1].strip()
            value = []
            for ele in line[3:]:
                value.append(ele)     
            feature_value.append(value)
            if line_count % 18 == 0:
                feature_dict = dict(zip(feature_list, feature_value))
                train_data.append(Data(location, date, feature_dict))
                feature_value = []
                
    return train_data

def read_test_data(test_data_path, feature_mean_dict, feature_std_dict, fs):
    test_data = []
    feature_value = []
    line_count = 0
    with open(test_data_path, encoding='big5') as test_data_csv:
        reader = csv.reader(test_data_csv, delimiter=',')
        for line in reader:
            line_count += 1
            value = []
            for ele in line[2:]:
                value.append(ele)     
            feature_value.append(value)
            if line_count % 18 == 0:
                feature_dict = dict(zip(feature_list, feature_value))
                test_data.append(Data('', '0-0-0', feature_dict))
                feature_value = [] 
    
    if fs == 1:
        for data in test_data:
            for feature in feature_list:
                if feature == 'RAINFALL':
                    data.feature_dict[feature] = 0
                    continue
                data.feature_dict[feature] = (data.feature_dict[feature] - feature_mean_dict[feature]) / feature_std_dict[feature]
    else:
        for data in test_data:
            for feature in feature_list:
                if feature == 'RAINFALL':
                    data.feature_dict[feature] = 0

    return test_data

def feature_scaling(train_data, fs):  
    feature_value = [np.array([])]*24
    feature_dict = dict(zip(feature_list, feature_value))
    feature_mean = []*24
    feature_mean_dict = dict(zip(feature_list, feature_value))
    feature_std = []*24
    feature_std_dict = dict(zip(feature_list, feature_value)) 
    
    for data in train_data:    
        for feature in feature_list:
            feature_dict[feature] = np.concatenate((feature_dict[feature], data.feature_dict[feature]))   
    for feature in feature_list:    
        if feature == 'RAINFALL':
            feature_mean_dict[feature] = 0
            feature_std_dict[feature] = 1
            continue
        if fs == 1:
            feature_mean_dict[feature] = feature_dict[feature].mean()
            feature_std_dict[feature] = feature_dict[feature].std()
        else:
            feature_mean_dict[feature] = 0
            feature_std_dict[feature] = 1
    
    feature_dict = dict(zip(feature_list, feature_value))
    for data in train_data:
        for feature in feature_list:
            if feature == 'RAINFALL':
                data.feature_dict[feature] = 0
                continue
            data.feature_dict[feature] = (data.feature_dict[feature] - feature_mean_dict[feature]) / feature_std_dict[feature]

    return train_data, feature_mean_dict, feature_std_dict


def test(test_data, W_b):
    answer = []
    hour = 9
    for data in test_data:
        x_matrix = data.test_get_matrix()
        a = np.array(x_matrix[9]).reshape(-1)
        #b = np.array(x_matrix[7]).reshape(-1)
        #c = np.concatenate((a, b), axis=0)
        c = np.array(x_matrix[7:10,4:9]).reshape(-1)
        x_ = np.concatenate(([1], c), axis=0)
        a = np.dot(W_b, x_) + 1
        #if a % 1 > 0.5:
            #a = int(a) + 1
        #else:
            #a = int(a)
        answer.append(a)
    return answer

def output_answer(output_answer_path, answer):
    #os.remove(output_answer_path)
    #os.mknod("answer.csv")
    with open(output_answer_path, 'a') as output_csv:
        output_csv.write('id,value\r\n')
        for i in range(len(answer)):
            line = 'id_%d,%f\r\n' % (i, answer[i])
            output_csv.write(line)
            

In [30]:
def gradient_descent(X, Y, regularization_lambda, learning_rate):
    x_train = []
    y_train = []
    hour = 9
    epoch = 100000
    feature_use = 1
    for i in range(X.shape[1] - hour - 1):
        if (i + hour) % 480 >= (480 - hour):
            continue
        a = np.array(X[9,i:i+hour]).reshape(-1)
        #b = np.array(X[7,i:i+hour]).reshape(-1)
        #c = np.array(X[7:10,i:i+hour]).reshape(-1)
        x_train.append(np.concatenate(([1], a), axis=0))
        y_train.append(Y[i+hour])
    
    print(x_train[0])
    W = np.random.uniform(-1, 1, (feature_use*hour+1))
    temp_W = np.random.rand(feature_use*hour+1)
    example_num = float(len(x_train))
    
    for e in range(epoch):
        loss = []
        RMSD = 0.
        temp_W = W
        for i in range(int(example_num) - 1):
            loss.append((np.dot(W, x_train[i]) - y_train[i])**2)
        
        for j in range(feature_use*hour + 1):
            delta = 0
            if j == 0:
                for i in range(int(example_num) - 1):
                    delta += (y_train[i] - np.dot(W, x_train[i])) * (-1)
                temp_W[j] = W[j] - learning_rate * (1. / example_num) * delta
            else:
                for i in range(int(example_num) - 1):
                    delta += (y_train[i] - np.dot(W, x_train[i])) * (-1)*x_train[i][j]
                temp_W[j] = W[j]*(1-regularization_lambda*(1. / example_num)*W[j]) - learning_rate * (1. / example_num) * delta                                 

        W = temp_W
        if (e) % 10 == 0:
            print('epoch', e+1,'loss:', sum(loss), 'RMSD:', math.sqrt(sum(loss)/example_num))
            if (e) % 500 == 0:
                print(list(W))
        
    return W

X, Y, feature_mean_dict, feature_std_dict = PM2point5_dataset(train_data_path, 0)
#W = gradient_descent(X, Y, 0.1, 0.0001)

In [16]:
X_ = read_test_data(test_data_path,feature_mean_dict, feature_std_dict, 0)
#W = [0.000383057055293,-0.00630075731942,-0.0283506555302,-0.00715920455751,-0.0534075305958,-0.0153211111846,0.00521029537902,-0.00664538293774,-0.008942586149,0.10329842431,0.0150177748134,0.0161006836293,0.0174309829953,0.0106078737856,0.00843565900007,0.0281079100516,0.0153012577659,0.0321362881693,0.0440220881212,0.00981952545287,0.00270374547641,0.00311904539474,-0.00831366158948,0.0265086557981,-0.00537383664071,-0.0104548358418,0.0329079031975,0.0710665471373,-0.00555804630319,0.008511107516,-0.000307704952032,0.00985935951031,0.00820938969585,-0.00331744532489,0.00320150594786,0.00168022535264,0.00905859314458,0.0251965168113,0.0182314801681,0.0544218795243,0.00545063784645,-0.00814474875217,-0.0436367996308,0.0372905532286,0.034983932386,-0.0740747636577,-0.00654003330572,-0.0206985514046,-0.0293321991665,-0.0524123771623,0.0066062098727,5.81635580245e-05,-0.0620603292073,-0.0599523152067,0.176319965742,0.00918519976358,-0.0135588102588,0.0226743692423,-0.0110433781941,0.0210224928888,-0.015075712565,-0.0239545901955,0.0179404262737,0.111748051009,0.00363421466892,0.0142623800384,-0.00479632241087,-0.0244512026301,0.00312751523334,-0.0306296198157,-0.0237280498825,-0.0112402076076,0.104676638978,0.00757523918108,0.0085910141521,-0.030473180885,0.0413045397263,-0.00730506517061,-0.0389661876672,0.039793856623,-0.00468430334157,0.0371543059178,-0.0373520227186,-0.00828451881405,0.209389766975,-0.231852203777,-0.0213441657211,0.488444112834,-0.552360315409,0.0235025802024,0.955010424198,0.0249196915948,-0.00957363805853,-0.0431009530465,-0.00670515884781,-0.0380986567958,0.0401302596772,0.0129598061915,-0.0427505864022,-0.0782288343044,-0.005054341055,0.0243150863422,0.00276589769699,-0.0302473049307,-0.03926404826,0.048629807594,-0.0654737115393,0.0316630323739,0.0234880609866,-0.0913532880209,0.092576883831,-0.00372856624081,-0.0414721007753,-0.00156586844374,0.0175197559178,-0.0257111151619,0.0784339537014,0.164986965396,0.0183370526281,0.015410752574,0.0224874864523,0.015579418433,0.0305620977937,0.0152641529079,0.0204758307344,0.0254696933113,0.0450274354238,-0.000675916150581,0.00304485635235,-0.000571332145922,0.00170677586207,0.000192997897954,0.00183191495744,-0.00241664917611,0.000790589016487,0.000284028227633,-0.00196445184234,-0.000781614739661,0.00106351559366,-0.00224287538506,0.00048417628924,-0.000131303874755,-0.000240891779796,-0.00261076435566,0.000383178722308,-0.0344693947731,-0.00937917039673,0.0160136730698,-0.00742077584701,-0.0135806496511,-0.000143304478653,-0.0395818800161,-0.034288788286,-0.0230690234364,-0.0236181834091,-0.0027577210085,-0.0166078901847,-0.0341795522015,-0.00657074089321,0.049291012862,-0.0072334058529,-0.0421218400797,-0.00445628618361]
#W_2 = [21.398850649011692, -0.058428663188457096, -0.31221586140790536, 1.326441349095562, -1.7343909598274891, 0.13169788069840291, 0.79245888009157495, -0.49856043706215408, -1.2738610531576868, 1.5863254029967517, -0.18494666073133936, 0.23863690705738683, 0.26844631953253317, 0.3165437848683802, 0.031397904403610746, 0.024406336908566865, -0.16541765569692499, 0.089359185227595561, 0.39535235335488844, -0.0085727406379833403, -0.039212211991691469, 0.097222479198955492, -0.15588063098494229, 0.11793907439043103, 0.020229834023606497, -0.14275680345622718, 0.062920282055513088, 0.38941547441608904, -0.27276746676131808, 0.44616543650516544, -0.0040137582032632804, 0.5036230043282075, 0.35782628286777873, -0.48571261839706892, 0.074268230513515776, 0.10036950714348966, -0.025407984863833558, 0.084273709227773108, -0.27470163696676864, 0.061145717917273808, -0.34580725299948634, -0.054333527500777773, -0.12462152365810884, -0.019485695635131691, 0.050642616346644923, -0.025314052298640644, 0.30378593721564873, -1.0604281160788638, -0.72334907926365843, -0.81605516404873135, -0.40665133798261383, 0.13781098968301991, -0.40156066053433309, -1.129641459524102, 1.9356772247806411, -0.027032473420767346, 0.78521086518877381, 0.7225133381475477, 0.83598458917372298, 0.27131273930864841, -0.097970442184845741, 0.13487788166533399, 0.68493940171754908, -0.26179632880584558, -0.022141146026008324, 0.32567169536004725, -0.13142645180059243, -0.4287136473634045, 0.058629990506335175, -0.53616781599895669, -0.32872639131979753, -0.32906084016430848, 1.7801969767647698, 0.26430839766198394, 0.25434463895337778, -0.80147057931818244, 1.0080793725169379, -0.24507685669703805, -0.79554135186774821, 0.82878260525337133, -0.060599347921022763, 0.94213218082636707, -0.61413398440160571, -0.16175817699530379, 3.4506753941602333, -3.860978029444269, -0.32173848141288591, 8.0529404610650364, -9.237634198796151, 0.53289243429727551, 15.821539560600291, 0.73654923151208851, -0.83860997631846856, -0.74284785810835097, 0.41910771232725297, -0.17260958977971411, -0.080407650461389021, 0.78214375323175855, 0.32606291485566891, -0.54970060637247431, -0.3957972382534457, 0.62888270565508153, 0.30247573969024122, -0.76787180928021259, -0.39452030508036157, 0.8820872411725359, -1.1572772315089705, 0.12003898020633706, 0.3734667577850313, -0.43151902013444882, 0.66156221554173611, -0.21203990401176717, -0.088785118454118697, 0.010122845110422572, 0.076078347635592958, -0.22448724657356833, 0.26091184320018362, 0.25897303625536577, 0.1748704132094924, -0.45610627892616096, -0.14867412977496713, -0.71467202847565969, -0.16113713102310817, 0.29012144148678776, 0.10784485749029686, -0.27576330375593217, 0.011738329221607242, -0.053937302953315877, 0.23719520657176477, -0.11125573850379174, 0.17374200201811812, 0.069972713477736689, 0.15825603189647003, -0.28432541789269017, 0.077047079108156655, 0.0077973600786040206, -0.14838917041919969, -0.086239378134611952, 0.10083600924503788, -0.17666994063678282, 0.087080053289975562, 0.032434772487686656, 0.013851302504987771, -0.19546921911693874, 0.034520284743916635, -0.22469041153987912, -0.20303094866738783, 0.20734226298205558, 0.099595937893862507, -0.04949558413226119, -0.14609317481925255, -0.09541526699487668, -0.030604914187800241, -0.082853329490312044, -0.018080816014329579, 0.31706546134085362, -0.16997972478837917, -0.19593000450073086, -0.11941196208447354, 0.34291320535605402, -0.0098861746106588041, -0.22051280390401429, 0.19232354807660315]
#W_3 = [0.00348262559583,0.00193013355057,-0.0336814078252,0.00226642123987,-0.0731450152635,-0.0193964207454,0.0118214618255,-0.0138947953103,-0.0247503438309,0.131429194669,0.0175712684287,0.020566932232,0.0235466975238,0.0140781774896,0.0135307976527,0.0370504276681,0.0229219182029,0.0411883312987,0.0582663845281,0.0106195185641,0.00518986603234,0.00632184106418,-0.0104422951715,0.0377606236889,-0.00391622313214,-0.014267451611,0.0468632386296,0.102116819129,-0.00639995918888,0.00888524064025,-0.00117731213818,0.0100874432981,0.0106558891408,-0.00361161418404,0.00420203521033,0.00274454097956,0.0110836978869,0.024600929251,0.016530083502,0.0602965221745,-0.00158778710368,-0.0109343403617,-0.0559536541932,0.0389209672043,0.0432345435128,-0.0921966486655,-0.00207389586192,-0.0223147834265,-0.032769158973,-0.0593263386542,0.00221208730926,0.00414841281054,-0.0620821627088,-0.0793556360696,0.183753821672,0.00959559360843,-0.0173806651773,0.0245136341966,-0.00366003886346,0.0252014960902,-0.0151314691143,-0.0205005492027,0.0250458602369,0.108486379548,0.00320552936261,0.0140934650768,-0.00423904453212,-0.0250169493068,0.00343606248214,-0.0304165345751,-0.0241315311312,-0.0118435503752,0.104291640651,0.00751840647875,0.0091648443515,-0.0312946812117,0.0416038277031,-0.0065427414462,-0.0400572410715,0.0400368795966,-0.00381592484237,0.0362370919647,-0.0368006176352,-0.0105096835998,0.212599876986,-0.23340795858,-0.0236115036336,0.492288862915,-0.554045332705,0.0224491304671,0.956165867469,0.0272479821901,-0.00935006136573,-0.0463585452649,-0.00157000534122,-0.0447571239458,0.0436110194328,0.0155108656403,-0.0432107396434,-0.0797046271224,-0.00598331936248,0.0239851279323,0.00537745580748,-0.0324084443752,-0.0446158696086,0.0578018328775,-0.0781663112379,0.0377293731632,0.0252154281606,-0.12448429784,0.13031243663,-0.00608292789087,-0.053451190208,-0.00193590810909,0.0200471348121,-0.0471485788726,0.0877295209469,0.184020354412,0.0213422695997,0.0193451378901,0.0272563270022,0.0175897078261,0.038152643404,0.0249511106192,0.0291133469629,0.0343450743002,0.0597367826542,-0.000633963880766,0.0030199690729,-0.000584085194075,0.00172248588261,0.000216806775606,0.00185053569032,-0.00239328669108,0.00078929182295,0.000220443521131,-0.00197253074568,-0.000785759977643,0.00110154188868,-0.00219512646349,0.000483122296044,-0.000103281915002,-0.000223746596597,-0.00260461028247,0.000342689724557,-0.0426204876178,-0.0133648546708,0.0259547927101,-0.00512813234685,-0.0144981364855,0.00490871997838,-0.0451460762546,-0.0422527163404,-0.0265001354772,-0.0305420405226,0.000913504208467,-0.0204741690415,-0.0413040583289,-0.00811290293669,0.0704574793694,-0.0066554655082,-0.054355087719,0.00295449849437]
#W_9 = [1.0717842651854583, -0.025907615684444222, -0.017876943888587238, 0.20899464909363966, -0.23585596730719077, -0.045273885301724257, 0.52457103819644924, -0.57640734747620548, 0.017259186037478759, 1.0909371927544229]
#W_18 = [-0.25843406889216736, -0.016419904603506365, 0.018580995093799034, -0.0083989008344174457, -0.0035376094151997905, 0.0029840680404419457, -0.021312304869977813, 0.0036318745805717346, -0.021547755254193428, 0.070625445975628157, 0.010902503512705025, -0.0014779407654862023, -0.023115400777239292, 0.041071190116965091, -0.026120401256189293, -0.014874596920240631, 0.026867731556848375, -0.024124048820350564, 0.079693025218588828, -0.058609174081809261, 0.032693055500844961, 0.20084073508344413, -0.28015448728352349, 0.029355378311028631, 0.48034981379502495, -0.60461093874590888, 0.059801188955979231, 0.97475670829389427]
W_X = [-0.51493742804043952, 0.0029576997730171094, -0.021703822906912876, -0.0042505114290215789, -0.01386638434673188, 0.070685144707054212, 0.001985133912326853, -0.0078332400611526125, 0.006431267044018028, -0.01157076150433761, 0.080695014599191789, -0.063488925548648739, 0.3818044397613139, -0.45961882029106871, -0.0079163839684036653, 0.97838211858350144]
W_normal_equation = [ 1.71786466, -0.03282123, -0.0213131,   0.21143442, -0.23585443, -0.0528447, 0.52723935, -0.57353344,  0.00768656,  1.08954066]
print(len(W_X))
answer = test(X_, W_nomarl_equation)
output_answer(output_answer_path, answer)
#best W_X + 2

16


In [29]:
def normal_equation(X, Y):
    x_train = []
    y_train = []
    feature_use = 1
    hour = 9
    for i in range(X.shape[1] - hour - 1):
        if (i + hour) % 480 >= (480 - hour):
            continue
        a = np.array(X[9,i:i+hour]).reshape(-1)
        #b = np.array(X[7,i:i+hour]).reshape(-1)
        #c = np.array(X[7:10,i:i+hour]).reshape(-1)
        x_train.append(np.concatenate(([1], a), axis=0))
        #x_train.append(a)
        y_train.append(Y[i+hour])
    
    print(x_train[0])
    x_train = np.matrix(x_train)
    y_trian = np.matrix(y_train)
    print(x_train.shape)
    x_train_t = x_train.transpose()
    print(x_train_t.shape)
    
    theta = np.matmul(np.matmul(np.linalg.inv(np.matmul(x_train_t, x_train)), x_train_t), y_train)
    print(theta.shape)
    print(theta)
    
normal_equation(X, Y)

[  1.  26.  39.  36.  35.  31.  28.  25.  20.  19.]
(5643, 10)
(10, 5643)
(1, 10)
[[ 1.71786466 -0.03282123 -0.0213131   0.21143442 -0.23585443 -0.0528447
   0.52723935 -0.57353344  0.00768656  1.08954066]]
