In [2]:
%load_ext autoreload
%autoreload

import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

from equationmodel_gp import gp_regression, prediction_rmse_error, gp_train_graph, gp_linear_compare_graph
from equationmodel_ann import ann_mlp_regression, prediction_rmse_error, ann_linear_compare_graph
from util import ADD_data_loader, combineDF, filteringDF, getFreeSpacePathLoss, makeXforGraphWithGroupingFrequency,\
                makeXforGraph, inverseScale, samplingData, normalizeData,train_2d_graph, train_3d_graph

# pandas console display settings: wide output, up to 30 columns,
# floats rendered with two decimals.
desired_width = 620
pd.options.display.width = desired_width
pd.options.display.max_columns = 30
pd.set_option('display.float_format', "{:.2f}".format)

In [3]:
%autoreload

def _site_file_list(site, heightTB, heightB, heightM=2, frequencies=(400, 1399, 2249)):
    """Build the file-list dict for one measurement site.

    Parameters
    ----------
    site : str
        Site token used in the CSV file name.
    heightTB, heightB, heightM : number
        Values stored under the 'heightTB', 'heightB' and 'heightM' names.
    frequencies : iterable of int
        One CSV file is listed per frequency.

    Returns
    -------
    dict
        Maps '../data/PLdata_<site>_wt_<frequency>.csv' to a list of
        (name, value) pairs: frequency, heightTB, heightB, heightM.
    """
    return {
        '../data/PLdata_{}_wt_{}.csv'.format(site, frequency): [
            ('frequency', frequency), ('heightTB', heightTB),
            ('heightB', heightB), ('heightM', heightM)]
        for frequency in frequencies
    }


# The three sites share the same frequencies and heightM; only heightTB / heightB differ.
iksan_fileList = _site_file_list('iksan', heightTB=30, heightB=15)
nonsan_fileList = _site_file_list('nonsan', heightTB=30, heightB=15)
paju_fileList = _site_file_list('paju', heightTB=100, heightB=7)

# Load the sites one after another (Iksan, Nonsan, Paju) and report the shape
# of each loaded frame right after its load, followed by a blank line.
siteFrames = {}
for siteName, siteFiles in (("Iksan", iksan_fileList),
                            ("Nonsan", nonsan_fileList),
                            ("Paju", paju_fileList)):
    siteFrames[siteName] = ADD_data_loader(siteFiles)
    print("{} data:".format(siteName), siteFrames[siteName].shape)
    print("")

addIksan = siteFrames["Iksan"]
addNonsan = siteFrames["Nonsan"]
addPaju = siteFrames["Paju"]

# print(addIksan.describe())
# print(addNonsan.describe())
# print(addPaju.describe())

# Merge the three site frames, then apply the 'heightTM' filter.
# NOTE(review): [10, 100] is presumably a [min, max] range -- confirm in util.filteringDF.
heightTMRange = [10,100]
addData = combineDF([addIksan, addNonsan, addPaju])
print("antenna_b height filtering-before(data count):",addData.shape)
addData = filteringDF(addData, 'heightTM', heightTMRange)
print("antenna_b height filtering-after(data count):",addData.shape)

# Keep only rows whose measured 'pathloss' is at least the value returned by
# getFreeSpacePathLoss for the same distance and frequency.
print("freespace pathloss filtering-before(data count):",addData.shape)
addData['freePathloss'] = getFreeSpacePathLoss(addData['distance'],addData['frequency'])
notBelowFreeSpace = addData['pathloss'] >= addData['freePathloss']
addData = addData[notBelowFreeSpace]
print("freespace pathloss filtering-after(data count):",addData.shape)

# print("ADD data sample:\n",addData.head())

# Response column, predictor columns, and the two combined
# (targetCols = predictors followed by the response).
yCols = 'pathloss'
xCols = [
    'logDistance', 'logFrequency', 'logHeightB', 'logHeightM',
    'logExtendedHeightTratio', 'logHeightTratio', 'logAntennaMulLogDistance',
]
targetCols = xCols + [yCols]

# Columns used for the covariance / correlation summaries below.
# This differs from targetCols: 'logHeightM' is not included (presumably because
# heightM is the same constant in every file list, which would give a
# zero-variance column -- TODO confirm).
statCols = ['logDistance', 'logFrequency', 'logHeightB', 'logExtendedHeightTratio', 'logHeightTratio', 'logAntennaMulLogDistance', 'pathloss']

print("\nADD data description")
print(addData.describe())

print("\nCovariance Matrix - ADD data[pathloss]")
# Restrict to numeric columns explicitly: recent pandas versions no longer drop
# non-numeric columns silently in DataFrame.cov() and raise instead.
print(addData.select_dtypes(include='number').cov()['pathloss'])

print("\nCovariance Matrix - ADD data[Target Columns]")
# Print the columns that are actually used, not targetCols.
print("Target Columns:", statCols)
print(addData[statCols].cov()['pathloss'])

print("\nCorrelation Matrix - ADD data[Target Columns]")
print("Target Columns:", statCols)
corrMat = addData[statCols].corr()
print(corrMat)

# Heatmap of the correlation matrix, drawn with matplotlib only: seaborn (`sns`)
# is not imported by this notebook, so calling it fails on a fresh kernel.
f, ax = plt.subplots(figsize=(8,7))
heat = ax.imshow(corrMat.values)  # default 'equal' aspect keeps the cells square
ax.set_xticks(range(len(statCols)))
ax.set_xticklabels(statCols, rotation=90)
ax.set_yticks(range(len(statCols)))
ax.set_yticklabels(statCols)
f.colorbar(heat, ax=ax)

print("\nSample - ADD data")
print(addData.sample())

ADD data preprocessing
../data/PLdata_iksan_wt_1399.csv: distance filtering(before):(109060, 4)
../data/PLdata_iksan_wt_1399.csv: distance filtering(after):(10396, 4)
../data/PLdata_iksan_wt_2249.csv: distance filtering(before):(108873, 4)
../data/PLdata_iksan_wt_2249.csv: distance filtering(after):(10396, 4)
../data/PLdata_iksan_wt_400.csv: distance filtering(before):(109210, 4)
../data/PLdata_iksan_wt_400.csv: distance filtering(after):(10396, 4)
          type  distance  pathloss  heightTM  frequency  heightTB  heightB  heightM
type 6355    m      2.97    124.62      0.02       1399        30       15        2
     6356    m      2.96    123.33      0.02       1399        30       15        2
     6357    m      2.96    121.37      0.02       1399        30       15        2
     6358    m      2.95    121.60      0.02       1399        30       15        2
     6359    m      2.91    114.82      0.02       1399        30       15        2
Combined data set: (31188, 8)
type filterin

In [16]:
%autoreload
# Settings for this step: ratio passed to samplingData and scaler name passed
# to normalizeData.
samplingRatio = 0.7
normalizer = 'standard'

# Sample the target columns, then split the sample into normalized predictors
# (X, together with the scaler returned by normalizeData) and the raw response (Y).
addDataSample = samplingData(addData[targetCols], samplingRatio)

predictorFrame = addDataSample[xCols]
X, scaler = normalizeData(predictorFrame, scaler=normalizer)
Y = np.array(addDataSample[yCols])

data distribution(before)
       logDistance  logFrequency     pathloss
count  79125.00000   79125.00000  79125.00000
mean       0.30807       3.03282    127.46660
std        0.11901       0.31582     16.61257
min        0.00003       2.60206     85.05800
25%        0.23252       2.60206    114.90700
50%        0.32884       3.14582    127.16800
75%        0.39827       3.35199    140.50700
max        0.47708       3.35199    173.14600
sampling shape(before):(79125, 3)
sampling shape(after):(7912, 3)
data distribution(after)
       logDistance  logFrequency    pathloss
count   7912.00000    7912.00000  7912.00000
mean       0.30684       3.02882   127.26826
std        0.11958       0.31618    16.71644
min        0.00010       2.60206    85.30200
25%        0.23044       2.60206   114.93275
50%        0.32880       3.14582   126.94650
75%        0.39800       3.35199   140.28900
max        0.47708       3.35199   170.73300
normalization distribution(before):
       logDistance  logFrequ

In [17]:
# 5-fold cross-validation of the GP regressor on (X, Y).
# shuffle=False: each test fold is a contiguous slice of X in its current order.
kf = KFold(n_splits=5, shuffle=False)

modelList = []  # fitted model of every fold, in fold order
dataSet = []    # per fold: [X_train, y_train, X_test, y_test, trainError, testError]
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    print("X_train shape:{}, y_train shape:{}, X_test shape:{}, y_test shape:{}".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    model = gp_regression(X_train, y_train.flatten())

    # Predictive mean and standard deviation on both splits.
    pred_train, std_train = model.predict(X_train, return_std=True)
    pred_test, std_test = model.predict(X_test, return_std=True)

    trainError = np.sqrt(mean_squared_error(y_train, pred_train))
    testError = np.sqrt(mean_squared_error(y_test, pred_test))
    label = "GP-Model-"+str(i)
    print(label+"-train error(RMSE):", trainError)
    print(label+"-test error(RMSE):", testError)

    # Label the two arrays separately so they can be told apart in the output.
    print(label+"-train STD DEVIATION:", std_train)
    print(label+"-test STD DEVIATION:", std_test)

    dataSet.append([X_train,y_train,X_test,y_test, trainError, testError])
    modelList.append(model)

    # Persist the fold's model; the context manager closes the file handle
    # even if pickling fails.
    filename = 'model/gp_model_' + str(i) + '.sav'
    with open(filename, 'wb') as modelFile:
        pickle.dump(model, modelFile)

X_train shape:(6329, 2), y_train shape:(6329, 1), X_test shape:(1583, 2), y_test shape:(1583, 1)
GP-Model-1-train error(RMSE): 0.8563661039906155
GP-Model-1-test error(RMSE): 13.064601822444297
GP-Model-1-STD DEVIATION: [4.40927398 4.39974862 4.39655282 ... 4.402519   4.39459392 4.33510581]
GP-Model-1-STD DEVIATION: [13.51733779 13.93150582 12.4596096  ... 12.71655296  4.35542408
 12.35129564]
X_train shape:(6329, 2), y_train shape:(6329, 1), X_test shape:(1583, 2), y_test shape:(1583, 1)
GP-Model-2-train error(RMSE): 0.8776706543072857
GP-Model-2-test error(RMSE): 13.242998920278595
GP-Model-2-STD DEVIATION: [4.40959808 4.41289009 4.4007601  ... 4.40126586 4.40652086 4.34167238]
GP-Model-2-STD DEVIATION: [13.50798898 13.07014064 11.81194893 ... 12.46327147 13.23962758
 13.30450253]
X_train shape:(6330, 2), y_train shape:(6330, 1), X_test shape:(1582, 2), y_test shape:(1582, 1)
GP-Model-3-train error(RMSE): 0.862650345866623
GP-Model-3-test error(RMSE): 12.862018014970673
GP-Model-3-ST