# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from visualize import *
from data_treatments import *
from feature_selection import *
from models import *

from tp.competitive_learning import *
from tp.linear_model import *
from tp.rbfn import *

# Load data

In [2]:
# Read the three CSV inputs: X1 (features), Y1 (a single column, named "Label"
# here) and X2 (not used in the cells visible below).
# Because `names` is supplied for Y1, pandas reads the file's first line as
# data rather than as a header row.
X1 = pd.read_csv("../data/X1.csv")  # [7684 rows x 15 columns]
Y1 = pd.read_csv(
    "../data/Y1.csv",
    sep="\t",
    names=["Label"],
)  # [7684 rows x 1 columns]
X2 = pd.read_csv("../data/X2.csv")  # [3787 rows x 15 columns]

# X1 = X1.values  # would convert the pandas DataFrame to a numpy array

# Visualize data

In [3]:
#visualize(X1, Y1, 'data.png', True) 

# Data treatments

In [4]:
# Step 1: feature engineering on the raw frame. The helpers run innermost
# first, each one receiving the previous one's result:
#   handlecyclic    - month, day, hour and wd cyclic features
#   handle_station  - station feature
#   add_linear_time - global linear time built from year, month, day, hour
X1_handled = add_linear_time(handle_station(handlecyclic(X1)))

print(X1_handled.shape)



(7684, 34)


# Visualize new data

In [5]:
#visualize(X1_handled, Y1, 'new_data.png', True) 

# Data Splitting

In [6]:
# Hold out 20% of the rows as the test set; the remaining 80% is kept for
# training/validation. random_state=0 pins the shuffle, so the same rows land
# in each part on every run.
X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(
    X1_handled,
    Y1,
    test_size=0.2,
    random_state=0,
)

# One shape per line, in the order: train X, test X, train Y, test Y.
print(
    X_train_valid.shape,
    X_test.shape,
    Y_train_valid.shape,
    Y_test.shape,
    sep="\n",
)

(6147, 34)
(1537, 34)
(6147, 1)
(1537, 1)


# Normalisation/Standardisation

In [7]:
# NOTE(review): norm() and stand() are called separately on the training and
# the test frame. If they derive their scaling statistics from the frame they
# receive, the two sets end up on different scales - confirm against their
# definitions and, if so, reuse the training statistics for the test set.

# Normalised copies; column labels are restored from the source frames.
X_norm = pd.DataFrame(norm(X_train_valid))
X_norm.columns = X_train_valid.columns

X_norm_test = pd.DataFrame(norm(X_test))
X_norm_test.columns = X_test.columns

# Standardised copies, built the same way.
X_stand = pd.DataFrame(stand(X_train_valid))
X_stand.columns = X_train_valid.columns

X_stand_test = pd.DataFrame(stand(X_test))
X_stand_test.columns = X_test.columns


# Feature selection

In [8]:
# Feature selection, run on the standardised training frame.

# Earlier approach, kept as an inert string for reference.
'''
n = 7 # number of feature to select
most_mi = print_mutual_information(X1_handled, X_stand.values, Y_train_valid.values.ravel(), 10)
a = len(most_mi)
index_selected = most_mi[a-n:a]
print(index_selected)
X1 = X_stand.values[:,index_selected]
X1_test = X_stand_test.values[:,index_selected]
'''

# NOTE(review): 7 is passed as the last argument, yet the recorded output of
# this cell lists five columns - check what this argument controls.
# To work on the normalised data instead, use X_norm / X_norm_test in the
# three statements below.
features = features_selection(
    X_stand,
    Y_train_valid.values.ravel(),  # labels flattened to a 1-D array
    7,
)

# Apply the same column selection to the training and the test frame.
X_selected = X_stand[features]
X_test_selected = X_stand_test[features]

print(X_selected.columns, X_test_selected.columns, sep="\n")

Index(['DEWP', 'WSPM', 'SO2', 'NO2', 'CO'], dtype='object')
Index(['DEWP', 'WSPM', 'SO2', 'NO2', 'CO'], dtype='object')


# Building model

In [9]:
def build_model(X_train, Y_train, X_test, Y_test, model):
    """Dispatch to the model function named by ``model[0]`` and return its result.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test
        Passed through unchanged, in this order, to the selected model function.
    model : sequence
        ``model[0]`` is the model name: one of 'linear_regression_tp',
        'rbfn_tp', 'KNN', 'linear_regression', 'tree', 'random_forest',
        'MLperceptron' or 'SVM'. For 'rbfn_tp', ``model[1]`` and ``model[2]``
        are forwarded as two additional positional arguments.

    Returns
    -------
    The value returned by the selected model function (the call sites store
    it as an RMSE). For an unrecognised name a message is printed and the
    placeholder value 1000 is returned.
    """
    name = model[0]

    if name == 'linear_regression_tp':
        return linear_regression_tp(X_train, Y_train, X_test, Y_test)

    if name == 'rbfn_tp':
        # The only model that consumes extra entries of the spec.
        return rbfn_tp(X_train, Y_train, X_test, Y_test, model[1], model[2])

    if name == 'KNN':
        return KNN(X_train, Y_train, X_test, Y_test)

    if name == 'linear_regression':
        return linear_regression(X_train, Y_train, X_test, Y_test)

    if name == 'tree':
        return tree(X_train, Y_train, X_test, Y_test)

    if name == 'random_forest':
        return random_forest(X_train, Y_train, X_test, Y_test)

    if name == 'MLperceptron':
        return MLperceptron(X_train, Y_train, X_test, Y_test)

    if name == 'SVM':
        return SVM(X_train, Y_train, X_test, Y_test)

    # Unknown name: report it and fall back to the placeholder score.
    print('incorrect value for model')
    return 1000
    
    

In [10]:
# RBF network: build_model forwards the two lists to rbfn_tp as its two
# extra arguments.
# Value noted from an earlier run (stand, X_test_selected): 41.109355030141586
rmse = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['rbfn_tp', [20, 30, 40], [4, 8]],
)
rmse

n_center =  20
smooth_f =  4
smooth_f =  8
n_center =  30
smooth_f =  4
smooth_f =  8
n_center =  40
smooth_f =  4
smooth_f =  8


40.728885700161854

In [11]:
# Linear regression, 'linear_regression_tp' variant.
# Values noted from earlier runs on X_test_selected:
#   with stand: 45.81350958024227
#   with norm:  46.555418705773626
rmse = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['linear_regression_tp'],
)
rmse

47.5009732912099

In [12]:
# KNN model.
# Values noted from earlier runs on X_test_selected:
#   with stand: 41.25390056831065
#   with norm:  45.50425826121668
rmse_knn = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['KNN'],
)
rmse_knn

42.7422616742308

In [13]:
# Linear regression, 'linear_regression' variant.
# Values noted from earlier runs on X_test_selected:
#   with stand: 45.813509580242275
#   with norm:  46.55541870577369
rmse_lin_reg = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['linear_regression'],
)
rmse_lin_reg

47.5009732912099

In [None]:
# 'tree' model - noted as very time expensive.
# Values noted from earlier runs on X_test_selected:
#   with stand: 106.71974709600715
#   with norm:  108.58110978903885
rmse_tree = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['tree'],
)
rmse_tree

In [None]:
# 'random_forest' model.
# Values noted from earlier runs on X_test_selected:
#   with stand: 102.87238489221627
#   with norm:  105.65649031239074
rmse_rf = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['random_forest'],
)
rmse_rf

In [None]:
# 'MLperceptron' model.
# Values noted from earlier runs on X_test_selected:
#   with stand: 107.77550719298902
#   with norm:  108.22558752681337
rmse_mlp = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['MLperceptron'],
)
rmse_mlp

In [None]:
# 'SVM' model.
# Values noted from earlier runs on X_test_selected:
#   with stand: 101.41083882437974
#   with norm:  103.97616339884539
rmse_svm = build_model(
    X_selected,
    Y_train_valid,
    X_test_selected,
    Y_test,
    model=['SVM'],
)
rmse_svm