# Imports

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split

from visualize import *
from data_treatments import *
from feature_selection import *

from tp.competitive_learning import *
from tp.linear_model import *
from tp.rbfn import *

# Load data

In [46]:
# Load the two feature tables and the label column.
# The sizes noted next to each call are the ones recorded for this dataset.
X1 = pd.read_csv("../data/X1.csv")  # 7684 rows x 15 columns
Y1 = pd.read_csv(  # 7684 rows x 1 column
    "../data/Y1.csv",
    sep="\t",
    names=["Label"],  # explicit column name for the single label column
)
X2 = pd.read_csv("../data/X2.csv")  # 3787 rows x 15 columns

# To work on a plain NumPy array instead of a DataFrame: X1 = X1.values

# Visualize data

In [3]:
#visualize(X1, Y1, 'data.png', True) 

# Data treatments

In [47]:
# Feature-engineering pipeline, applied innermost call first:
#   1. handlecyclic    - month, day, hour and wd cyclic features
#   2. handle_station  - station feature
#   3. add_linear_time - global linear time (from year, month, day, hour)
X1_handled = add_linear_time(handle_station(handlecyclic(X1)))

print(X1_handled.shape)



(7684, 34)


# Visualize new data

In [5]:
#visualize(X1_handled, Y1, 'new_data.png', True) 

# Data Splitting

In [48]:
# Hold out 20% of the samples. random_state=0 seeds the shuffle so the same
# split is produced on every run.
X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(
    X1_handled,
    Y1,
    test_size=0.2,
    random_state=0,
)

# Report the shape of each split, one per line.
for split in (X_train_valid, X_test, Y_train_valid, Y_test):
    print(split.shape)

(6147, 34)
(1537, 34)
(6147, 1)
(1537, 1)


# Normalisation/Standardisation and feature selection

In [54]:
def _as_frame(values, columns):
    """Wrap ``values`` in a DataFrame and label it with ``columns``."""
    frame = pd.DataFrame(values)
    frame.columns = columns
    return frame


# NOTE(review): the train and test splits are each passed to norm()/stand()
# separately. If those helpers derive their statistics from the data they
# receive, the two splits end up on different scales - confirm.

# normalization
X_norm = _as_frame(norm(X_train_valid), X_train_valid.columns)
X_norm_test = _as_frame(norm(X_test), X_test.columns)

# standardisation
X_stand = _as_frame(stand(X_train_valid), X_train_valid.columns)
X_stand_test = _as_frame(stand(X_test), X_test.columns)

# feature selection
n = 7  # number of features to select
most_mi = print_mutual_information(
    X1_handled,
    X_stand.values,
    Y_train_valid.values.ravel(),
    10,
)

# Keep the final n entries of the ranking (the printed ranking lists scores in
# ascending order, so these are the highest-scoring features).
a = len(most_mi)
index_selected = most_mi[a - n:]
print(index_selected)

# Restrict the standardised train/test matrices to the selected columns.
X1 = X_stand.values[:, index_selected]
X1_test = X_stand_test.values[:, index_selected]


Mutual Information : 
mutual_information =  0.05466198033008851 	 	 8 TEMP
mutual_information =  0.055069884140506176 	 	 1 month
mutual_information =  0.05584882544631942 	 	 9 PRES
mutual_information =  0.09147456322777003 	 	 12 WSPM
mutual_information =  0.13536770942468923 	 	 33 time
mutual_information =  0.14580728482424377 	 	 10 DEWP
mutual_information =  0.1460998843001624 	 	 7 O3
mutual_information =  0.1717893831683277 	 	 4 SO2
mutual_information =  0.35357712663916807 	 	 5 NO2
mutual_information =  0.6576062979029422 	 	 6 CO
[12 33 10  7  4  5  6]


# Main Loop

In [44]:
def build_model(X_train, Y_train, X_test, Y_test, model):
    """Evaluate the model family described by ``model`` on the given data.

    Parameters
    ----------
    X_train, Y_train
        Training features and targets, forwarded unchanged to ``score_rbfn``.
    X_test, Y_test
        Held-out features and targets, forwarded unchanged to ``score_rbfn``.
    model
        Sequence whose first item names the model family:

        * ``['linear_regression']`` - not implemented yet (prints ``'todo'``).
        * ``['rbfn', n_center, smooth_f]`` - grid search over every pair of
          a number of centers from ``n_center`` and a smoothing factor from
          ``smooth_f``.

    Returns
    -------
    list or None
        For ``'rbfn'``: ``[rmses, best]``. ``rmses`` maps each number of
        centers to the list of scores returned by ``score_rbfn`` (one per
        smoothing factor, in the order given). ``best`` holds the lowest
        score under ``'rmse'`` together with the ``'n_center'`` and
        ``'smooth_f'`` that produced it; if no combination was scored it is
        just ``{'rmse': inf}``.
        ``None`` for any other value of ``model[0]``.
    """
    kind = model[0]

    if kind == 'linear_regression':
        print('todo')

    elif kind == 'rbfn':
        n_center = model[1]  # ex : [x for x in range(2,100,10)]
        smooth_f = model[2]  # ex : [0.5, 1, 2, 4, 8, 16]

        rmses = {}
        # Start from +inf so the first real score always becomes the best,
        # whatever the scale of the target.
        best = {'rmse': float('inf')}

        for nc in n_center:
            print("number of centers : ",nc)
            rmses[nc] = []

            for sf in smooth_f:
                print("smooth_f : ",sf)

                # Score on the data passed by the caller rather than on
                # notebook-level globals.
                rmse = score_rbfn(nc, sf, X_train, Y_train, X_test, Y_test)
                rmses[nc].append(rmse)

                if rmse < best['rmse']:
                    best['rmse'] = rmse
                    best['n_center'] = nc
                    best['smooth_f'] = sf
        return [rmses, best]

    else:
        print('incorrect value for model')
    
    
    

In [None]:
# build_model requires (X_train, Y_train, X_test, Y_test, model); calling it
# with no arguments raises TypeError.
# NOTE(review): the grids are the example values from build_model's own
# comments, and the labels are flattened to 1-D arrays as done for the
# mutual-information step - confirm both against what score_rbfn expects.
# NOTE(review): hyper-parameters are selected on the held-out split here;
# consider carving a separate validation split out of the training data.
rbfn_grid = ['rbfn', list(range(2, 100, 10)), [0.5, 1, 2, 4, 8, 16]]
build_model(
    X1,
    Y_train_valid.values.ravel(),
    X1_test,
    Y_test.values.ravel(),
    rbfn_grid,
)