In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import random
import tensorflow.keras.utils as utils
import pydot
from tensorflow.keras import regularizers
from tensorflow.keras import layers
from sklearn.ensemble import RandomForestRegressor
class Mics_Model:
    """Experiment runner for a multi-input neural network over grouped features.

    The feature columns of a CSV dataset are randomly partitioned into
    ``group_number`` groups; each group becomes one input of a Keras model.
    Within every train/validation/test split the rows are additionally cut into
    ``company_number`` contiguous chunks ("companies") that are standardized
    independently and tagged with a one-hot company id.

    ASSUMPTION (from the original author): in the CSV, column 0 is the index,
    column 1 holds the labels and all remaining columns are features.
    """

    def __init__(self, dataset_dir, use_encoder=True, sampling_method="Vanilla", global_model="NN", group_number = 2, company_number = 1):
        """Store the experiment configuration; no data is read here.

        Args:
            dataset_dir: Path of the CSV file passed to ``pd.read_csv``.
            use_encoder: When truthy, every input group is passed through its
                own encoder sub-model before the groups are concatenated.
            sampling_method: ``"Vanilla"`` or ``"Variational"``; selects the
                encoder type (only consulted when ``use_encoder`` is truthy).
            global_model: Stored on the instance; not read anywhere in this class.
            group_number: Number of feature groups (model inputs).
            company_number: Number of row chunks standardized independently.
        """
        self.dataset_dir = dataset_dir
        self.use_encoder = use_encoder
        self.sampling_method = sampling_method
        self.global_model = global_model
        self.group_number = group_number
        self.company_number = company_number
        self.raw_data = None

    def get_raw_data(self, index_col=0):
        """Load the CSV, impute missing values and shuffle the rows.

        Args:
            index_col: Column used as the DataFrame index (forwarded to
                ``pd.read_csv``). The original code accepted this argument but
                ignored it and always used column 0.

        Side effects:
            Sets ``self.raw_data`` to the loaded frame. Missing values are
            replaced by the column mean and rows are shuffled with a fixed
            ``random_state`` of 41.
        """
        raw_data = pd.read_csv(self.dataset_dir, index_col=index_col)
        # numeric_only=True keeps this working when the frame contains
        # non-numeric columns (recent pandas raises instead of skipping them).
        raw_data = raw_data.fillna(raw_data.mean(numeric_only=True))
        raw_data = raw_data.sample(frac=1, random_state=41)
        self.raw_data = raw_data

    def _ensure_raw_data(self):
        """Return ``self.raw_data``, loading it with default settings if needed."""
        if self.raw_data is None:
            self.get_raw_data()
        return self.raw_data

    def get_input_group_lenthgs(self):
        """Return the number of features assigned to each group.

        The feature count is the number of columns minus one (the label
        column). Sizes are produced greedily: each group takes
        ``round(remaining_features / remaining_groups)`` features.
        Example: 28 columns -> 27 features, 4 groups -> ``[7, 7, 6, 7]``.

        The result can be passed to ``get_nn_model`` as ``inp_sizes``.
        """
        remaining = len(self._ensure_raw_data().columns) - 1
        input_sizes = []
        for groups_left in range(self.group_number, 0, -1):
            group_size = round(remaining / groups_left)
            input_sizes.append(group_size)
            remaining -= group_size
        return input_sizes

    def get_grouped_feature_cols(self, random_seed=None):
        """Randomly partition the feature column positions into groups.

        Args:
            random_seed: Optional seed for a private random generator, making
                the partition reproducible. With ``None`` (the default) the
                module-level ``random`` state is used, as before.

        Returns:
            A list of lists of 0-based feature positions (positions within the
            feature columns, i.e. excluding the label column), for example
            ``[[1, 4, 5], [2, 3, 6]]``. Group sizes follow
            ``get_input_group_lenthgs``.
        """
        rng = random if random_seed is None else random.Random(random_seed)
        feature_num = len(self._ensure_raw_data().columns) - 1
        remaining = list(range(feature_num))
        grouped_feature_cols = []
        for size in self.get_input_group_lenthgs():
            chosen = rng.sample(remaining, size)
            grouped_feature_cols.append(chosen)
            # Drop the chosen positions while keeping the order of the rest.
            chosen_set = set(chosen)
            remaining = [col for col in remaining if col not in chosen_set]
        return grouped_feature_cols

    def get_features_and_labels(self, groups, random_seed=41):
        """Split the data into train/validation/test and build per-group arrays.

        Rows are split positionally 70% / 15% / 15% (the frame was already
        shuffled by ``get_raw_data``). Column 0 of the frame is the label, the
        remaining columns are features.

        Args:
            groups: List of lists of feature positions, as returned by
                ``get_grouped_feature_cols``.
            random_seed: Forwarded to ``transform_dataset``.

        Returns:
            ``[[train_x1, ..., train_xn, train_y],
               [val_x1,   ..., val_xn,   val_y],
               [test_x1,  ..., test_xn,  test_y]]`` as NumPy arrays.

        Note:
            ``transform_dataset`` is called separately on every split, so each
            split is standardized with its own statistics.
        """
        raw_data = self._ensure_raw_data()
        row_num = len(raw_data.index)
        train_end = int(0.7 * row_num)
        val_end = int(0.85 * row_num)
        split_rows = (slice(None, train_end), slice(train_end, val_end), slice(val_end, None))

        features_and_labels = []
        for rows in split_rows:
            split_x = raw_data.iloc[rows, 1:]
            split_arrays = [
                self.transform_dataset(df=split_x.iloc[:, group], random_seed=random_seed).values
                for group in groups
            ]
            # Labels always go last; callers slice with [:-1] / [-1].
            split_arrays.append(raw_data.iloc[rows, 0].values)
            features_and_labels.append(split_arrays)
        return features_and_labels

    def transform_dataset(self, df, random_seed=41):
        """Standardize a feature frame per company and append one-hot company ids.

        The rows of ``df`` are cut into ``company_number`` contiguous chunks.
        Each chunk has its columns shuffled, is standardized with a freshly
        fitted ``StandardScaler`` and gets its chunk number as company id.
        There is no label column here; all columns of ``df`` are features.

        Args:
            df: Feature-only DataFrame.
            random_seed: ``random_state`` for the per-chunk column shuffle.
                NOTE(review): the same seed is used for every chunk, so
                presumably all chunks receive the same column order; confirm
                that this is intended.

        Returns:
            DataFrame with the scaled features in columns ``0..n-1`` followed by
            the one-hot columns ``group_0 .. group_{company_number-1}``.

        Raises:
            ValueError: If ``company_number`` is smaller than 1 or larger than
                the number of rows (some company would receive no rows).
        """
        company_num = self.company_number
        row_num = len(df.index)
        if company_num < 1:
            raise ValueError("company_number must be at least 1, got " + str(company_num))
        if row_num < company_num:
            raise ValueError(
                "cannot split " + str(row_num) + " rows into " + str(company_num) + " companies"
            )

        scaler = StandardScaler()
        scaled_chunks = []
        company_ids = []
        for i in range(company_num):
            # Integer arithmetic: the float form int(i / n * rows) can round an
            # exact boundary down (e.g. 1 / 49 * 49 < 1) and produce an empty chunk.
            start = i * row_num // company_num
            stop = (i + 1) * row_num // company_num
            chunk = df.iloc[start:stop, :].sample(frac=1, axis=1, random_state=random_seed)
            scaled = scaler.fit_transform(chunk)
            scaled_chunks.append(
                pd.DataFrame(scaled, columns=range(scaled.shape[1]), index=chunk.index)
            )
            company_ids.append(pd.Series(i, index=chunk.index))

        features = pd.concat(scaled_chunks, axis=0)
        # dtype=float keeps `.values` a numeric array; recent pandas would
        # otherwise emit bool columns and the combined array would be object-typed.
        company_onehot = pd.get_dummies(pd.concat(company_ids, axis=0), prefix='group', dtype=float)
        return pd.concat([features, company_onehot], axis=1)

    def get_vanilla_encoder_model(self, inp_size):
        """Build a deterministic encoder for one feature group.

        Input width is ``inp_size + company_number`` (features plus one-hot
        company id); one hidden ReLU layer of 10 units; output width is
        ``inp_size`` with ReLU activation.

        The original code created a second, identical hidden layer on the same
        input and discarded the first one; the dead layer has been removed, the
        resulting model is unchanged.
        """
        inputs = keras.layers.Input(shape=(inp_size + self.company_number,))
        hidden = keras.layers.Dense(10, activation="relu")(inputs)
        outputs = keras.layers.Dense(inp_size, activation="relu")(hidden)
        return keras.Model(inputs, outputs)

    class Sampling(layers.Layer):
        """Layer that samples from a Gaussian given its mean and log-variance."""

        def call(self, inputs):
            """Return ``z_mean + exp(0.5 * z_log_var) * epsilon`` with standard-normal ``epsilon``."""
            z_mean, z_log_var = inputs
            batch = tf.shape(z_mean)[0]
            dim = tf.shape(z_mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            # exp(0.5 * log_var) is the standard deviation.
            return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def get_variatonal_encoder_model(self, inp_size):
        """Build a variational encoder for one feature group.

        Input width is ``inp_size + company_number``; one hidden ReLU layer of
        10 units feeds two linear heads (``z_mean`` and ``z_log_var``, both of
        width ``inp_size``) whose sample is the model output.
        """
        inputs = layers.Input(shape=(inp_size + self.company_number,))
        hidden = layers.Dense(10, activation="relu")(inputs)
        z_mean = layers.Dense(inp_size, name="z_mean")(hidden)
        z_log_var = layers.Dense(inp_size, name="z_log_var")(hidden)
        outputs = self.Sampling()([z_mean, z_log_var])
        return keras.Model(inputs, outputs)
    # New sampling methods can be added here (and registered in get_nn_model).

    def get_nn_model(self, inp_sizes, drop_out=0.25, hidden_num = 4, hidden_size=32, activation="relu"):
        """Build the multi-input model.

        Args:
            inp_sizes: Feature count per group (see ``get_input_group_lenthgs``);
                every input is ``size + company_number`` wide.
            drop_out: Dropout rate applied after every hidden layer.
            hidden_num: Number of hidden blocks added after the first one, so
                the network has ``hidden_num + 1`` Dense+Dropout blocks.
            hidden_size: Units per hidden layer.
            activation: Activation of the single-unit output layer (hidden
                layers always use ReLU).

        Returns:
            An uncompiled ``keras.Model`` with inputs named ``input_0``,
            ``input_1``, ...

        Raises:
            ValueError: If encoders are enabled and ``sampling_method`` is not
                ``"Vanilla"`` or ``"Variational"``.
        """
        inputs = [
            keras.layers.Input(shape=(size + self.company_number,), name="input_" + str(i))
            for i, size in enumerate(inp_sizes)
        ]

        if self.use_encoder:
            # Extend this table when new sampling methods are added.
            encoder_builders = {
                "Vanilla": self.get_vanilla_encoder_model,
                "Variational": self.get_variatonal_encoder_model,
            }
            if self.sampling_method not in encoder_builders:
                raise ValueError(
                    "unknown sampling_method " + repr(self.sampling_method)
                    + "; expected one of " + str(sorted(encoder_builders))
                )
            build_encoder = encoder_builders[self.sampling_method]
            branches = [build_encoder(size)(inp) for size, inp in zip(inp_sizes, inputs)]
        else:
            branches = inputs

        # A single group needs no concatenation.
        if len(branches) == 1:
            global_input = branches[0]
        else:
            global_input = keras.layers.concatenate(branches)

        h = global_input
        for _ in range(hidden_num + 1):
            h = keras.layers.Dense(
                hidden_size,
                activation="relu",
                kernel_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-3),
            )(h)
            h = keras.layers.Dropout(drop_out)(h)

        outputs = keras.layers.Dense(1, activation=activation)(h)
        return keras.Model(inputs=inputs, outputs=outputs)

    def _prepare_experiment(self):
        """Return ``(inp_sizes, features_and_labels)`` for a fresh random grouping."""
        inp_sizes = self.get_input_group_lenthgs()
        groups = self.get_grouped_feature_cols()
        return inp_sizes, self.get_features_and_labels(groups)

    @staticmethod
    def _training_callbacks():
        """Return the early-stopping and learning-rate callbacks shared by the experiments."""
        return [
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50),
            keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.8, patience=30,
                                              verbose=2, mode="auto", min_lr=1e-6),
        ]

    def default_exp(self, batch_size = 300):
        """Train the default regression model (MSE loss) and report the best validation loss.

        Args:
            batch_size: Mini-batch size passed to ``fit``.

        Returns:
            The lowest validation loss seen during training (also printed).
            The original version only printed it.
        """
        inp_sizes, (train, val, _) = self._prepare_experiment()
        MICS_model = self.get_nn_model(inp_sizes=inp_sizes)
        MICS_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
                           loss=keras.losses.MeanSquaredError())
        history = MICS_model.fit(x=train[:-1], y=train[-1],
                                 validation_data=(val[:-1], val[-1]),
                                 epochs=300, batch_size=batch_size,
                                 callbacks=self._training_callbacks())
        best_val_loss = float(np.min(history.history["val_loss"]))
        print(best_val_loss)
        return best_val_loss

    def default_exp_house(self, batch_size = 32):
        """Train the binary-classification variant and evaluate it on the test split.

        The model uses a sigmoid output and binary cross-entropy. The weights
        with the lowest validation loss are checkpointed to ``tmp/checkpoint``
        and restored before the test evaluation.

        Args:
            batch_size: Mini-batch size passed to ``fit``.

        Returns:
            Test accuracy of the restored best-validation-loss weights (also printed).
        """
        inp_sizes, (train, val, test) = self._prepare_experiment()
        MICS_model = self.get_nn_model(inp_sizes=inp_sizes, activation="sigmoid")
        checkpoint_filepath = 'tmp/checkpoint'
        callback = self._training_callbacks() + [
            keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                            save_weights_only=True,
                                            monitor='val_loss',
                                            mode='min',
                                            save_best_only=True),
        ]

        MICS_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
                           loss=keras.losses.BinaryCrossentropy(), metrics=["accuracy"])
        MICS_model.fit(x=train[:-1], y=train[-1],
                       validation_data=(val[:-1], val[-1]),
                       epochs=300, batch_size=batch_size, callbacks=callback)

        MICS_model.load_weights(checkpoint_filepath)
        # evaluate() returns [loss, accuracy]; keep the accuracy.
        result = MICS_model.evaluate(x=test[:-1], y=test[-1])[1]

        print("best test accuracy is: " + str(result))
        return result
                


In [None]:
# Energy dataset: two feature groups, a single company, default (MSE) experiment.
dataset_dir = "./Datasets/energydata_use.csv"
deneyelim = Mics_Model(
    dataset_dir,
    use_encoder=True,
    group_number=2,
    company_number=1,
)
deneyelim.get_raw_data()
deneyelim.default_exp()

In [None]:
# House-price dataset: ten feature groups, fifty companies, classification experiment.
dataset_dir2 = "./Datasets/houseprices_ready.csv"
deneyelim2 = Mics_Model(
    dataset_dir2,
    use_encoder=True,
    group_number=10,
    company_number=50,
)
deneyelim2.get_raw_data()
# `a` keeps the value returned by default_exp_house (the test accuracy).
a = deneyelim2.default_exp_house(batch_size=32)

In [45]:
# Grid of configurations to evaluate: feature-group counts x company counts.
group_numbers = [2, 4, 6, 8, 10]
company_numbers = [1, 2, 4, 7, 10, 20, 40]

# result_matrix[g][c] collects the scores of every run for
# (group_numbers[g], company_numbers[c]); each cell is its own empty list.
result_matrix = [[list() for _company in company_numbers] for _group in group_numbers]

In [None]:
# Run every (group count, company count) configuration five times and
# collect the returned test accuracies in result_matrix.
dataset_dir2 = "./Datasets/houseprices_ready.csv"
for group_index, group_count in enumerate(group_numbers):
    for company_index, company_count in enumerate(company_numbers):
        for loop in range(5):
            deneyelim2 = Mics_Model(
                dataset_dir2,
                use_encoder=True,
                group_number=group_count,
                company_number=company_count,
            )
            deneyelim2.get_raw_data()
            score = deneyelim2.default_exp_house(batch_size=32)
            result_matrix[group_index][company_index].append(score)

In [None]:
import copy

# Snapshot the collected scores so later cells work on an independent copy
# that further appends to result_matrix cannot change.
result_matrix_copy = copy.deepcopy(result_matrix)
result_matrix_copy

In [49]:
import statistics

In [51]:
# avg_std_matrix[g][c] becomes [mean, sample standard deviation] of the scores
# stored for (group_numbers[g], company_numbers[c]).
avg_std_matrix = [[list() for _company in company_numbers] for _group in group_numbers]
for group_index in range(len(group_numbers)):
    for company_index in range(len(company_numbers)):
        current_list = result_matrix_copy[group_index][company_index]
        summary_cell = avg_std_matrix[group_index][company_index]
        summary_cell.append(statistics.mean(current_list))
        summary_cell.append(statistics.stdev(current_list))
        
        

In [55]:
# Tabulate [mean, std] per configuration: rows are group counts, columns are company counts.
pd.DataFrame(
    avg_std_matrix,
    index=group_numbers,
    columns=company_numbers,
)

Unnamed: 0,1,2,4,7,10,20,40
2,"[0.901369845867157, 0.008901180200105482]","[0.9013698697090149, 0.01000404993961545]","[0.8968036532402038, 0.002501039350504474]","[0.8904109597206116, 0.012081049554110651]","[0.8757990837097168, 0.011819354129907245]","[0.8657534122467041, 0.024926657367662853]","[0.8347031950950623, 0.010412549991255569]"
4,"[0.8995433688163758, 0.01070870269931029]","[0.9095890402793885, 0.006772776253837231]","[0.8949771761894226, 0.018826970403172494]","[0.8812785387039185, 0.009132415056238363]","[0.8876712322235107, 0.014654610225575277]","[0.8757990837097168, 0.01780236880875643]","[0.8365296721458435, 0.016901132849862725]"
6,"[0.8986301422119141, 0.012670721870541872]","[0.9059360742568969, 0.016014328872232364]","[0.8986301302909852, 0.014221375875773275]","[0.877625572681427, 0.010901684161528903]","[0.8721461176872254, 0.018826959560969136]","[0.8602739691734314, 0.01190721358284922]","[0.8182648420333862, 0.01041255129817017]"
8,"[0.9132420063018799, 0.0055924453351836115]","[0.8885844826698304, 0.01190721358284922]","[0.8794520616531372, 0.011907223868657016]","[0.8931506752967835, 0.006925013290902036]","[0.8721461296081543, 0.028148012965071873]","[0.8648401856422424, 0.01665258945231131]","[0.8228310346603394, 0.01690114895334232]"
10,"[0.8995433807373047, 0.007219812741103662]","[0.8995433807373047, 0.007219812741103662]","[0.8867579817771911, 0.015618848947038588]","[0.8767123222351074, 0.021172657396003998]","[0.8803653120994568, 0.052202777648679415]","[0.8584474921226501, 0.01677731978310403]","[0.8191780805587768, 0.013545567577167085]"
