In [54]:
import random
from copy import deepcopy

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection
from tensorflow import keras

In [55]:
# Path of the prepared CSV; its first column is used as the row index.
dataset_dir = "./Datasets/houseprices_ready.csv"
# Load, fill missing values with the per-column mean, then shuffle every row.
# No random_state is passed to sample(), so the row order differs between runs.
df = (
    pd.read_csv(dataset_dir, index_col=0)
    .pipe(lambda frame: frame.fillna(frame.mean()))
    .sample(frac=1)
)
# Everything after the first column is treated as a feature.
df_features = df.iloc[:, 1:]
# Independent deep copy of the feature frame.
df_features_copy = df_features.copy(deep=True)

In [59]:
# Labels: the first column of df as a one-column DataFrame on df's index.
labels = df.iloc[:, 0].to_frame()
labels

Unnamed: 0,AboveMedianPrice
490,0
834,0
1331,0
1053,0
934,1
...,...
1226,1
357,0
604,1
1063,0


In [21]:
# Number of rows and number of columns in the feature frame.
row_num, col_num = df_features.shape
col_num

10

In [22]:
def get_input_group_lenthgs(group_num, feature_num):
    """Split ``feature_num`` items into ``group_num`` group sizes.

    Each size is the rounded share of what is still unassigned divided by the
    number of groups still to fill, so the sizes always sum to ``feature_num``.

    Parameters
    ----------
    group_num : int
        Number of groups to produce.
    feature_num : int
        Total number of items to distribute.

    Returns
    -------
    list of int
        One size per group, in order; empty when ``group_num`` is not positive.
    """
    sizes = []
    remaining = feature_num
    # Count the groups left to fill downwards: group_num, ..., 2, 1.
    for groups_left in range(group_num, 0, -1):
        share = round(remaining / groups_left)
        sizes.append(share)
        remaining -= share
    return sizes

In [23]:
# Example: group sizes when 17 features are split into 3 groups.
get_input_group_lenthgs(3,17)

[6, 6, 5]

In [11]:
def get_grouped_numbers(group_num, feature_num, rng=None):
    """Randomly partition ``range(feature_num)`` into ``group_num`` groups.

    Group sizes come from ``get_input_group_lenthgs``; every integer in
    ``range(feature_num)`` ends up in exactly one group. The result is a list
    of lists of column (or row) numbers, e.g. ``[[1, 4, 5], [2, 3, 6]]``.

    The indices are shuffled once and cut into consecutive slices. This draws
    from the same distribution as sampling each group without replacement and
    removing the picked items one by one, but without the quadratic
    ``list.remove`` loop.

    Parameters
    ----------
    group_num : int
        Number of groups to build.
    feature_num : int
        Number of indices to distribute (``0 .. feature_num - 1``).
    rng : random.Random, optional
        Random generator to shuffle with. Defaults to the module-level
        ``random`` generator; pass ``random.Random(seed)`` for a
        reproducible partition.

    Returns
    -------
    list of list of int
        One list of indices per group, in group order.

    Raises
    ------
    ValueError
        If any computed group size is negative (e.g. negative ``feature_num``).
    """
    inp_sizes = get_input_group_lenthgs(group_num, feature_num)
    if any(size < 0 for size in inp_sizes):
        raise ValueError("Group sizes must be non-negative")

    shuffled_nums = list(range(feature_num))
    generator = random if rng is None else rng
    generator.shuffle(shuffled_nums)

    grouped_feature_cols = []
    start = 0
    for size in inp_sizes:
        # Consecutive slices of a shuffled list are disjoint random groups.
        grouped_feature_cols.append(shuffled_nums[start:start + size])
        start += size
    return grouped_feature_cols

In [12]:
# Example: a random partition of the indices 0..16 into 3 groups.
get_grouped_numbers(3, 17)

[[6, 7, 5, 2, 9, 1], [3, 8, 15, 4, 10, 11], [16, 13, 12, 14, 0]]

In [48]:
# Build the partitioned, scaled and randomly projected feature table.
#
# Columns are split into `group_num` random column groups. For each column
# group the rows are split again into `group_num` random row groups, tagged
# "company" j. Every (column group, company) cell is standardised and passed
# through a Gaussian random projection on its own, then the pieces are stitched
# back together with one-hot company indicators.
# NOTE(review): neither the partitions nor GaussianRandomProjection receive a
# seed / random_state here, so the result differs between runs.
group_num = 3
group_lengths = get_input_group_lenthgs(group_num, col_num)
group_columns = get_grouped_numbers(group_num, col_num)
# One independent random row partition per column group.
group_index_list = [get_grouped_numbers(group_num, row_num) for _ in range(group_num)]
# Not used in this cell; kept defined in case other cells reference it.
df_list_of_col_groups = [None]*group_num
col_groups = [None]*group_num
for i, group_column in enumerate(group_columns):
    temp_col_group = []
    for j, group_row in enumerate(group_index_list[i]):
        # Positional selection of this company's rows and this group's columns.
        temp_row_group = df_features.iloc[group_row, group_column]
        scaler = StandardScaler()
        scaled_row_group = pd.DataFrame(
            scaler.fit_transform(temp_row_group),
            columns=temp_row_group.columns,
            index=temp_row_group.index,
        )
        # The projection keeps the dimensionality (n_components equals the
        # group's column count). The original column names are reused even
        # though each output column is now a random mix of the inputs.
        transformer = GaussianRandomProjection(n_components=group_lengths[i])
        transformed_row_group = pd.DataFrame(
            transformer.fit_transform(scaled_row_group),
            columns=scaled_row_group.columns,
            index=scaled_row_group.index,
        )
        transformed_row_group["company"] = j
        temp_col_group.append(transformed_row_group)
    temp_col_group_df = pd.concat(temp_col_group, axis=0)
    # Pin the dtype: pandas >= 2.0 would otherwise emit bool indicator columns
    # instead of numeric 0/1 values.
    company_dummies = pd.get_dummies(
        temp_col_group_df["company"],
        prefix='group_{}_company'.format(i),
        dtype=np.uint8,
    )
    # Drop the helper column by name rather than by position.
    temp_col_group_df = pd.concat(
        [temp_col_group_df.drop(columns="company"), company_dummies], axis=1
    )
    col_groups[i] = temp_col_group_df
# Column-wise concat aligns the differently ordered row groups on the index.
final_df_features = pd.concat(col_groups, axis=1)
final_df = pd.concat([final_df_features, labels], axis=1)

In [49]:
# Inspect column group 0: projected features plus its one-hot company columns.
col_groups[0]

Unnamed: 0,HalfBath,LotArea,GarageArea,group_0_company_0,group_0_company_1,group_0_company_2
699,0.397754,-0.019291,0.187377,1,0,0
1133,-0.569779,-0.104484,-1.041596,1,0,0
184,0.837758,-0.049794,0.241101,1,0,0
479,1.379488,-0.152575,-0.070984,1,0,0
486,-0.235937,-0.150988,-1.136298,1,0,0
...,...,...,...,...,...,...
229,-0.551361,-0.820717,-0.069416,0,0,1
483,-0.434755,-0.793467,-0.089334,0,0,1
1335,0.013230,-0.011705,-0.263124,0,0,1
23,-0.462892,-0.470672,-0.131843,0,0,1


In [50]:
# Inspect column group 1: projected features plus its one-hot company columns.
col_groups[1]

Unnamed: 0,TotalBsmtSF,TotRmsAbvGrd,OverallCond,BedroomAbvGr,group_1_company_0,group_1_company_1,group_1_company_2
209,-0.398053,-1.109696,0.522870,0.287041,1,0,0
1457,-0.774931,-3.114580,2.287585,-0.046031,1,0,0
81,0.243555,0.804066,-1.379757,0.041949,1,0,0
398,0.885810,2.945895,-1.842818,-0.344249,1,0,0
1383,-0.619847,-1.584113,1.201281,0.209426,1,0,0
...,...,...,...,...,...,...,...
9,-0.138545,1.422282,-0.578149,-0.509883,0,0,1
273,0.283061,0.120376,0.000133,-0.073814,0,0,1
411,-0.714329,0.648375,0.091560,0.022968,0,0,1
841,1.326684,-1.014326,-0.219309,-1.950947,0,0,1


In [51]:
# Inspect column group 2: projected features plus its one-hot company columns.
col_groups[2]

Unnamed: 0,FullBath,OverallQual,Fireplaces,group_2_company_0,group_2_company_1,group_2_company_2
447,-0.174759,-0.165708,0.064849,1,0,0
1380,1.074370,0.296477,-0.103763,1,0,0
844,-0.875268,0.516553,-0.312136,1,0,0
1126,-0.174759,-0.165708,0.064849,1,0,0
188,-3.525414,1.418889,-0.897494,1,0,0
...,...,...,...,...,...,...
134,-0.856152,0.400784,-0.111199,0,0,1
700,-0.088222,0.012664,-0.039917,0,0,1
1418,0.560439,-0.141656,-0.048293,0,0,1
627,-0.638201,-0.769354,1.562105,0,0,1


In [61]:
# Join the column groups side by side; rows are aligned on the index.
final_df_features = pd.concat(col_groups, axis="columns")
final_df_features

Unnamed: 0,HalfBath,LotArea,GarageArea,group_0_company_0,group_0_company_1,group_0_company_2,TotalBsmtSF,TotRmsAbvGrd,OverallCond,BedroomAbvGr,group_1_company_0,group_1_company_1,group_1_company_2,FullBath,OverallQual,Fireplaces,group_2_company_0,group_2_company_1,group_2_company_2
0,-0.111033,-0.192690,-1.312654,1,0,0,0.030070,0.043675,0.494752,-0.250460,1,0,0,0.456245,0.017358,0.441808,0,1,0
1,1.369236,-0.082403,-0.590148,0,1,0,1.493019,0.051247,-0.300425,-1.319645,0,0,1,0.180719,-0.096771,0.035205,1,0,0
2,0.401681,-0.238836,-1.311531,1,0,0,0.017041,0.439585,0.101512,0.177416,1,0,0,-0.344199,0.142038,-0.063678,0,0,1
3,1.356742,-0.105230,0.191637,1,0,0,-0.099956,0.092069,0.558773,0.037226,1,0,0,-0.311790,-0.015699,0.388430,0,1,0
4,1.995278,-0.451044,-1.706883,1,0,0,0.370584,0.365817,0.779444,-0.453563,1,0,0,-0.088222,0.012664,-0.039917,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-2.190836,0.217408,0.970446,0,1,0,0.084375,0.315278,0.169786,-0.072837,1,0,0,-0.600175,0.271411,-0.087438,0,0,1
1456,0.619916,0.053501,0.725907,1,0,0,-0.598896,0.244835,0.600633,0.439251,0,1,0,-1.057963,0.261178,-0.232457,1,0,0
1457,-1.153403,0.280175,1.110951,1,0,0,-0.774931,-3.114580,2.287585,-0.046031,1,0,0,-1.413441,0.192241,-0.202813,1,0,0
1458,-1.198531,0.295370,1.175409,1,0,0,-0.193236,-0.248170,-0.685313,0.233643,1,0,0,0.560439,-0.141656,-0.048293,0,0,1


In [62]:
# Append the label column to the feature table, aligned on the index.
final_df = pd.concat([final_df_features, labels], axis="columns")
final_df

Unnamed: 0,HalfBath,LotArea,GarageArea,group_0_company_0,group_0_company_1,group_0_company_2,TotalBsmtSF,TotRmsAbvGrd,OverallCond,BedroomAbvGr,group_1_company_0,group_1_company_1,group_1_company_2,FullBath,OverallQual,Fireplaces,group_2_company_0,group_2_company_1,group_2_company_2,AboveMedianPrice
0,-0.111033,-0.192690,-1.312654,1,0,0,0.030070,0.043675,0.494752,-0.250460,1,0,0,0.456245,0.017358,0.441808,0,1,0,1
1,1.369236,-0.082403,-0.590148,0,1,0,1.493019,0.051247,-0.300425,-1.319645,0,0,1,0.180719,-0.096771,0.035205,1,0,0,1
2,0.401681,-0.238836,-1.311531,1,0,0,0.017041,0.439585,0.101512,0.177416,1,0,0,-0.344199,0.142038,-0.063678,0,0,1,1
3,1.356742,-0.105230,0.191637,1,0,0,-0.099956,0.092069,0.558773,0.037226,1,0,0,-0.311790,-0.015699,0.388430,0,1,0,0
4,1.995278,-0.451044,-1.706883,1,0,0,0.370584,0.365817,0.779444,-0.453563,1,0,0,-0.088222,0.012664,-0.039917,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-2.190836,0.217408,0.970446,0,1,0,0.084375,0.315278,0.169786,-0.072837,1,0,0,-0.600175,0.271411,-0.087438,0,0,1,1
1456,0.619916,0.053501,0.725907,1,0,0,-0.598896,0.244835,0.600633,0.439251,0,1,0,-1.057963,0.261178,-0.232457,1,0,0,1
1457,-1.153403,0.280175,1.110951,1,0,0,-0.774931,-3.114580,2.287585,-0.046031,1,0,0,-1.413441,0.192241,-0.202813,1,0,0,1
1458,-1.198531,0.295370,1.175409,1,0,0,-0.193236,-0.248170,-0.685313,0.233643,1,0,0,0.560439,-0.141656,-0.048293,0,0,1,0


In [66]:
# Show the labels ordered by index, for comparison with final_df's label column.
labels.sort_index()

Unnamed: 0,AboveMedianPrice
0,1
1,1
2,1
3,0
4,1
...,...
1455,1
1456,1
1457,1
1458,0
