In [2]:
import pandas as pd
import numpy as np
import shap

from sklearn.naive_bayes import GaussianNB  
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Import data
from os.path import dirname, join
from pathlib import Path
# ROOT_DIR = dirname(os.path.abspath('ARCH7211-SPRING2021')) # This is your Project Root 
ROOT_DIR = dirname(os.path.abspath('ARCH7211-SPRING2021')) # This is your Project Root
data_PATH = join(ROOT_DIR[:-16],"data","room_info.csv")  # requires `import os`
print(data_PATH)
# input data
data_raw = pd.read_csv(data_PATH)

#Cleaning data and feature engineering
exclusion = ["Undefined","UserDefined","Other","Elevated"]
merge = {
    "Sauna": "Bath",
    "Den":"LivingRoom",
    "Alcove":"LivingRoom",
    "Library": "Room",
    "RecreationRoom":"Room",
    "DressingRoom":'Closet',
    "Attic" : "Storage",
    "Office" : "Room",
    "TechnicalRoom":"Utility",
    "DraughtLobby" : "Entry",
    "Hall" : "Entry",
    "Basement" : "Storage"
}

data_raw["type_cleaned"] = data_raw["type"].apply(lambda x: merge.get(x,x))
data_raw["exclude"] = ~data_raw["type"].isin(exclusion)

data_cleaned = data_raw[~data_raw["type"].isin(exclusion)]

data_cleaned.groupby("type_cleaned",as_index=False).count().sort_values("path")

c:\Users\milad\Documents\GitHub\arch7211-spring2021\data\room_info.csv


Unnamed: 0,type_cleaned,path,type,classes,floor_index,num_sides,area,proportion_floor_area,perimeter,compactness,...,contains_Sink,contains_SpaceForAppliance,contains_Stove,contains_Toilet,contains_TumbleDryer,contains_Urinal,contains_WallCabinet,contains_WashingMachine,contains_WaterTap,exclude
2,CarPort,279,279,279,279,279,279,279,279,279,...,279,279,279,279,279,279,279,279,279,279
6,Garage,414,414,414,414,414,414,414,414,414,...,414,414,414,414,414,414,414,414,414,414
4,Dining,954,954,954,954,954,954,954,954,954,...,954,954,954,954,954,954,954,954,954,954
12,Utility,1489,1489,1489,1489,1489,1489,1489,1489,1489,...,1489,1489,1489,1489,1489,1489,1489,1489,1489,1489
11,Storage,1953,1953,1953,1953,1953,1953,1953,1953,1953,...,1953,1953,1953,1953,1953,1953,1953,1953,1953,1953
10,Room,2116,2116,2116,2116,2116,2116,2116,2116,2116,...,2116,2116,2116,2116,2116,2116,2116,2116,2116,2116
3,Closet,3041,3041,3041,3041,3041,3041,3041,3041,3041,...,3041,3041,3041,3041,3041,3041,3041,3041,3041,3041
8,LivingRoom,4428,4428,4428,4428,4428,4428,4428,4428,4428,...,4428,4428,4428,4428,4428,4428,4428,4428,4428,4428
7,Kitchen,4548,4548,4548,4548,4548,4548,4548,4548,4548,...,4548,4548,4548,4548,4548,4548,4548,4548,4548,4548
5,Entry,6050,6050,6050,6050,6050,6050,6050,6050,6050,...,6050,6050,6050,6050,6050,6050,6050,6050,6050,6050


In [4]:
# Merging two lists with unique values in return
def dMerger(df1,df2):
    container = df1
    for data in df2:
        if data not in container:
            container.append(data)
    return container 

# Check redundancy of columns in dataframes
def colUniq (dframe):
    temp = []
    for col in dframe.columns:
        if col not in temp:
            temp.append(col)
    return('num cols in df: ',len(dframe.columns),' num unique cols:',len(temp))

#Save dataset to csv
def Save_df(df,filename,dest_folder):
    if len(df) >1 and 'list' in str(type(df)):
        dSet_PATH = join(ROOT_DIR[:-16],dest_folder)
        for i,dset in enumerate(df):
            exp_path = join(dSet_PATH,filename[i])
            dset.to_csv(exp_path, index= False)
            print(filename[i]," is saved in: ",exp_path)
    elif type(df) != 'list':
        dSet_PATH = join(ROOT_DIR[:-16],dest_folder)
        exp_path = join(dSet_PATH,filename)
        df.to_csv(exp_path,index= False, columns=df.columns)
        print(filename," is saved in: ",exp_path)
    print("\nDone!") 

In [5]:
# # Dividing the raw dataset into three sub categories (tier A,B,C)
# # Tiers are: fur = furniture related features, con= space connectivity features, spc = space characteristics features
# tier_fur = ['type_cleaned']
# tier_spc = []
# tier_con = ['type_cleaned']
# for col in data_cleaned.columns:
#     if 'contains_' in col:
#         tier_fur.append(col)
#     elif '_to_' in col:
#         tier_con.append(col)
#     else:
#         tier_spc.append(col)
# tier_2a = dMerger(tier_spc,tier_con)
# tier_2b = dMerger(tier_spc,tier_fur)

# col_select = {
#     'tierA' : tier_fur,
#     'tierB' : tier_con,
#     'tierC' : tier_spc,
#     'tier_2a' : tier_2a,
#     'tier_2b' : tier_2b 
#             } 


In [6]:
#Defining Train and Target data
shuffled_data = data_cleaned.sample(frac=1)
X = shuffled_data.drop([x for x in['path','type','classes','open_to','door_to','contains','type_cleaned','exclude'] if x in shuffled_data.columns], axis =1)
y = shuffled_data['type_cleaned']
#Defining Train and Target data
#Split the data into training data, and test data 
X_train , X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, stratify=y, random_state=31)
X_train_train , X_train_val, y_train_train, y_train_val = train_test_split(X_train,y_train, test_size= 0.2, stratify=y_train, random_state=31)

In [7]:
dsets = [X_train_train,X_train_val,y_train_train,y_train_val,X_test,y_test]
filenames = ['X_train_train.csv','X_train_val.csv','y_train_train.csv','y_train_val.csv','X_test.csv','y_test.csv']
Save_df(dsets,filenames,'Train_Test_Val')

X_train_train.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\X_train_train.csv
X_train_val.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\X_train_val.csv
y_train_train.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\y_train_train.csv
y_train_val.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\y_train_val.csv
X_test.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\X_test.csv
y_test.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\y_test.csv

Done!


In [8]:
Save_df(data_cleaned,'data_cleaned.csv','Train_Test_Val')

data_cleaned.csv  is saved in:  c:\Users\milad\Documents\GitHub\arch7211-spring2021\Train_Test_Val\data_cleaned.csv

Done!


In [18]:
print('X_Train')
print(X_train_train.info())
print('\n')
print('y_Train')
print(y_train_train.unique(),len(y_train_val))
print('\n')
print('X_val')
print('Validation sets')
print(X_train_val.info())
print('\n')
print('y_val')
print(y_train_val.unique(), len(y_train_val))

X_Train
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27106 entries, 37389 to 54006
Columns: 104 entries, floor_index to contains_WaterTap
dtypes: float64(5), int64(99)
memory usage: 21.7 MB
None


y_Train
['Closet' 'Bedroom' 'Room' 'Storage' 'Outdoor' 'Kitchen' 'Dining' 'Bath'
 'LivingRoom' 'Entry' 'Utility' 'Garage' 'CarPort'] 6777


X_val
Validation sets
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6777 entries, 6233 to 39972
Columns: 104 entries, floor_index to contains_WaterTap
dtypes: float64(5), int64(99)
memory usage: 5.7 MB
None


y_val
['Outdoor' 'Dining' 'Bedroom' 'Entry' 'Bath' 'Room' 'LivingRoom' 'Kitchen'
 'Closet' 'Utility' 'Storage' 'Garage' 'CarPort'] 6777


['Closet' 'Bedroom' 'Room' 'Storage' 'Outdoor' 'Kitchen' 'Dining' 'Bath'
 'LivingRoom' 'Entry' 'Utility' 'Garage' 'CarPort']


In [12]:
print(y_train_val.unique())

['Outdoor' 'Dining' 'Bedroom' 'Entry' 'Bath' 'Room' 'LivingRoom' 'Kitchen'
 'Closet' 'Utility' 'Storage' 'Garage' 'CarPort']


In [13]:
print(X_train_val.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6777 entries, 6233 to 39972
Columns: 104 entries, floor_index to contains_WaterTap
dtypes: float64(5), int64(99)
memory usage: 5.7 MB
None
