In [None]:
###
# App Name:  Creating fake data of 50 projects
# App URI: https://DataDrivenConstruction.io/
# Description: Creating fake data of 50 projects, with data based on the 
# first four files, to show the possibility of creating a model for predicting 
# project prices, which will be based on the company's 50 projects
# DataDrivenConstruction
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
###

In [24]:
#loading nessary libs
import pandas as pd
from sklearn import preprocessing
import random
import json
import numpy as np

In [25]:
#define constants
THRESHOLD = .7      #minimal level of  NAN-values in thr field. If it less than THRESHOLD this field will be remove from dataset 
NUM_OF_FILES = 50   #quantity of output files
NUM_OF_ROWS = 269   #quantity of rows in generated file
CAT_PURE_OUT = False #True - categorical data saving out as pure, False - as encoded  

In [26]:
#import source file
df = pd.read_csv('source.csv', low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Columns: 138 entries, feature_1 to Interior Type
dtypes: float64(65), int64(1), object(72)
memory usage: 290.1+ KB


In [27]:
#sample of source file
df.sample(6)

Unnamed: 0,feature_1,Design Option,Category,Family,Type,Horizontal Profile Offset,Vertical Profile Offset,Length,Family and Type,Type Id,...,Number,Automatically Embed,Curtain Panel,Join Condition,Adjust for Mullion Size,Layout,Spacing,Border 1 Type,Border 2 Type,Interior Type
3,10445184,,OST_Fascia,Fascia,Fascia,0.0,-200.0,127094.0,Fascia,Fascia,...,,,,,,,,,,
89,10428925,,OST_CurtainWallMullions,38x76mm,38x76mm,,,54.0,38x76mm,38x76mm,...,,,,,,,,,,
187,10885923,,OST_Windows,WK - Three Panel Window 600mm,WK - Three Panel Window 600mm,,,,WK - Three Panel Window 600mm,WK - Three Panel Window 600mm,...,,,,,,,,,,
259,10899334,,OST_Walls,plaster,plaster,,,4851.0,plaster,plaster,...,,,,,,,,,,
247,10348085,,OST_Walls,Mesh Fence,Mesh Fence,,,2184.0,Mesh Fence,Mesh Fence,...,1.0,No,Fencing,0.0,Yes,1.0,1800.0,38x76mm,38x76mm,
31,10446357,,OST_PlumbingFixtures,CAMPINES BASIN 540 x 405,CAMPINES BASIN 540 x 405,,,,CAMPINES BASIN 540 x 405,CAMPINES BASIN 540 x 405,...,,,,,,,,,,


In [28]:
#for digital fields replacing NAN to mean value
for clm in df.columns:
    if df[clm].dtypes =='float64':
        df[clm] = df[clm].fillna(df[clm].mean())    

In [29]:
#checking dataset condition
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Columns: 138 entries, feature_1 to Interior Type
dtypes: float64(65), int64(1), object(72)
memory usage: 290.1+ KB


In [30]:
#removing fields with quntity of NAN, more than threshold
drop_thresh = df.shape[0] * THRESHOLD
df = df.dropna(thresh=drop_thresh, how='all', axis='columns')

In [31]:
#replacing missing value to random value for categorical fields
for clm in [clm for clm in df.columns if df[clm].dtypes == 'O']:
    if df[clm].isna().sum() > 0:
        print(clm)
        df[clm] = df[clm].fillna(random.choice(df[clm])) 

Family
Type
Family and Type
Type Id
Phase Created
Phase Demolished
Type Name
Family Name
Volume
Area


In [32]:
#Making lists of true categorical and single-value fields.Removing single-value fields.
true_categorical = [clm for clm in df.columns if df[clm].dtypes == 'O' and len(df[clm].unique()) > 1]
true_categorical.append('feature_1')
single_value =  [clm for clm in df.columns if len(df[clm].unique()) == 1]
df = df.drop(single_value, axis=1)

In [33]:
#checking dataset condition
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   feature_1                   269 non-null    int64  
 1   Category                    269 non-null    object 
 2   Family                      269 non-null    object 
 3   Type                        269 non-null    object 
 4   Length                      269 non-null    float64
 5   Family and Type             269 non-null    object 
 6   Type Id                     269 non-null    object 
 7   Type Name                   269 non-null    object 
 8   Family Name                 265 non-null    object 
 9   Base Offset                 269 non-null    float64
 10  Top Offset                  269 non-null    float64
 11  Volume                      269 non-null    object 
 12  Structural Material         269 non-null    float64
 13  Area                        269 non

In [34]:
#define Label encoder, making encoder values maping.
le = preprocessing.LabelEncoder()
le_maping = {}
for clm in true_categorical:
    df[clm] = le.fit_transform(df[clm])
    le_maping[clm] = dict(zip(le.transform(le.classes_), le.classes_))

In [35]:
#Saving maping to the file
np.save('le_maping.npy', le_maping) 

#For decode mapind from file use this constraction
#le_maping = np.load('le_maping.npy',allow_pickle='TRUE').item()


In [36]:
#checking dataset condition
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   feature_1                   269 non-null    int64  
 1   Category                    269 non-null    int32  
 2   Family                      269 non-null    int32  
 3   Type                        269 non-null    int32  
 4   Length                      269 non-null    float64
 5   Family and Type             269 non-null    int32  
 6   Type Id                     269 non-null    int32  
 7   Type Name                   269 non-null    int32  
 8   Family Name                 269 non-null    int32  
 9   Base Offset                 269 non-null    float64
 10  Top Offset                  269 non-null    float64
 11  Volume                      269 non-null    int32  
 12  Structural Material         269 non-null    float64
 13  Area                        269 non

In [41]:
df['price'] = random.uniform(115000, 370000)

In [42]:
#sample of target dataset
df.sample(11)

Unnamed: 0,feature_1,Category,Family,Type,Length,Family and Type,Type Id,Type Name,Family Name,Base Offset,...,Coarse Scale Fill Color,Height Offset From Level,Perimeter,Sill Height,Head Height,Wall Closure,Panel Width,Location Line,Number,price
217,123,17,5,5,6604.0,5,5,5,3,-600.0,...,0.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,2.0,2.75,150175.989714
243,183,17,1,1,530.0,1,1,1,3,0.0,...,0.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,2.0,2.75,150175.989714
93,191,2,6,6,1715.0,6,6,6,24,36.206897,...,197379.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,1.888889,2.75,150175.989714
171,117,4,22,22,3553.267442,22,22,22,21,36.206897,...,197379.0,431.666667,38455.333333,900.0,3050.0,4.0,887.888889,1.888889,2.75,150175.989714
25,215,16,13,13,5600.0,13,13,13,35,36.206897,...,197379.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,1.888889,2.75,150175.989714
120,187,3,26,26,3553.267442,26,26,26,31,36.206897,...,197379.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,1.888889,2.75,150175.989714
54,42,2,6,6,1571.0,6,6,6,24,36.206897,...,197379.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,1.888889,2.75,150175.989714
147,130,11,52,52,3553.267442,52,52,52,2,36.206897,...,0.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,1.888889,2.75,150175.989714
200,15,17,5,5,1900.0,5,5,5,3,-1200.0,...,0.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,2.0,2.75,150175.989714
225,20,17,1,1,3550.0,1,1,1,3,0.0,...,0.0,431.666667,38455.333333,429.62963,1953.148148,0.153846,887.888889,2.0,2.75,150175.989714


In [43]:
#saving taget dataset to file
df.to_csv('0_1house_R.csv', index=False)

In [49]:
#Generating NUM_OF_FILES files, replacing categorical values to random choise from field, real values to random values from appropriate range
for i in range(NUM_OF_FILES):
    print('Preparing files:', i)
    tmp_df = pd.DataFrame()
    price = random.uniform(115000, 370000)
    for clm in df.columns:
        min_bound, max_baound = df[clm].min(), df[clm].max()
        if clm in true_categorical:
            tmp_df[clm] = random.sample(df[clm].tolist(), NUM_OF_ROWS)
            if CAT_PURE_OUT:
                tmp_df = tmp_df.replace({clm:le_maping[clm]})
        else:
            tmp_df[clm] = [random.uniform(min_bound, max_baound) for val in range (0, NUM_OF_ROWS)]                 
    tmp_df['price'] = NUM_OF_ROWS * [price]    
    tmp_df.to_csv(str(i + 1) + '_1house_R.csv', index=False)
    print('Saving files:', i, ' - OK')
    print()

Preparing files: 0
Saving files: 0  - OK

Preparing files: 1
Saving files: 1  - OK

Preparing files: 2
Saving files: 2  - OK

Preparing files: 3
Saving files: 3  - OK

Preparing files: 4
Saving files: 4  - OK

Preparing files: 5
Saving files: 5  - OK

Preparing files: 6
Saving files: 6  - OK

Preparing files: 7
Saving files: 7  - OK

Preparing files: 8
Saving files: 8  - OK

Preparing files: 9
Saving files: 9  - OK

Preparing files: 10
Saving files: 10  - OK

Preparing files: 11
Saving files: 11  - OK

Preparing files: 12
Saving files: 12  - OK

Preparing files: 13
Saving files: 13  - OK

Preparing files: 14
Saving files: 14  - OK

Preparing files: 15
Saving files: 15  - OK

Preparing files: 16
Saving files: 16  - OK

Preparing files: 17
Saving files: 17  - OK

Preparing files: 18
Saving files: 18  - OK

Preparing files: 19
Saving files: 19  - OK

Preparing files: 20
Saving files: 20  - OK

Preparing files: 21
Saving files: 21  - OK

Preparing files: 22
Saving files: 22  - OK

Preparin

In [47]:
[random.uniform(115000, 370000)]

[271032.7861785048]