In [34]:
import random as rd
import pandas as pd
import math
import numpy as np

from itertools import chain, combinations

In [35]:
# Read the input table from data.csv (path is relative to the working directory)
masterTable = pd.read_csv('data.csv')

In [36]:
# Display the first rows of the input table
masterTable.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Unknown 1,Unknown 2,Unknown 3
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [37]:
# Keep all rows but only the first ten columns as the attributes to work with
dataset = masterTable.iloc[: , :10]

In [38]:
# Preview the first rows of the ten-column subset
dataset.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points
0,2596,51,3,258,0,510,221,232,148,6279
1,2590,56,2,212,-6,390,220,235,151,6225
2,2804,139,9,268,65,3180,234,238,135,6121
3,2785,155,18,242,118,3090,238,238,122,6211
4,2595,45,2,153,-1,391,220,234,150,6172


In [39]:
# Generate every non-empty subset of the attributes to get all combinations
def generate_subsets(arr):
    """Return every non-empty subset of ``arr`` as a list of lists.

    Subsets are ordered by size (1 up to ``len(arr)``) and, within one size,
    in the order ``itertools.combinations`` produces them.

    Args:
        arr: Sequence of items to combine.

    Returns:
        A list with one list per subset; empty when ``arr`` is empty.
    """
    # One lazy stream of combinations per subset size, flattened in size order.
    by_size = (combinations(arr, size) for size in range(1, len(arr) + 1))
    return [list(combo) for combo in chain.from_iterable(by_size)]

In [40]:
# Random-centric: uniform random distribution over the data domain, obtained by first computing the
# column's [min, max] range and then drawing a random integer within that range

def genr_uniform_distr(col, count):
    """Draw ``count`` random integers uniformly from the column's value range.

    Args:
        col: Object exposing ``to_list()`` (e.g. a pandas Series) whose
            minimum and maximum delimit the range.
        count: Number of values to draw.

    Returns:
        A list of ``count`` integers, each between the column's minimum and
        maximum inclusive (``random.randint`` includes both endpoints).
    """
    values = col.to_list()
    lowest, highest = min(values), max(values)
    return [rd.randint(lowest, highest) for _ in range(count)]
# Data-centric: uniform random choice among the values present in the column (so the picks follow the data)
def genr_data_distr(col, count):
    """Pick ``count`` values at random from those present in the column.

    Every draw is an independent ``random.choice`` over the column's values,
    so values that occur more often in the column are picked more often.

    Args:
        col: Object exposing ``to_list()`` (e.g. a pandas Series).
        count: Number of values to pick.

    Returns:
        A list of ``count`` values taken from the column.
    """
    values = col.to_list()
    return [rd.choice(values) for _ in range(count)]


In [41]:
# Uniformly distributed width for random-centric queries
def uni_widths(col, query_centre):
    """Draw a uniformly distributed integer width around ``query_centre``.

    The width is sampled from ``[0, d)``, where ``d`` is the distance from
    the centre to the nearer of the column's minimum and maximum, and is
    then rounded up.

    Args:
        col: Object exposing ``to_list()`` (e.g. a pandas Series).
        query_centre: Centre value the width is drawn for.

    Returns:
        The sampled width rounded up to an integer; 0 when the centre equals
        the column's minimum or maximum.
    """
    values = col.to_list()
    # Distance to the nearer end of the column's range caps the width.
    nearest_edge = min(abs(query_centre - min(values)),
                       abs(query_centre - max(values)))
    drawn = np.random.uniform(0, nearest_edge, 1)[0]
    return math.ceil(drawn)

In [42]:
# Exponentially distributed width for random-centric queries
def expo_width(col, query_centre):
    """Draw an exponentially distributed integer width around ``query_centre``.

    The scale of the exponential distribution is a quarter (floor division)
    of the distance from the centre to the nearer of the column's minimum
    and maximum. The draw itself is not capped by that distance.

    Args:
        col: Object exposing ``to_list()`` (e.g. a pandas Series).
        query_centre: Centre value the width is drawn for.

    Returns:
        The sampled width rounded up to an integer.
    """
    values = col.to_list()
    nearest_edge = min(abs(query_centre - min(values)),
                       abs(query_centre - max(values)))
    # Floor division: a distance below 4 gives a scale of 0, hence a width of 0.
    return math.ceil(np.random.exponential(nearest_edge // 4))

In [43]:
# Finding the lower and upper bound for each query centre
def find_bounds(col, n):
    """Generate ``n`` ``[lower, upper]`` bound pairs for one column.

    The first ``n // 2`` query centres come from ``genr_uniform_distr`` and
    the remaining ``n - n // 2`` from ``genr_data_distr`` (which therefore
    receives the extra one when ``n`` is odd). Each centre gets a width from
    ``uni_widths`` and is expanded symmetrically by it.

    Args:
        col: Column passed through to the centre and width generators.
        n: Total number of bound pairs to generate.

    Returns:
        A list of ``[centre - width, centre + width]`` pairs, with the
        uniform-centre pairs first and the data-centre pairs after them.
    """
    n_uniform = n // 2
    n_data = n - n_uniform
    # All centres are generated before any width is drawn.
    centres = genr_uniform_distr(col, n_uniform) + genr_data_distr(col, n_data)

    l_bounds = []
    for centre in centres:
        half_width = uni_widths(col, centre)
        l_bounds.append([centre - half_width, centre + half_width])
    return l_bounds

In [44]:
# Get the lower and upper bounds by passing the columns and the number of samples to be generated
def getRanges(cols, noSamplesSet):
    """Map each column name to its generated list of bound pairs.

    Columns are read from the module-level ``dataset`` frame.

    Args:
        cols: Iterable of column names present in ``dataset``.
        noSamplesSet: Number of bound pairs to generate per column.

    Returns:
        A dict from column name to the list returned by ``find_bounds``.
    """
    return {name: find_bounds(dataset[name], noSamplesSet) for name in cols}

In [45]:
#Helper function to create a template skeleton dataframe to fill in the values
def create_zero_dataframe(n, D):
    """Build an ``n`` x ``D`` placeholder frame.

    Despite the name, every cell is filled with 0.1 rather than 0.

    Args:
        n: Number of rows.
        D: Number of columns; they are named ``Attribute_1`` .. ``Attribute_D``.

    Returns:
        A pandas DataFrame with every cell set to 0.1.
    """
    filled = {}
    for position in range(1, D + 1):
        filled[f"Attribute_{position}"] = [0.1] * n
    return pd.DataFrame(filled)

def generateSkeletonData(n, columns):
    """Create the placeholder frame holding an LB/UB column pair per attribute.

    Args:
        n: Number of rows.
        columns: Attribute names; each contributes ``"<name> LB"`` followed
            by ``"<name> UB"``, in the order given.

    Returns:
        A DataFrame with ``n`` rows and ``2 * len(columns)`` columns, filled
        with the placeholder value set by ``create_zero_dataframe``.
    """
    skeleton = create_zero_dataframe(n, 2 * len(columns))
    skeleton.columns = [name + suffix
                        for name in columns
                        for suffix in (" LB", " UB")]
    return skeleton

In [46]:
#Helper function to find index
def findIndex(col, columns):
    """Return the position of the first entry of ``columns`` equal to ``col``.

    Args:
        col: Value to look for.
        columns: Iterable to search.

    Returns:
        The zero-based position of the first match, or None if there is none.
    """
    return next(
        (position for position, name in enumerate(columns) if name == col),
        None,
    )

In [47]:
n = 20000
# Seed both generators the helpers draw from (`random` for the query centres,
# `numpy.random` for the widths) so that a fresh-kernel re-run reproduces the
# same bounds table instead of a different one on every run.
SEED = 42
rd.seed(SEED)
np.random.seed(SEED)
# List of all attribute columns
columns = list(dataset.columns)
# All non-empty subsets of the attributes
subsets = generate_subsets(columns)
# Number of samples per subset (rounded up, so the table can hold more than n rows)
sampleSize = math.ceil(n/len(subsets))
# Lower/upper bound pairs for each attribute, one pair per query centre.
# NOTE(review): these sampleSize pairs are generated once per attribute and
# re-used for every subset that contains it -- confirm this sharing is intended.
distributions = getRanges(columns, sampleSize)
# Populate the skeleton dataframe template: one block of sampleSize rows per subset
data = generateSkeletonData(sampleSize*len(subsets), columns)
ctr = 0
for s in subsets:
    for c in s:
        index = findIndex(c, columns)
        # Columns 2*index and 2*index+1 are this attribute's " LB"/" UB" pair;
        # attributes outside the subset keep the 0.1 placeholder.
        data.iloc[ctr : ctr+sampleSize, 2*index : 2*index+2] = distributions[c]
    ctr = ctr + sampleSize
# Every subset must have received its own block of rows.
assert ctr == len(data)


In [48]:
# Show the populated bounds table
data

Unnamed: 0,Elevation LB,Elevation UB,Aspect LB,Aspect UB,Slope LB,Slope UB,Horizontal_Distance_To_Hydrology LB,Horizontal_Distance_To_Hydrology UB,Vertical_Distance_To_Hydrology LB,Vertical_Distance_To_Hydrology UB,Horizontal_Distance_To_Roadways LB,Horizontal_Distance_To_Roadways UB,Hillshade_9am LB,Hillshade_9am UB,Hillshade_Noon LB,Hillshade_Noon UB,Hillshade_3pm LB,Hillshade_3pm UB,Horizontal_Distance_To_Fire_Points LB,Horizontal_Distance_To_Fire_Points UB
0,3799.0,3849.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,2223.0,3065.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2,1932.0,3142.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3,2234.0,3842.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4,1897.0,1979.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20455,2676.0,2768.0,320.0,342.0,1.0,9.0,155.0,999.0,-96.0,144.0,855.0,1347.0,227.0,253.0,252.0,254.0,71.0,245.0,326.0,794.0
20456,2426.0,3780.0,311.0,335.0,4.0,30.0,133.0,247.0,-90.0,118.0,707.0,3673.0,189.0,245.0,195.0,249.0,111.0,131.0,255.0,467.0
20457,2574.0,2888.0,6.0,110.0,6.0,30.0,37.0,97.0,-137.0,97.0,278.0,2220.0,222.0,252.0,218.0,246.0,149.0,177.0,284.0,2294.0
20458,3170.0,3456.0,4.0,40.0,6.0,10.0,20.0,196.0,-119.0,125.0,2415.0,3979.0,230.0,238.0,217.0,249.0,77.0,149.0,2243.0,5329.0
