import random

import numpy as np
import pandas as pd
from scipy import stats

import CleanData
import PullDataPostgreSQL  # database access for pulling child tables in production runs
# Conditional Parameter Aggregation (CPA) is one of the most important parts
# of the entire SDV paper: it is what allows the user to synthesize an entire
# database instead of a single table. Following the SDV paper, a few steps are
# taken here. Keep in mind that the end result is a series of extended tables,
# each holding an original table's data plus metrics derived from all of its
# child tables:
# 1) Start the extended table by saving all the data from the original table.
# 2) Iterate through each value in the original table's primary key.
#    2a) Iterate through each child table.
#        i)   Find all rows in the child table matching the primary key value.
#        ii)  Perform the Gaussian Copula step and find the feature covariance
#             (see the sketch after ConditionalParameterAggregation below).
#        iii) Find the alpha and beta values for the distribution.
#        iv)  Save all covariance and alpha/beta values into the extended table.
def ConditionalParameterAggregation(df, children):
    # df is the data from the original table. Missing-value indices are
    # included and all datetime values have been converted to EPOCH.
    #
    # children is a list of child table names; each child table is read
    # from a CSV file named '<name>.csv'.
    for childstr in children:
        print(childstr)
        # pd.DataFrame.from_csv is deprecated; read_csv with the first
        # column as the index is the equivalent call
        child = pd.read_csv('%s.csv' % childstr, index_col=0)
        child.fillna(value=np.nan, inplace=True)
        # flags each column as categorical or not; ignores the primary key
        logicalCategorical = CleanData.IdentifyCategorical(child)
        # preallocates the extended-table columns that points are appended to below
        df = MakeBlankDataFrame(df, child, childstr, logicalCategorical)
        # iterate over all IDs in the primary key, finding and entering the
        # aggregated data for each one
        for c in range(len(df[df.columns[0]])):
            print(c)
            ID = df[df.columns[0]][c]
            # all rows in the child table belonging to this specific ID
            data = child[child[df.columns[0]] == ID]
            # iterates over every column in the child table
            for y in range(1, len(child.columns)):
                column = child.columns[y]
                # if the column is categorical
                if logicalCategorical[y]:
                    uniqueCategories = sorted(child[child.columns[y]].unique().tolist())
                    # finds the percentage of each category in this ID's rows,
                    # then saves it into the extended table
                    if len(data) == 0:
                        for z in range(len(uniqueCategories)):
                            cat = uniqueCategories[z]
                            colname = 'Categ_%s_%s_%s' % (cat, column, childstr)
                            df.loc[c, colname] = None
                    else:
                        count = CalculateCategoricalPercentage(data, uniqueCategories, column)
                        # adds the points to the correct column
                        for z in range(len(uniqueCategories)):
                            cat = uniqueCategories[z]
                            colname = 'Categ_%s_%s_%s' % (cat, column, childstr)
                            df.loc[c, colname] = count.loc[0, cat]
                # if the column is continuous
                else:
                    points = ['alpha', 'beta', 'loc', 'scale']
                    # fit the data to a beta distribution and append the fitted
                    # parameters to the extended table; IDs with no child rows
                    # get None so that no cells are left out
                    if len(data) == 0:
                        for z in range(4):
                            colname = 'Cont_%s_%s_%s' % (points[z], column, childstr)
                            df.loc[c, colname] = None
                    else:
                        statistics = list(stats.beta.fit(data[column]))
                        for z in range(4):
                            colname = 'Cont_%s_%s_%s' % (points[z], column, childstr)
                            df.loc[c, colname] = statistics[z]
    return df
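
# Step 2a-ii above names a Gaussian Copula / feature-covariance computation
# that the loop does not perform. Below is a minimal sketch of that step,
# assuming continuous child columns and an empirical CDF estimate; this
# helper is illustrative only and is not called anywhere in this file.
def GaussianCopulaCovariance(data):
    # rank-transform each column into (0, 1) as an empirical CDF estimate
    uniform = data.rank() / (len(data) + 1)
    # map through the inverse standard-normal CDF to get normal scores
    normalScores = pd.DataFrame(stats.norm.ppf(uniform), columns=data.columns)
    # the covariance of the normal scores is the copula covariance
    return normalScores.cov()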
def CalculateCategoricalPercentage(data, uniqueCategories, column):
    # one-row frame of zeros so that categories absent from data are not left out
    d1 = pd.DataFrame([0] * len(uniqueCategories)).T
    d1.columns = uniqueCategories
    # finds the percentage of each category to be added to the extended table
    count = data[column].value_counts()
    count = (count + d1) / sum(count)
    count = count.fillna(0)
    return count
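# For example (hypothetical values): if data[column] holds ['a', 'a', 'b'] and
# uniqueCategories is ['a', 'b', 'c'], the returned one-row frame holds the
# proportions a=2/3, b=1/3, c=0, indexed so that count.loc[0, cat] reads them.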
def MakeBlankDataFrame(df, child, childstr, logicalCategorical):
    # Creates the blank extended-table columns that points are entered into
    # in the future. It uses the column names as metadata storage.
    #
    # df is the original dataframe getting appended to.
    # child is the child dataframe being appended.
    # childstr is the name of the child dataframe.
    # logicalCategorical is a logical list indicating whether each column is
    # categorical or continuous.
    # ignores the primary key
    for y in range(1, len(child.columns)):
        column = child.columns[y]
        # For categorical variables, we must create a column for each category.
        if logicalCategorical[y]:
            uniqueCategories = child[child.columns[y]].unique().tolist()
            for z in range(len(uniqueCategories)):
                cat = uniqueCategories[z]
                colname = 'Categ_%s_%s_%s' % (cat, column, childstr)
                df = pd.concat([df, pd.DataFrame(np.zeros([len(df)]), columns=[colname])], axis=1)
        # For continuous variables, we must create four columns for the beta
        # distribution values.
        else:
            points = ['alpha', 'beta', 'loc', 'scale']
            for z in range(4):
                colname = 'Cont_%s_%s_%s' % (points[z], column, childstr)
                df = pd.concat([df, pd.DataFrame(np.zeros([len(df)]), columns=[colname])], axis=1)
    return df
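# For example, a continuous child column 'data1' in a child table named
# 'child1' yields the extended-table columns 'Cont_alpha_data1_child1',
# 'Cont_beta_data1_child1', 'Cont_loc_data1_child1' and 'Cont_scale_data1_child1'.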
def MakeFakeData(Continuous):
    # Makes fake data to debug with. This is a temporary function.
    if Continuous == 1:
        primaryKey = np.linspace(1, 100, num=100)
        data1 = stats.norm.rvs(loc=10, scale=1, size=100)
        data2 = stats.norm.rvs(loc=20, scale=1, size=100)
        data3 = stats.norm.rvs(loc=5, scale=2, size=100)
        data4 = stats.norm.rvs(loc=100, scale=10, size=100)
        df = np.concatenate([primaryKey, data1, data2, data3, data4])
        df = df.reshape([5, 100])
        df = pd.DataFrame(df.T)
        df.columns = ['df_id', 'data1', 'data2', 'data3', 'data4']
        foreignKey = np.array([random.choice(primaryKey.tolist()) for _ in range(500)])
        data1 = stats.norm.rvs(loc=60, scale=1.5, size=500)
        data2 = stats.norm.rvs(loc=30, scale=3, size=500)
        data3 = stats.norm.rvs(loc=5, scale=4, size=500)
        data4 = stats.norm.rvs(loc=100, scale=1, size=500)
        child1 = np.concatenate([foreignKey, data1, data2, data3, data4])
        child1 = child1.reshape([5, 500])
        child1 = pd.DataFrame(child1.T)
        child1.columns = ['df_id', 'data1', 'data2', 'data3', 'data4']
        foreignKey = np.array([random.choice(primaryKey.tolist()) for _ in range(500)])
        data1 = stats.norm.rvs(loc=6, scale=5, size=500)
        data2 = stats.norm.rvs(loc=35, scale=3.5, size=500)
        data3 = stats.norm.rvs(loc=2, scale=.5, size=500)
        data4 = stats.norm.rvs(loc=10, scale=2, size=500)
        child2 = np.concatenate([foreignKey, data1, data2, data3, data4])
        child2 = child2.reshape([5, 500])
        child2 = pd.DataFrame(child2.T)
        child2.columns = ['df_id', 'data1', 'data2', 'data3', 'data4']
    else:
        primaryKey = np.linspace(1, 100, num=100)
        data1 = [random.choice([1, 2, 3, 4, 5]) for _ in range(100)]
        data2 = [random.choice([1, 2, 3, 4, 5]) for _ in range(100)]
        data3 = [random.choice([1, 2, 3, 4, 5]) for _ in range(100)]
        data4 = [random.choice([1, 2, 3, 4, 5]) for _ in range(100)]
        df = np.concatenate([primaryKey, data1, data2, data3, data4])
        df = df.reshape([5, 100])
        df = pd.DataFrame(df.T)
        df.columns = ['df_id', 'data1', 'data2', 'data3', 'data4']
        foreignKey = np.array([random.choice(primaryKey.tolist()) for _ in range(500)])
        data1 = [random.choice([1, 2, 3, 4, 5]) for _ in range(500)]
        data2 = [random.choice([1, 2, 3, 4, 5]) for _ in range(500)]
        data3 = [random.choice([1, 2, 3, 4, 5]) for _ in range(500)]
        data4 = [random.choice([1, 2, 3, 4, 5]) for _ in range(500)]
        child1 = np.concatenate([foreignKey, data1, data2, data3, data4])
        child1 = child1.reshape([5, 500])
        child1 = pd.DataFrame(child1.T)
        child1.columns = ['df_id', 'data1', 'data2', 'data3', 'data4']
        foreignKey = np.array([random.choice(primaryKey.tolist()) for _ in range(500)])
        data1 = [random.choice([10, 20, 30, 40, 50]) for _ in range(500)]
        data2 = [random.choice([10, 20, 30, 40, 50]) for _ in range(500)]
        data3 = [random.choice([10, 20, 30, 40, 50]) for _ in range(500)]
        data4 = [random.choice([10, 20, 30, 40, 50]) for _ in range(500)]
        child2 = np.concatenate([foreignKey, data1, data2, data3, data4])
        child2 = child2.reshape([5, 500])
        child2 = pd.DataFrame(child2.T)
        child2.columns = ['df_id', 'data1', 'data2', 'data3', 'data4']
    return df, child1, child2
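
# A minimal smoke test, assuming the fake tables above stand in for the data
# that would otherwise come through CleanData / PullDataPostgreSQL, and that
# the CleanData module is importable. The child tables are written to CSV
# first because ConditionalParameterAggregation reads each child from
# '<name>.csv'; the file names here are illustrative.
if __name__ == '__main__':
    df, child1, child2 = MakeFakeData(Continuous=1)
    child1.to_csv('child1.csv')
    child2.to_csv('child2.csv')
    extended = ConditionalParameterAggregation(df, ['child1', 'child2'])
    print(extended.head())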