Skip to content

Commit

Permalink
duplicated discarted
Browse files Browse the repository at this point in the history
  • Loading branch information
jdamodhar committed Oct 18, 2023
1 parent 3f307e6 commit 004b7e7
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 7 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ Example configuration file:
| outcometime3 | dependentDateRange | incometime2\|1D\|3W\|%Y-%m-%d %H:%M:%S |
| model1 | category | Customers\|Lending\|Web_Lending |
| model | category | Customers\|Lending\|Web_Lending \| |
| null1 | category | \| |
| gender1 | category | 0\|1\|~0.4\|0.5\|0.1 |
| probability1 | floatRange | 0.001\|1\|3 |
| float1 | floatRange | 0.001\|0.3\|5 |
Expand All @@ -103,6 +104,7 @@ incometime2,dateRange,2021-10-10 | 2022-10-26|%Y-%m-%d %H:%M:%S
outcometime3,dependentDateRange,incometime2|1D|3W|%Y-%m-%d %H:%M:%S
model1,category,Customers|Lending|Web_Lending
model,category,Customers|Lending|Web_Lending|
null1,category,
gender1,category,0|1|~0.4|0.5|0.1
probability1,floatRange,0.001|1|3
float1,floatRange,0.001|0.3|5
Expand All @@ -123,8 +125,9 @@ Explanation of data patterns as per defined in the configuration file :
- `dateRange`: This indicates that the `dateRange1` and `incometime2` columns should contain random date values within the range from `2021-10-10` to `2022-10-26`. The format of the dates in `incometime2` also includes `%Y-%m-%d %H:%M:%S`. For other formats, see the format reference given below.
- `dependentDateRange`: This indicates that the `outcometime3` column should contain date values obtained by adding a random duration within the range from `1D` to `3W` to the `incometime2` value. Here `1D` means 1 day and `3W` means 3 weeks. Other compatible inputs are `10S` (10 seconds), `5m` (5 minutes), `2h` (2 hours), `3d` (3 days), and `4W` (4 weeks). The format of the dates in `outcometime3` also includes `%Y-%m-%d %H:%M:%S`. See the format reference given below.
- `category`: This indicates that the `model1` column should contain random categorical values chosen from the options "Customers", "Lending", and "Web_Lending".
**Note 1**: If you want to add empty value in the column then add `|` at the end of the values as in `model`.
**Note 2**: : If you want categorical values with probilities then add `~` at the end of the values as in `gender1` input `0|1|~0.4|0.5|0.1`. here ~ is seperater between categorical values and probilities ["0", "1",""] ~ ["0.4", "0.5", "0.1"].
**Note 1**: If you want to add some empty value in the column then add `|` at the end of the values as in `model`.
**Note 2**: If you want categorical values with probabilities then add `~` at the end of the values, followed by the probabilities, as in the `gender1` input `0|1|~0.4|0.5|0.1`. Here `~` is the separator between the categorical values and the probabilities: ["0", "1", ""] ~ ["0.4", "0.5", "0.1"].
**Note 3**: If you want the column to contain only null/empty values then give just `|` with no categorical values, as in the `null1` column.
- `floatRange`: This indicates that the `probability1` and `float1` columns should contain random float values within a given range. The range for `probability1` is from `0.001` to `1`, with a precision of 3 decimal places. The range for `float1` is from `0.001` to `0.3`, with a precision of 5 decimal places.
- `intRange`: This indicates that the `number1` column should contain random integer values within the range from 10 to 25.
- `constant`: This indicates that the `test1` column should contain a constant value (`Done`) for all rows.
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pandas
pyarrow
exrex
colorama
1 change: 1 addition & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ twine
pandas
pyarrow
exrex
colorama
pytest-cov
14 changes: 9 additions & 5 deletions sdgp/sdgp.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ def __init__(self, volume: int, file: str, conf_file: str,
split('.csv')[0]:
self.file = self.file+'_'
self.conf_file_path = conf_file.strip()
self.conf_df = self.checkFile(self.conf_file_path)
self.conf_df = self.checkFile(self.conf_file_path).astype('str')
self.conf_dict = self.conf_df.to_dict(
orient='index') # Configuration dictionary
self.conf_types = {x.get('type') for x in self.conf_dict.values()}
self.conf_columns = [x.get('name') for x in
self.conf_types = {x.get('type').strip() for x in self.conf_dict.values()}
self.conf_columns = [x.get('name').strip() for x in
self.conf_dict.values()]
self.allowed_types = [
'uniqueIndex', 'dateRange', 'date', 'category',
Expand Down Expand Up @@ -172,7 +172,7 @@ def saveInCSV(self):
df (pd.DataFrame): DataFrame to save.
file_name (str): File name for the CSV file.
"""
df = self.df_mock.sample(self.volume).drop_duplicates()
df = self.df_mock
self.mock_file_csv_path = f"{self.file}\
_{self.choice}_{self.volume}.csv"
df.to_csv(self.mock_file_csv_path, index=False, header=True)
Expand All @@ -189,7 +189,7 @@ def saveInParquet(self):
df (pd.DataFrame): DataFrame to save.
file_name (str): File name for the Parquet file.
"""
df = self.df_mock.sample(self.volume).drop_duplicates()
df = self.df_mock
self.mock_file_parquet_path = f"{self.file}\
_{self.choice}_{self.volume}.parquet"
table = pa.Table.from_pandas(df, preserve_index=False)
Expand Down Expand Up @@ -523,6 +523,9 @@ def editMockDataAndGenerate(self):
and saves it.
"""
df = self.checkFile(self.csv_file_path)
if df.shape[0] > self.n:
raise ValueError(f"given no. of rows is greater than {self.n}")
self.conf_columns = [*df.columns]
self.genMockData(df)
self.generateWithConf()
self.output()
Expand All @@ -533,5 +536,6 @@ def justScaleData(self):
on the existing data, and saves it.
"""
df = self.checkFile(self.csv_file_path)
self.conf_columns = [*df.columns]
self.genMockData(df)
self.output()

0 comments on commit 004b7e7

Please sign in to comment.