duplicated discarded

damodhar918 · Oct 18, 2023 · 004b7e7 · 004b7e7
1 parent 3f307e6
commit 004b7e7
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -82,6 +82,7 @@ Example configuration file:
 | outcometime3  | dependentDateRange | incometime2\|1D\|3W\|%Y-%m-%d %H:%M:%S                  |
 | model1        | category           | Customers\|Lending\|Web_Lending                         |
 | model         | category           | Customers\|Lending\|Web_Lending \|                      |
+| null1         | category           | \|                      |
 | gender1       | category           | 0\|1\|~0.4\|0.5\|0.1                                    |
 | probability1  | floatRange         | 0.001\|1\|3                                             |
 | float1        | floatRange         | 0.001\|0.3\|5                                           |
@@ -103,6 +104,7 @@ incometime2,dateRange,2021-10-10 | 2022-10-26|%Y-%m-%d %H:%M:%S
 outcometime3,dependentDateRange,incometime2|1D|3W|%Y-%m-%d %H:%M:%S
 model1,category,Customers|Lending|Web_Lending
 model,category,Customers|Lending|Web_Lending|
+null1,category,
 gender1,category,0|1|~0.4|0.5|0.1
 probability1,floatRange,0.001|1|3
 float1,floatRange,0.001|0.3|5
@@ -123,8 +125,9 @@ Explanation of data patterns as per defined in the configuration file :
 - `dateRange`: This indicates that the `dateRange1` and `incometime2` columns should contain random date values within the range from `2021-10-10` to `2022-10-26`. The format of the dates in `incometime2` also includes `%Y-%m-%d %H:%M:%S`. For other formats, see the reference given below.
 - `dependentDateRange`: This indicates that the `outcometime3` column should contain random duration values within the range from `1D` to `3W` in addition to the `incometime2`. Here `1D` means 1 day and `3W` means 3 weeks. Other compatible inputs are: `10S` means 10 seconds, `5m` means 5 minutes, `2h` means 2 hours, `3d` means 3 days, `4W` means 4 weeks. The format of the dates in `outcometime3` also includes `%Y-%m-%d %H:%M:%S`. See the format reference given below.
 - `category`: This indicates that the `model1` column should contain random categorical values chosen from the options "Customers", "Lending", and "Web_Lending".
-**Note 1**: If you want to add empty value in the column then add `|` at the end of the values as in `model`.
-**Note 2**: : If you want categorical values with probilities then add `~` at the end of the values as in `gender1` input `0|1|~0.4|0.5|0.1`. here ~ is seperater between categorical values and probilities ["0", "1",""] ~ ["0.4", "0.5", "0.1"].
+**Note 1**: If you want some empty values in the column, add `|` at the end of the values, as in `model`.
+**Note 2**: If you want categorical values with probabilities, add `~` at the end of the values, as in the `gender1` input `0|1|~0.4|0.5|0.1`. Here `~` is the separator between the categorical values and the probabilities: ["0", "1",""] ~ ["0.4", "0.5", "0.1"].
+**Note 3**: If you want every value in the column to be null/empty, use `|` on its own with no other values, as in the `null1` column.
 - `floatRange`: This indicates that the `probability1` and `float1` columns should contain random float values within a given range. The range for `probability1` is from `0.001` to `1`, with a precision of 3 decimal places. The range for `float1` is from `0.001` to `0.3`, with a precision of 5 decimal places.
 - `intRange`: This indicates that the `number1` column should contain random integer values within the range from 10 to 25.
 - `constant`: This indicates that the `test1` column should contain a constant value (`Done`) for all rows.

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 pandas
 pyarrow
 exrex
+colorama
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -10,4 +10,5 @@ twine
 pandas
 pyarrow
 exrex
+colorama
 pytest-cov
diff --git a/sdgp/sdgp.py b/sdgp/sdgp.py
@@ -93,11 +93,11 @@ def __init__(self, volume: int, file: str, conf_file: str,
                     split('.csv')[0]:
                 self.file = self.file+'_'
             self.conf_file_path = conf_file.strip()
-            self.conf_df = self.checkFile(self.conf_file_path)
+            self.conf_df = self.checkFile(self.conf_file_path).astype('str')
             self.conf_dict = self.conf_df.to_dict(
                 orient='index')  # Configuration dictionary
-            self.conf_types = {x.get('type') for x in self.conf_dict.values()}
-            self.conf_columns = [x.get('name') for x in
+            self.conf_types = {x.get('type').strip() for x in self.conf_dict.values()}
+            self.conf_columns = [x.get('name').strip() for x in
                                  self.conf_dict.values()]
             self.allowed_types = [
                 'uniqueIndex', 'dateRange', 'date', 'category',
@@ -172,7 +172,7 @@ def saveInCSV(self):
             df (pd.DataFrame): DataFrame to save.
             file_name (str): File name for the CSV file.
         """
-        df = self.df_mock.sample(self.volume).drop_duplicates()
+        df = self.df_mock
         self.mock_file_csv_path = f"{self.file}\
 _{self.choice}_{self.volume}.csv"
         df.to_csv(self.mock_file_csv_path, index=False, header=True)
@@ -189,7 +189,7 @@ def saveInParquet(self):
             df (pd.DataFrame): DataFrame to save.
             file_name (str): File name for the Parquet file.
         """
-        df = self.df_mock.sample(self.volume).drop_duplicates()
+        df = self.df_mock
         self.mock_file_parquet_path = f"{self.file}\
 _{self.choice}_{self.volume}.parquet"
         table = pa.Table.from_pandas(df, preserve_index=False)
@@ -523,6 +523,9 @@ def editMockDataAndGenerate(self):
         and saves it.
         """
         df = self.checkFile(self.csv_file_path)
+        if df.shape[0] > self.n:
+            raise ValueError(f"given no. of rows is greater than {self.n}")
+        self.conf_columns = [*df.columns]
         self.genMockData(df)
         self.generateWithConf()
         self.output()
@@ -533,5 +536,6 @@ def justScaleData(self):
         on the existing data, and saves it.
         """
         df = self.checkFile(self.csv_file_path)
+        self.conf_columns = [*df.columns]
         self.genMockData(df)
         self.output()