In [2]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
import pandas as pd

# Initialize DatabaseConnector for RDS
rds_db_connector = DatabaseConnector(config_path='aws_db_creds.yaml')

# Initialize DataExtractor
data_extractor = DataExtractor(rds_db_connector)


Database engine initialized successfully.


In [4]:
# List tables in the database
tables = data_extractor.list_tables()
print(f"Tables in the RDS database: {tables}")

# Check if 'orders_table' is present and extract it
target_table = 'orders_table'
if target_table in tables:
    df = data_extractor.read_rds_table(target_table)
    if df is not None:
        print("Data before cleaning:")
        print(df.head())  # Display the first few rows of the DataFrame
    else:
        print(f"Failed to read data from table {target_table}")
else:
    print(f"Table {target_table} not found in the database.")

Tables in the database: ['legacy_store_details', 'dim_card_details', 'legacy_users', 'orders_table']
Tables in the RDS database: ['legacy_store_details', 'dim_card_details', 'legacy_users', 'orders_table']
Data before cleaning:
   level_0  index                             date_uuid first_name last_name  \
0        0      0  9476f17e-5d6a-4117-874d-9cdb38ca1fa6       None      None   
1        1      1  0423a395-a04d-4e4a-bd0f-d237cbd5a295       None      None   
2        2      2  65187294-bb16-4519-adc0-787bbe423970       None      None   
3        3      3  579e21f7-13cb-436b-83ad-33687a4eb337       None      None   
4        4      4  00ab86c3-2039-4674-b9c1-adbcbbf525bd       None      None   

                              user_uuid       card_number    store_code  \
0  93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8    30060773296197   BL-8387506C   
1  8fe96c3a-d62d-4eb5-b313-cf12d9126a49   349624180933183  WEB-1388012W   
2  fc461df4-b919-48b2-909e-55c95a03fe6b  3529023891650490   CH-01D

In [5]:
df.head()

Unnamed: 0,level_0,index,date_uuid,first_name,last_name,user_uuid,card_number,store_code,product_code,1,product_quantity
0,0,0,9476f17e-5d6a-4117-874d-9cdb38ca1fa6,,,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8,30060773296197,BL-8387506C,R7-3126933h,,3
1,1,1,0423a395-a04d-4e4a-bd0f-d237cbd5a295,,,8fe96c3a-d62d-4eb5-b313-cf12d9126a49,349624180933183,WEB-1388012W,C2-7287916l,,2
2,2,2,65187294-bb16-4519-adc0-787bbe423970,,,fc461df4-b919-48b2-909e-55c95a03fe6b,3529023891650490,CH-01D85C8D,S7-1175877v,,2
3,3,3,579e21f7-13cb-436b-83ad-33687a4eb337,,,6104719f-ef14-4b09-bf04-fb0c4620acb0,213142929492281,CL-C183BE4B,D8-8421505n,,2
4,4,4,00ab86c3-2039-4674-b9c1-adbcbbf525bd,,,9523a6d3-b2dd-4670-a51a-36aebc89f579,502067329974,SO-B5B9CB3B,B6-2596063a,,2


In [10]:
import re

# Define a regex pattern for UUID
uuid_pattern = re.compile(r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$')

# Check if all values match the UUID pattern
invalid_uuids = df[~df['user_uuid'].apply(lambda x: bool(uuid_pattern.match(str(x))))]

print(f"Number of invalid UUIDs: {len(invalid_uuids)}")
if len(invalid_uuids) > 0:
    print("Invalid UUIDs:")
    print(invalid_uuids)


Number of invalid UUIDs: 0


In [11]:
duplicate_uuids3 = df['user_uuid'].duplicated().sum()
print(f"Number of duplicate UUIDs: {duplicate_uuids3}")


Number of duplicate UUIDs: 104839


In [13]:
unique_values = df['user_uuid'].nunique()
total_values = df['user_uuid'].count()
print(f"Number of unique values in 'user_uuid' column: {unique_values}")
print(f"Total number of values in 'user_uuid' column: {total_values}")


Number of unique values in 'user_uuid' column: 15284
Total number of values in 'user_uuid' column: 120123


In [14]:
unique_values = df['card_number'].nunique()
total_values = df['card_number'].count()
print(f"Number of unique values in 'card_number' column: {unique_values}")
print(f"Total number of values in 'card_number' column: {total_values}")


Number of unique values in 'card_number' column: 15284
Total number of values in 'card_number' column: 120123


In [15]:
# Check if all card numbers are between 13 and 19 digits long
invalid_length_card_numbers = df[~df['card_number'].apply(lambda x: 13 <= len(str(x)) <= 19)]
print(f"Number of card numbers with invalid length: {len(invalid_length_card_numbers)}")
if len(invalid_length_card_numbers) > 0:
    print("Card numbers with invalid length:")
    print(invalid_length_card_numbers)


Number of card numbers with invalid length: 10148
Card numbers with invalid length:
        level_0   index                             date_uuid first_name  \
4             4       4  00ab86c3-2039-4674-b9c1-adbcbbf525bd       None   
9             9       9  e764e21c-f9ce-4e0a-8c01-4218e92e424c       None   
35           35      35  983cfec4-5d2b-41b0-9899-9efacbf6467f       None   
42           42      42  9f740e23-6eb8-465e-b421-6460706aa6b7      Marie   
49           49      49  2feecc4f-94b5-4ce3-aeb7-981e7570fd93    Rachael   
...         ...     ...                                   ...        ...   
120114    81466   81466  9e4b398e-c733-41d8-b28e-95550b0f1e41       None   
120115    87690   87689  e982f82e-ddb6-424c-b9a0-0712169faa44       None   
120116    95080   95080  d950d99a-f49f-4a5b-b81d-e4d6570c4fac       None   
120117   109637  109636  bbf9ba49-face-43e3-b732-d5089117e07e       None   
120118   110549  110548  f0e8fff6-9998-4661-954b-0e258e09d33c       None   

   

In [16]:
# Check if all card numbers are numeric
non_numeric_card_numbers = df[~df['card_number'].apply(lambda x: str(x).isdigit())]
print(f"Number of non-numeric card numbers: {len(non_numeric_card_numbers)}")
if len(non_numeric_card_numbers) > 0:
    print("Non-numeric card numbers:")
    print(non_numeric_card_numbers)


Number of non-numeric card numbers: 0


In [17]:
duplicate_card_numbers = df['card_number'].duplicated().sum()
print(f"Number of duplicate card numbers: {duplicate_card_numbers}")


Number of duplicate card numbers: 104839


In [18]:
import re

# Define a regular expression pattern for the valid format
pattern = r'^[A-Za-z0-9]{2}-[A-Za-z0-9]{8}$'

# Identify invalid store_code entries
invalid_store_codes = df[~df['store_code'].str.match(pattern, na=False)]

# Display the invalid entries
print(f"Number of invalid entries in 'store_code' column: {len(invalid_store_codes)}")
if len(invalid_store_codes) > 0:
    print("Invalid store_code entries:")
    print(invalid_store_codes['store_code'])


Number of invalid entries in 'store_code' column: 26957
Invalid store_code entries:
1         WEB-1388012W
5         WEB-1388012W
8         WEB-1388012W
11        WEB-1388012W
12        WEB-1388012W
              ...     
120106    WEB-1388012W
120107    WEB-1388012W
120110    WEB-1388012W
120114    WEB-1388012W
120120    WEB-1388012W
Name: store_code, Length: 26957, dtype: object


In [19]:
null_values_product_quantity = df['product_quantity'].isnull().sum()
print(f"Number of null values in 'product_quantity' column: {null_values_product_quantity}")


Number of null values in 'product_quantity' column: 0


In [20]:
non_numeric_values = df[~df['product_quantity'].apply(lambda x: str(x).isdigit())]
print(f"Number of non-numeric values in 'product_quantity' column: {len(non_numeric_values)}")
if len(non_numeric_values) > 0:
    print("Non-numeric values in 'product_quantity' column:")
    print(non_numeric_values['product_quantity'])


Number of non-numeric values in 'product_quantity' column: 0


In [21]:
data_cleaner = DataCleaning()
df = data_cleaner.convert_data_types(df, ['product_quantity'])


NameError: name 'DataCleaning' is not defined

In [23]:
from data_cleaning import DataCleaning

# Step 1: Instantiate the DataCleaning class
data_cleaner = DataCleaning()

# Step 2: Extract the data from the RDS database
db_connector = DatabaseConnector(config_path='aws_db_creds.yaml')
data_extractor = DataExtractor(db_connector)
orders_df = data_extractor.read_rds_table('orders_table')

# Display the first few rows of the dataframe before cleaning
print("Data before cleaning:")
print(orders_df.head())

# Step 3: Apply the clean_orders_data method to clean the dataframe
cleaned_orders_df = data_cleaner.clean_orders_data(orders_df)

# Step 4: Display the first few rows of the dataframe after cleaning
print("Data after cleaning:")
print(cleaned_orders_df.head())

# Step 5: Check the data types to ensure product_quantity is numeric
print("Data types after cleaning:")
print(cleaned_orders_df.dtypes)

Database engine initialized successfully.
Data before cleaning:
   level_0  index                             date_uuid first_name last_name  \
0        0      0  9476f17e-5d6a-4117-874d-9cdb38ca1fa6       None      None   
1        1      1  0423a395-a04d-4e4a-bd0f-d237cbd5a295       None      None   
2        2      2  65187294-bb16-4519-adc0-787bbe423970       None      None   
3        3      3  579e21f7-13cb-436b-83ad-33687a4eb337       None      None   
4        4      4  00ab86c3-2039-4674-b9c1-adbcbbf525bd       None      None   

                              user_uuid       card_number    store_code  \
0  93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8    30060773296197   BL-8387506C   
1  8fe96c3a-d62d-4eb5-b313-cf12d9126a49   349624180933183  WEB-1388012W   
2  fc461df4-b919-48b2-909e-55c95a03fe6b  3529023891650490   CH-01D85C8D   
3  6104719f-ef14-4b09-bf04-fb0c4620acb0   213142929492281   CL-C183BE4B   
4  9523a6d3-b2dd-4670-a51a-36aebc89f579      502067329974   SO-B5B9CB3B   

  pr

In [24]:
cleaned_orders_df.head()

Unnamed: 0,level_0,index,date_uuid,user_uuid,card_number,store_code,product_code,product_quantity
0,0,0,9476f17e-5d6a-4117-874d-9cdb38ca1fa6,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8,30060773296197,BL-8387506C,R7-3126933h,3
1,1,1,0423a395-a04d-4e4a-bd0f-d237cbd5a295,8fe96c3a-d62d-4eb5-b313-cf12d9126a49,349624180933183,WEB-1388012W,C2-7287916l,2
2,2,2,65187294-bb16-4519-adc0-787bbe423970,fc461df4-b919-48b2-909e-55c95a03fe6b,3529023891650490,CH-01D85C8D,S7-1175877v,2
3,3,3,579e21f7-13cb-436b-83ad-33687a4eb337,6104719f-ef14-4b09-bf04-fb0c4620acb0,213142929492281,CL-C183BE4B,D8-8421505n,2
4,4,4,00ab86c3-2039-4674-b9c1-adbcbbf525bd,9523a6d3-b2dd-4670-a51a-36aebc89f579,502067329974,SO-B5B9CB3B,B6-2596063a,2
