In [None]:
import pandas as pd

In [None]:
# Raise pandas' display limits so that more rows and longer cell contents are shown
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 100

In [None]:
# import orders.csv
# The Google Drive share link is turned into a direct-download link: the
# second-to-last "/"-separated segment of the share URL is used as the file id.
url = "https://drive.google.com/file/d/1Vu0q91qZw6lqhIqbjoXYvYAQTmVHh6uZ/view?usp=sharing" 
path = "https://drive.google.com/uc?export=download&id="+url.split("/")[-2]
orders = pd.read_csv(path)

In [None]:
# import orderlines.csv
# Same approach as for the other files: take the second-to-last "/"-separated
# segment of the share URL as the file id and build a direct-download link.
# Note that the names url and path are overwritten here.
url = "https://drive.google.com/file/d/1FYhN_2AzTBFuWcfHaRuKcuCE6CWXsWtG/view?usp=sharing" 
path = "https://drive.google.com/uc?export=download&id="+url.split("/")[-2]
orderlines = pd.read_csv(path)

In [None]:
# import products.csv
# Same approach as for the other files: take the second-to-last "/"-separated
# segment of the share URL as the file id and build a direct-download link.
# Note that the names url and path are overwritten here.
url = "https://drive.google.com/file/d/1afxwDXfl-7cQ_qLwyDitfcCx3u7WMvkU/view?usp=sharing" 
path = "https://drive.google.com/uc?export=download&id="+url.split("/")[-2]
products = pd.read_csv(path)

In [None]:
#------------ ORDERS --------------------


# Duplicates: number of rows that are exact repeats of an earlier row
orders.duplicated().sum()

In [None]:
# Missing values: report how many rows lack total_paid and what share of the
# DataFrame that is. The count is computed instead of being typed into the
# message, so the sentence stays correct if the data changes.
n_missing = orders["total_paid"].isna().sum()
pct_missing = round(n_missing / orders.shape[0] * 100, 5)
print(f"{n_missing} missing values represents {pct_missing}% of the rows in our DataFrame")

In [None]:
# share of rows with (True) and without (False) a missing total_paid
orders.total_paid.isna().value_counts(normalize=True)

In [None]:
# per column: does it contain at least one missing value?
orders.isna().any()

In [None]:
# we drop the rows with a missing total_paid
# The explicit .copy() makes the result an independent DataFrame, so the column
# assignments made on orders in later cells do not act on a slice of the raw
# data (which is what triggers pandas' SettingWithCopyWarning).
orders = orders.loc[orders["total_paid"].notna(), :].copy()

In [None]:
# check datatypes
# NOTE: orders_cl is a second name for the same DataFrame object as orders;
# no copy is made here, so changes made through either name affect both.
orders_cl = orders
orders_cl.dtypes

In [None]:
# fixing datatypes of columns: parse created_date into datetimes
orders["created_date"] = pd.to_datetime(orders["created_date"])

In [None]:
# convert total_paid to a numeric dtype
orders["total_paid"] = pd.to_numeric(orders["total_paid"])

In [None]:
#------------ ORDERLINES --------------------


# Duplicates: number of rows that are exact repeats of an earlier row
orderlines.duplicated().sum()

In [None]:
# missing values in orderlines: count of missing entries per column
orderlines.isna().sum()

In [None]:
# fix datatypes: parse the date column into datetimes
orderlines["date"] = pd.to_datetime(orderlines["date"])

In [None]:
# check percentage of corrupted data in unit_price
# A value counts as corrupted when it contains two dots (e.g. digits.digits.digits).
# The share is taken as the mean of the boolean mask: this does not depend on
# the ordering of value_counts() and yields 0.0 instead of failing when no row
# matches. na=False treats missing prices as "not corrupted".
two_dot_mask = orderlines["unit_price"].str.contains(r"\d+\.\d+\.\d+", na=False)
two_dot_percentage = round(two_dot_mask.mean() * 100, 2)
print(f"The 2 dot problem represents {two_dot_percentage}% of the rows in our DataFrame")

In [None]:
# investigate df: show the rows whose unit_price contains two dots
# (raw string so the regex escapes are not read as string escapes;
# na=False so a missing price cannot put NaN into the boolean mask)
orderlines.loc[orderlines["unit_price"].str.contains(r"\d+\.\d+\.\d+", na=False), :]

In [None]:
# all order lines whose id_order is one of the listed order ids
orderlines[orderlines.id_order.isin([527342, 299549, 452946, 527364])]

In [None]:
# corresponding rows in orders (same order ids as above); looking at several
# examples to find a pattern
orders.loc[orders.order_id.isin([527342, 299549, 452946, 527364]), :]

In [None]:
# we can just drop the first dot here
# Preview only: nothing is assigned in this cell.
# n=1 removes only the first dot; regex=False makes "." a literal dot on every
# pandas version instead of relying on the version-dependent default.
two_dot_mask = orderlines["unit_price"].str.contains(r"\d+\.\d+\.\d+", na=False)
orderlines.loc[two_dot_mask, "unit_price"].str.replace(".", "", n=1, regex=False)

In [None]:
# make a new clean df
# NOTE(review): this only binds a second name to the same DataFrame; no copy is
# made, so writes through orderlines_cl are also visible through orderlines.
orderlines_cl = orderlines

In [None]:
# fix corrupted data: remove the first dot from every unit_price that has two dots
# The mask is computed once and reused on both sides of the assignment, and
# the frame being cleaned (orderlines_cl) is used for reading as well as writing.
# n=1 removes only the first dot; regex=False keeps "." a literal dot.
two_dot_mask = orderlines_cl["unit_price"].str.contains(r"\d+\.\d+\.\d+", na=False)
orderlines_cl.loc[two_dot_mask, "unit_price"] = (
    orderlines_cl.loc[two_dot_mask, "unit_price"].str.replace(".", "", n=1, regex=False)
)

In [None]:
# now, to check if that worked, convert string (object) to float numeric
# Reads from orderlines_cl (the frame that was just repaired) rather than from
# orderlines, so the conversion does not depend on both names pointing at the
# same object. pd.to_numeric raises if a value still cannot be parsed.
orderlines_cl["unit_price"] = pd.to_numeric(orderlines_cl["unit_price"])

In [None]:
# fix other datatypes: parse the date column into datetimes
orderlines_cl["date"] = pd.to_datetime(orderlines_cl["date"])

In [None]:
# check datatypes of the cleaned order lines
orderlines_cl.dtypes

In [None]:
#------------ PRODUCTS --------------------


# Duplicates: share of duplicated (True) vs. non-duplicated (False) rows
products.duplicated().value_counts(normalize=True)# there are a lot of duplicates, almost 50%

In [None]:
# start the cleaned products table from the rows that remain after removing exact duplicates
products_cl = products.drop_duplicates()

In [None]:
# missing values: count of missing entries per column
products_cl.isna().sum()

In [None]:
# how much corrupted data in price?
# (raw string so the regex escapes are not interpreted as string escapes)
products_cl["price"].str.contains(r"\d+\.\d+\.\d+").value_counts(normalize=True)
# 3.5% have two dots in price, we can drop that

In [None]:
# dropping corrupted rows: keep every row whose price does not contain two dots
# na=False means rows with a missing price are not flagged here and are kept.
# (raw string so the regex escapes are not interpreted as string escapes)
products_cl = products_cl.loc[~products_cl["price"].str.contains(r"\d+\.\d+\.\d+", na=False), :]

In [None]:
# drop missing data: remove every row that lacks a price, desc or type
# dropna(subset=...) does the three filters in one step and refers to the
# columns by name instead of attribute access. The .copy() makes products_cl
# an independent DataFrame, so the new columns added in later cells do not
# trigger pandas' SettingWithCopyWarning.
products_cl = products_cl.dropna(subset=["price", "desc", "type"]).copy()

In [None]:
# and then change it to numeric to also check if that worked
#products_cl["price"] = pd.to_numeric(products_cl["price"])

In [None]:
# how much corrupted data in promo_price?
# (raw string so the regex escapes are not interpreted as string escapes)
products_cl["promo_price"].str.contains(r"\d+\.\d+\.\d+").value_counts(normalize=True)
# 43%, we cannot drop that, we have to fix it

In [None]:
# The fix depends on the position of the first dot in price:
# get the number of digits before the dot (if there is one),
# then cut the promo price at that position and insert the dot there.
#products_cl["price"] = products_cl["price"].astype(str)
# "decimals" holds the part of price before the first dot, "len" its number of characters
products_cl["decimals"] = products_cl["price"].str.split(".", expand=True)[0]
products_cl["len"] = products_cl["decimals"].apply(len)
# look at 10 random rows
products_cl.sample(10)

In [None]:
# promo_price with every dot removed; the dot is re-inserted at the right place later.
# regex=False makes "." a literal dot on every pandas version instead of
# relying on the version-dependent default of str.replace.
products_cl["fixed_promo"] = products_cl["promo_price"].str.replace(".", "", regex=False)

In [None]:
# function to place the decimal point at the right spot in the string
def decs(df):
    """Re-insert the decimal point into one row's dot-stripped promo price.

    Parameters
    ----------
    df : mapping-like row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Despite the name this is a single row, not a DataFrame. It must
        provide ``"fixed_promo"`` (the promo price with all dots removed) and
        ``"len"`` (how many characters go before the decimal point).

    Returns
    -------
    str or float
        ``fixed_promo`` with a ``"."`` inserted after its first ``len``
        characters, or NaN when ``fixed_promo`` is not a string (e.g. missing).

    The row itself is left unchanged.
    """
    digits = df["fixed_promo"]
    # A missing value is not a string and cannot be sliced, so pass it through as NaN.
    if not isinstance(digits, str):
        return float("nan")
    split_at = int(df["len"])
    return digits[:split_at] + "." + digits[split_at:]

In [None]:
# applying function on promo_price
# decs is evaluated for every row, but its result is only stored in "new" for
# the rows whose promo_price contains two dots (the assignment aligns on the index).
# na=False keeps a missing promo_price from putting NaN into the boolean mask.
promo_two_dots = products_cl["promo_price"].str.contains(r"\d+\.\d+\.\d+", na=False)
products_cl.loc[promo_two_dots, "new"] = products_cl.apply(decs, axis=1)

In [None]:
# look at 10 random rows
products_cl.sample(10)

In [None]:
# fixing the column: cast the rebuilt prices to float and keep two decimal places
products_cl["new"] = products_cl["new"].astype(float).round(2)

In [None]:
# finally change the dtype of price to float
products_cl["price"] = products_cl["price"].astype(float)

In [None]:
#products_cl[products_cl["promo_price"]>products_cl["price"]]

In [None]:
# replace promo price where promo price has two dots with new
# The mask is computed once and used on both sides; the assignment aligns on
# the index, so each flagged row receives its own "new" value.
# na=False makes entries that are not strings count as "no match" instead of
# putting NaN into the boolean mask.
promo_two_dots = products_cl["promo_price"].str.contains(r"\d+\.\d+\.\d+", na=False)
products_cl.loc[promo_two_dots, "promo_price"] = products_cl.loc[promo_two_dots, "new"]

In [None]:
# overview of columns, non-null counts and dtypes
products_cl.info()

In [None]:
# make sure the helper column "new" is stored as float
products_cl["new"] = products_cl["new"].astype(float)

In [None]:
# convert promo_price to float and round it to two decimal places
products_cl["promo_price"] = products_cl["promo_price"].astype(float).round(2)

In [None]:
# now check for wrong digits by sorting out differences that are unreasonable
# For rows whose promo_price is more than 3 times the price, "new" is rebuilt
# with decs. decs returns strings, so the result is cast to float before it is
# stored; otherwise "new" would become a mixed string/float column and the
# arithmetic done on it afterwards would fail.
too_high = products_cl["promo_price"] > 3 * products_cl["price"]
products_cl.loc[too_high, "new"] = products_cl.apply(decs, axis=1).astype(float)

In [None]:
# For rows whose promo_price is more than 3 times the price, replace it with
# "new" divided by 10. "new" is coerced to numeric first so the division also
# works if the column holds strings.
# NOTE(review): the factor 10 is taken over unchanged - confirm it is intended.
too_high = products_cl["promo_price"] > 3 * products_cl["price"]
products_cl.loc[too_high, "promo_price"] = pd.to_numeric(products_cl["new"]).div(10)

In [None]:
# rows where promo_price is more than 3 times the price
products_cl.loc[products_cl["promo_price"] > 3* products_cl["price"], :]

In [None]:
# change to numeric to also check if this worked
# (pd.to_numeric raises if a value cannot be parsed)
products_cl["promo_price"] = pd.to_numeric(products_cl["promo_price"])

In [None]:
# check datatypes of the cleaned products table
products_cl.dtypes

In [None]:
# Sanity check: compute price - promo_price and look at its mean/median/percentiles
# and outliers to see whether the numbers make sense.
products_cl["price"] = pd.to_numeric(products_cl["price"])
products_cl["new_promo"] = pd.to_numeric(products_cl["new"]).round(2)
products_cl["discount"] = products_cl["price"].sub(products_cl["promo_price"])
products_cl["discount"].describe()

In [None]:
#products_cl.to_parquet("products_cl.parquet")

In [None]:
#orders_cl.to_parquet("orders_cl.parquet")

In [None]:
#orderlines_cl.to_parquet("orderlines_cl.parquet")