In [88]:
# Importing dependencies
import os
from typing import List, Tuple
import numpy as np
import pandas as pd

In [None]:
# Reading data from csv and loading it into a DataFrame for in-memory processing
dogs = pd.read_csv("data/raw_data/coin_afrique-raw-dogs-data.csv")
goats = pd.read_csv("data/raw_data/coin_afrique-raw-goats-data.csv")
chickens_rabbits_pigeons = pd.read_csv("data/raw_data/coin_afrique-raw-chickens_rabbits_pigeons-data.csv")
other_animals = pd.read_csv("data/raw_data/coin_afrique-raw-other_animals-data.csv")

data_list = [
    ("dogs", dogs), ("goats", goats),
    ("chickens_rabbits_pigeons", chickens_rabbits_pigeons),
    ("other_animals", other_animals)
]

In [90]:
def prune_data(data: pd.DataFrame, keep_columns: List[str]) -> pd.DataFrame:
    return data[list(set(data.columns.tolist()) & set(keep_columns))]

In [91]:
def price_to_numeric(data: pd.DataFrame) -> pd.DataFrame:
    data_copy = data.drop_duplicates()

    # prices with space in them, results to NaN, so we remove the spaces and the currencies
    data_copy["prix"] = pd.to_numeric(data_copy["prix"].str.replace(" ", "").str.rstrip("CFA"), errors="coerce")

    return data_copy

In [92]:
def impute_nan(data: pd.DataFrame) -> pd.DataFrame:
    data_copy = data.copy()

    # Replacing missing prices by the Median
    data_copy["prix"] = data_copy["prix"].fillna(data_copy["prix"].median())

    # Replacing missing adresse and image_link with a constant
    data_copy["adresse"] = data_copy["adresse"].fillna("Adresse Inconnue")
    data_copy["image_lien"] = data_copy["image_lien"].str.lstrip("background-image:url(").str.rstrip(")")
    data_copy["image_lien"] = data_copy["image_lien"].fillna("Lien Inconnu")

    # Replacing Name/Details with a constant
    if "nom" in data_copy.columns:
        data_copy["nom"] = data_copy["nom"].fillna(data_copy["nom"].mode())
    elif "details" in data_copy.columns:
        data_copy["details"] = data_copy["details"].fillna("Details Inconnus")
    
    return data_copy

In [93]:
def treat_outliers(data: pd.DataFrame, factor=1.5) -> pd.DataFrame:
    data_copy = data.copy()
    for col in data_copy.select_dtypes(include="number").columns:
        if not (-0.5 <= data_copy[col].skew() <= 0.5):
            q1 = data_copy[col].quantile(0.25)
            q3 = data_copy[col].quantile(0.75)
            IQR = q3 - q1
            lowerBound = q1 - factor * IQR
            upperBound = q3 + factor * IQR
            data_copy[col] = data_copy[col].clip(lower=lowerBound, upper=upperBound)
    return data_copy

In [94]:
def data_to_csv(data: pd.DataFrame, table_name: str) -> None:
    data.to_csv(f"data/clean_data/cleaned-{table_name}-data.csv", index=False)

In [95]:
def main():
    keep_columns = ["details", "nom", "prix", "adresse", "image_lien"]
    for table_name, data in data_list:
        pruned_data = prune_data(data, keep_columns=keep_columns)
        type_casted_data = price_to_numeric(pruned_data)
        no_nan_data = impute_nan(type_casted_data)
        no_outliers_data = treat_outliers(no_nan_data)
        data_to_csv(no_outliers_data, table_name=table_name)

In [None]:
# Launching the Machine ðŸš‚  ;)
main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy["prix"] = pd.to_numeric(data_copy["prix"].str.replace(" ", "").str.rstrip("CFA"), errors="coerce")
