In [140]:
import pandas as pd
from shared_functions import * 
import os


In [141]:
# Notebook display settings: show up to 25 columns and truncate cell text
# to 20 characters so wide frames stay readable in cell output.
pd.options.display.max_columns = 25
pd.options.display.max_colwidth = 20

In [142]:
# Name of this notebook, used to label entries written to the log.
# VS Code injects `__vsc_ipynb_file__` into the kernel namespace; Jupyter
# Server (>= 2.0) exposes the notebook path through the JPY_SESSION_NAME
# environment variable instead. Fall back to a placeholder so a run under
# another front end (nbconvert, papermill, ...) does not die with KeyError.
notebook = os.path.basename(
    globals().get("__vsc_ipynb_file__")
    or os.environ.get("JPY_SESSION_NAME")
    or "unknown_notebook.ipynb"
)

Open File

In [143]:
# Path to the raw user-profile CSV.
# NOTE(review): `raw_files_folder` is not defined in this notebook; presumably
# it is provided by the `from shared_functions import *` star import — confirm.
source_path = f"{raw_files_folder}/userprofile.csv"

In [144]:
# Load the raw file into `df_orig`. The notebook name is passed along,
# presumably so the helper can label its log entries — TODO confirm in
# shared_functions.
df_orig = read_files(source_path, notebook)  

Validate column names in the source

In [145]:
# Column names actually present in the loaded file, as a plain list.
source_columns = df_orig.columns.tolist()

In [146]:
# Column names the user-profile source file is expected to contain.
expected_columns = [
    "userID",
    "latitude",
    "longitude",
    "smoker",
    "drink_level",
    "dress_preference",
    "ambience",
    "transport",
    "marital_status",
    "hijos",
    "birth_year",
    "interest",
    "personality",
    "religion",
    "activity",
    "color",
    "weight",
    "budget",
    "height",
]

In [147]:
# Compare the columns found in the file with the expected ones.
# NOTE(review): what happens on a mismatch is decided inside shared_functions
# (presumably it logs under the notebook name and stops) — confirm there.
source_columns_name_check(source_columns ,expected_columns, notebook)
   

Remove leading and trailing whitespace

In [148]:
# Whitespace-stripped data is bound to a new name.
# NOTE(review): whether strip_df copies or modifies `df_orig` in place is not
# visible from this notebook — confirm before relying on `df_orig` later.
df = strip_df(df_orig)

Remove rows where mandatory data are missing 


In [149]:
# Columns in which a missing value is not acceptable.
mandatory_columns_customers = [
    "userID",
    "smoker",
    "drink_level",
    "marital_status",
    "birth_year",
    "activity",
]


In [150]:
# Apply the null-handling helper to the mandatory columns listed above.
# Per the section heading this removes rows with missing mandatory data;
# the exact rule lives in shared_functions (TODO confirm).
df_mandatory_cols = mandatory_columns_null_handling(df, mandatory_columns_customers, notebook)


In [151]:
# Drop the irrelevant "hijos" ("children" in Spanish) column.
# A new frame is bound instead of dropping in place: re-running this cell no
# longer raises KeyError once the column is gone (errors="ignore"), and any
# other variable that still points at the same frame is left untouched.
df_mandatory_cols = df_mandatory_cols.drop(columns="hijos", errors="ignore")

Handle missing data in optional columns 

In [152]:
# Handle missing values in the remaining (optional) columns.
# NOTE(review): the *mandatory* column list is what gets passed in; presumably
# the helper treats every other column as optional — confirm in shared_functions.
df_optional_cols = optional_columns_null_handling(df_mandatory_cols, mandatory_columns_customers, notebook)


Replace specified characters and capitalise the first letter

In [153]:
# Text columns passed to the formatting step below.
format_columns_customers = [
    "smoker",
    "drink_level",
    "dress_preference",
    "ambience",
    "transport",
    "marital_status",
    "interest",
    "personality",
    "religion",
    "activity",
    "color",
    "budget",
]

In [154]:
# Characters to replace in the text columns: underscores and hyphens both
# map to a single space.
replace_char_dict_customers = dict.fromkeys(("_", "-"), " ")

In [155]:
# Apply the character replacements to the listed text columns. Per the
# section heading the helper also capitalises the first letter; the exact
# behaviour lives in shared_functions (TODO confirm).
df_format_cols = format_columns(df_optional_cols, format_columns_customers, replace_char_dict_customers, notebook)


Constraints

In [156]:
# Convert height from metres to whole centimetres.
# Round before casting: float products such as 1.15 * 100 evaluate to
# 114.99999999999999, which a bare astype("int64") would truncate to 114.
# NOTE: this overwrites the column, so running the cell twice scales the
# values twice — re-run from the formatting cell above if that happens.
df_format_cols["height"] = (df_format_cols["height"] * 100).round().astype("int64")

In [157]:
def constraints(df=None, notebook_name=None, log_path="log.txt"):
    """Check that key customer columns only contain allowed values.

    Every rule must hold for all rows. On the first violated rule (or any
    other exception raised while checking, e.g. a missing column) the
    traceback is appended to the log file and the run is stopped.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to validate. Defaults to the notebook-level ``df_format_cols``.
    notebook_name : str, optional
        Name written to the log entry. Defaults to the notebook-level
        ``notebook``.
    log_path : str, optional
        File the failure report is appended to. Defaults to ``"log.txt"``.

    Raises
    ------
    SystemExit
        With exit status 1 when a rule is violated or the check itself fails,
        so that a calling process does not mistake the failure for success.
    """
    # Imported locally so the function does not rely on the star import from
    # shared_functions happening to expose these modules.
    import sys
    import traceback

    if notebook_name is None:
        notebook_name = notebook

    # Allowed values per column, checked in this order.
    allowed_values = {
        "smoker": ["False", "True"],
        "birth_year": list(range(1920, 2015)),
        "marital_status": ["Single", "Married", "Divorced", "Widow"],
        "height": list(range(70, 230)),
        "weight": list(range(40, 300)),
    }

    try:
        if df is None:
            df = df_format_cols

        for column, allowed in allowed_values.items():
            is_valid = df[column].isin(allowed)
            if not is_valid.all():
                # Name the column and a sample of offending values so the log
                # entry says which rule failed, not just that one did.
                offending = df.loc[~is_valid, column].unique().tolist()[:10]
                raise ValueError(
                    f"Constraint violated in column '{column}': "
                    f"unexpected value(s) {offending}"
                )

    except Exception:
        trace = traceback.format_exc(limit=1)

        with open(log_path, "a") as f:
            print(f"An error occured running the <{notebook_name}>", file=f)
            # Blank separator line belongs in the log, not on stdout.
            print(file=f)
            print(trace, file=f)

        sys.exit(1)
            

In [158]:
# Run the value checks; on any failure the function writes to log.txt and
# calls sys.exit, so the cells below only run on validated data.
constraints()



Remove duplicate rows and collect the unique user IDs

In [159]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence of each.
df_final_customers = df_format_cols.drop_duplicates(keep="first")


In [160]:
# Distinct user IDs present in the cleaned customer data.
customer_unique_cust_id = pd.unique(df_final_customers["userID"])

In [161]:
# Persist the ID array with IPython's %store magic so another notebook can
# load it back with `%store -r customer_unique_cust_id`.
%store customer_unique_cust_id

Stored 'customer_unique_cust_id' (ndarray)
