# DATA CLEANING NOTEBOOK
Felix A. Westphal
DLMDWME01

### Import

In [38]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder

### Parameter

In [39]:
# --- Input/output locations (relative paths) and the timestamp layout used for parsing.
# Excel workbook that is loaded as the raw data set.
FILE_PATH_RAW_DATA = "../data/raw/PSP_Jan_Feb_2019.xlsx"
# CSV written after type and value-range cleaning.
FILE_PATH_CLEANED_DATA = "../data/processed/Cleaned_Input_Data.csv"
# CSV written after one-hot encoding of the categorical columns.
FILE_PATH_ENCODED_DATA = "../data/processed/Encoded_Input_Data.csv"
# strptime/strftime pattern: year-month-day hour:minute:second.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

### Value Types and Ranges

In [40]:
# --- Expected schema of the data set: column names, target dtype and valid values per column.
COLUMN_HEADERS = ["tmsp", "country", "amount", "success", "PSP", "3D_secured", "card"]

# Two-element ranges are read as [min, max] by the range check; None means "no bound".
VALUE_RANGE_TMSP = [
    datetime.strptime("2010-01-01 00:00:00", TIMESTAMP_FORMAT),
    datetime.now(),
]
VALUE_RANGE_AMOUNT = [0.0, None]

# Ranges of any other length enumerate the allowed values of a column.
VALUE_RANGE_PSP = ["Moneycard", "Goldcard", "UK_Card", "Simplecard"]
VALUE_RANGE_CARD = ["Master", "Visa", "Diners"]

# Both lists are positionally aligned with COLUMN_HEADERS; None means "no range check".
DEFINED_TYPES = ["datetime64[ns]", "string", "float32", bool, "string", bool, "string"]
VALUE_RANGES = [
    VALUE_RANGE_TMSP,
    None,
    VALUE_RANGE_AMOUNT,
    None,
    VALUE_RANGE_PSP,
    None,
    VALUE_RANGE_CARD,
]

# Lookup table: column name -> {"type": target dtype, "range": valid values or None}.
value_range_dict = dict(
    zip(
        COLUMN_HEADERS,
        (
            {"type": dtype_spec, "range": allowed}
            for dtype_spec, allowed in zip(DEFINED_TYPES, VALUE_RANGES)
        ),
    )
)

### Load Raw Data, remove NaN values and duplicate rows

In [41]:
# --- Load the raw data file; the first column of the sheet becomes the row index.
raw_data = pd.read_excel(FILE_PATH_RAW_DATA, index_col=0)
print(f"Raw Data loaded from Excel: \n{raw_data.head()}")

# --- Report missing values per column and in total.
sum_nan_column = raw_data.isna().sum()
print(f"Number of NAN values per column: \n{sum_nan_column}")
total_sum_nan = sum_nan_column.sum()
print(f"Total number of NAN values: {total_sum_nan}")

# --- Drop incomplete rows.
# dropna() returns a new frame even when nothing is dropped, so `filtered_df` never
# aliases `raw_data` and later changes to it cannot leak back into the raw frame.
filtered_df = raw_data.dropna()
if total_sum_nan > 0:
    print(f"Filtered Data Frame: \n{filtered_df.head()}")
num_entries = len(filtered_df.index)
print(f"Number of entries of the Filtered Data Frame: {num_entries}")

# --- Drop fully duplicated rows (no in-place mutation, so re-running the cell is safe).
num_entries_before_dedup = num_entries
filtered_df = filtered_df.drop_duplicates()
num_entries = len(filtered_df.index)
print(f"Number of removed duplicated entries: {num_entries_before_dedup - num_entries}")
print(f"Number of entries of the Filtered Data Frame after dropping duplicates: {num_entries}")

Raw Data loaded from Excel: 
                 tmsp  country  amount  success         PSP  3D_secured  \
0 2019-01-01 00:01:11  Germany      89        0     UK_Card           0   
1 2019-01-01 00:01:17  Germany      89        1     UK_Card           0   
2 2019-01-01 00:02:49  Germany     238        0     UK_Card           1   
3 2019-01-01 00:03:13  Germany     238        1     UK_Card           1   
4 2019-01-01 00:04:33  Austria     124        0  Simplecard           0   

     card  
0    Visa  
1    Visa  
2  Diners  
3  Diners  
4  Diners  
Number of NAN values per column: 
tmsp          0
country       0
amount        0
success       0
PSP           0
3D_secured    0
card          0
dtype: int64
Total number of NAN values: 0
Number of entries of the Filtered Data Frame: 50410
Number of removed duplicated entries: 81
Number of entries of the Filtered Data Frame after dropping duplicates: 50329


### Check Value Types and Valid Value Ranges

In [42]:
print("\n========== CHECKING VALUE RANGES AND TYPES ==========")


def _convert_column(column, defined_type):
    """Return `column` converted to `defined_type`.

    Datetime and float targets are converted with errors="coerce", so entries that
    cannot be parsed become NaT/NaN (and are removed by the final dropna()) instead
    of aborting the run. Every other target type uses a plain astype().
    """
    target_dtype = pd.api.types.pandas_dtype(defined_type)
    if pd.api.types.is_datetime64_any_dtype(target_dtype):
        return pd.to_datetime(column, errors="coerce", format=TIMESTAMP_FORMAT)
    if pd.api.types.is_float_dtype(target_dtype):
        return pd.to_numeric(column, errors="coerce").astype(target_dtype)
    return column.astype(defined_type)


# --- Loop through columns and check whether the datatype is as defined, change it if not.
num_rows_filtered = len(filtered_df.index)
# Work on an own copy so that column assignments neither write into a frame that is
# still referenced elsewhere nor trigger chained-assignment warnings.
filtered_df = filtered_df.copy()
for column_header in COLUMN_HEADERS:
    print(f"--- Current Column: {column_header} ---")
    column = filtered_df[column_header]
    defined_type = value_range_dict[column_header]["type"]
    current_type = column.dtype
    print(f"Current Type / Defined Type: {current_type} / {defined_type}")
    if current_type != defined_type:
        column = _convert_column(column, defined_type)
        current_type = column.dtype
        print(f"Changed type to {current_type}")
    filtered_df[column_header] = column

    # --- Check whether values are within the defined value range.
    defined_value_range = value_range_dict[column_header]["range"]
    if not defined_value_range:
        continue

    # NOTE: a two-element range is interpreted as [min, max]; any other length is an
    # enumeration of allowed values (an enumeration of exactly two values is therefore
    # not expressible with this convention).
    if len(defined_value_range) == 2:
        min_value_defined, max_value_defined = defined_value_range
        # Compare against None explicitly: a bound of 0 / 0.0 is a valid limit.
        if min_value_defined is not None:
            min_value = column.min()        # Series.min() skips NaN/NaT
            print(f"Min Value / Defined min Value: {min_value} / {min_value_defined}")
            if min_value < min_value_defined:
                print("Found entry with smaller value than defined. Removing entry!")
                filtered_df = filtered_df[filtered_df[column_header] >= min_value_defined].copy()
        if max_value_defined is not None:
            max_value = column.max()        # Series.max() skips NaN/NaT
            print(f"Max Value / Defined max Value: {max_value} / {max_value_defined}")
            if max_value > max_value_defined:
                print("Found entry with larger value than defined. Removing entry!")
                filtered_df = filtered_df[filtered_df[column_header] <= max_value_defined].copy()
    else:
        # Keep only the rows whose value is one of the allowed values.
        filtered_df = filtered_df[filtered_df[column_header].isin(defined_value_range)].copy()

# --- Rows with values that could not be converted (NaN/NaT) are removed here.
cleaned_df = filtered_df.dropna()
num_rows_cleaned = len(cleaned_df.index)
num_rows_deleted = num_rows_filtered - num_rows_cleaned
print(f"\n--- Deleted {num_rows_deleted} rows from Filtered Data Frame. ---")
print(f"\n========== CLEANED DATA FRAME ========== \n{cleaned_df.head()}")
cleaned_df.to_csv(FILE_PATH_CLEANED_DATA, index=False)
print(f"Saved Cleaned Data Frame as .csv to: {FILE_PATH_CLEANED_DATA}")


--- Current Column: tmsp ---
Current Type / Defined Type: datetime64[ns] / datetime64[ns]
Min Value / Defined min Value: 2019-01-01 00:01:11 / 2010-01-01 00:00:00
Max Value / Defined max Value: 2019-02-28 23:48:19 / 2023-05-20 10:06:39.442688
--- Current Column: country ---
Current Type / Defined Type: object / string
Changed type to string
--- Current Column: amount ---
Current Type / Defined Type: int64 / float32
Changed type to float32
--- Current Column: success ---
Current Type / Defined Type: int64 / <class 'bool'>
Changed type to bool
--- Current Column: PSP ---
Current Type / Defined Type: object / string
Changed type to string
--- Current Column: 3D_secured ---
Current Type / Defined Type: int64 / <class 'bool'>
Changed type to bool
--- Current Column: card ---
Current Type / Defined Type: object / string
Changed type to string

--- Deleted 0 rows from Filtered Data Frame. ---

                 tmsp  country  amount  success         PSP  3D_secured  \
0 2019-01-01 00:01:11  G

### Categorical Encoding
One-hot encoding is used because the categorical features (country, PSP, card) are nominal, i.e. their values have no natural order.

In [43]:
print("\n========== ONE HOT ENCODING CATEGORICAL DATA ==========")
ohe = OneHotEncoder()


def _one_hot_encode(frame, column_name):
    """Fit `ohe` on a single column of `frame`.

    Returns a tuple (dense indicator matrix, category labels); the labels are in
    the same order as the columns of the matrix.
    """
    encoded = ohe.fit_transform(frame[[column_name]]).toarray()
    return encoded, ohe.categories_[0]


encoded_country, categories_country = _one_hot_encode(cleaned_df, "country")
print(f"Encoded the following country categories: {categories_country}")

encoded_psp, categories_psp = _one_hot_encode(cleaned_df, "PSP")
print(f"Encoded the following PSP categories: {categories_psp}")

encoded_card, categories_card = _one_hot_encode(cleaned_df, "card")
print(f"Encoded the following card categories: {categories_card}")

# --- Build the encoded frame as a NEW object: the original categorical columns are
#     dropped and one boolean indicator column per category is appended. `cleaned_df`
#     itself stays untouched, so it still matches the CSV saved from it.
indicator_frames = [
    pd.DataFrame(np.array(encoded, dtype=bool), columns=categories, index=cleaned_df.index)
    for encoded, categories in (
        (encoded_country, categories_country),
        (encoded_psp, categories_psp),
        (encoded_card, categories_card),
    )
]
encoded_df = pd.concat(
    [cleaned_df.drop(columns=["country", "PSP", "card"]), *indicator_frames],
    axis=1,
)

print(f"\n========== ENCODED DATA FRAME ========== \n{encoded_df.head()}")
encoded_df.to_csv(FILE_PATH_ENCODED_DATA, index=False)
print(f"Saved Encoded Data Frame as .csv to: {FILE_PATH_ENCODED_DATA}")


Encoded the following country categories: ['Austria' 'Germany' 'Switzerland']
Encoded the following PSP categories: ['Goldcard' 'Moneycard' 'Simplecard' 'UK_Card']
Encoded the following card categories: ['Diners' 'Master' 'Visa']

                 tmsp  amount  success  3D_secured  Austria  Germany  \
0 2019-01-01 00:01:11    89.0    False       False    False     True   
1 2019-01-01 00:01:17    89.0     True       False    False     True   
2 2019-01-01 00:02:49   238.0    False        True    False     True   
3 2019-01-01 00:03:13   238.0     True        True    False     True   
4 2019-01-01 00:04:33   124.0    False       False     True    False   

   Switzerland  Goldcard  Moneycard  Simplecard  UK_Card  Diners  Master  \
0        False     False      False       False     True   False   False   
1        False     False      False       False     True   False   False   
2        False     False      False       False     True    True   False   
3        False     False      F