# Klarna Case Study

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge


## Data Retrieval

In [None]:
def get_col_info(col_info_path='CaseStudyCols.csv'):
    """Read the column-name -> dtype mapping used when loading the dataset.

    The file is parsed as a ';'-delimited CSV. The first row is treated as a
    header and skipped; every following row contributes its first field as
    the column name and its second field as the dtype.

    Args:
        col_info_path: Path of the column description file. Defaults to
            'CaseStudyCols.csv'.

    Returns:
        dict mapping column name -> dtype string, in file order. Empty when
        the file has no data rows.
    """
    with open(col_info_path, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # Blank or truncated lines yield fewer than two fields and would
        # raise IndexError if indexed, so they are skipped.
        return {row[0]: row[1] for row in reader if len(row) >= 2}

In [None]:
"""
Note that in reality the columns 'default' and
'worst_status_active_inv' are actually ints, but
since they have NA values, we load them as objects
for now.

Also note that the entries where default = NA are the
entries that we are supposed to predict (validation set?)
"""

# Load the raw dataset; the per-column dtypes come from get_col_info().
df = pd.read_csv(
    'dataset.csv',
    delimiter=';',
    dtype=get_col_info(),
    keep_default_na=True,
)

## Data Processing & Wrangling

In [None]:
# Split off the rows whose 'default' value is NA: they are kept in a
# separate dataframe, and the remaining rows stay in `df`.
missing_default = df['default'].isna()

# .copy() is applied to the selected rows (not to the boolean mask), so each
# result is an independent frame and the column assignment below does not
# write into a slice of the original data (SettingWithCopyWarning).
defaults_df = df[missing_default].copy()

df = df[~missing_default].copy()
df['default'] = pd.to_numeric(df['default']).astype('int32')

df.info()

## EDA

## Columns with higher percentage of NA

In [None]:
# Explore the dataset: fraction of NA values per column, rounded to 2 decimals
na_fractions = [
    (name, round(df[name].isna().sum() / len(df), 2))
    for name in df.columns
]

# Keep only the columns whose rounded NA fraction is above zero,
# as [column name, fraction] entries
has_na = [[name, fraction] for name, fraction in na_fractions if fraction > 0]


# Plot out the columns with NA (their percentages) - no need to show it each time
# fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
# na_plot = sns.barplot(x=[n[0] for n in has_na], y=[n[1] for n in has_na],  ax=ax)
# na_plot.set_xticklabels(na_plot.get_xticklabels(), rotation=90, horizontalalignment='right')
# na_plot.set_title("% of values = NA");


In [None]:
## Let's explore some of the columns (other than default)
## Look in particular at the columns with high NA.
## Do we need those columns or can we simply drop them?

In [None]:
## Collect the columns whose NA fraction is too high to be useful.
# NA-fraction threshold: columns strictly above it are ignored
na_threshold = .5
cols_to_exclude = [name for name, fraction in has_na if fraction > na_threshold]

In [None]:
## We can see that 'merchant_category', 'merchant_group'
## and 'name_in_email' are categorical (object = string)
## These will either need to be ignored (not a good idea)
## or encoded.

In [None]:
# Print the value distribution of every column whose name starts
# with 'merchant'
merchant_cols = [name for name in df.columns if name.startswith('merchant')]
for name in merchant_cols:
    print(df[name].value_counts())
df.info()

## Look for correlations

In [None]:
def corr_matrix(df):
    """Return the pairwise feature correlations of *df* in long format.

    Only numeric and boolean columns take part. Other columns (e.g. strings)
    are left out explicitly, because `DataFrame.corr()` raises on them in
    pandas >= 2.0 instead of silently dropping them.

    Args:
        df: DataFrame whose columns are the features to compare.

    Returns:
        DataFrame with the columns 'feature_1', 'feature_2', 'correlation'
        and 'absolute_correlation', one row per ordered pair of distinct
        features (so each pair appears in both directions), sorted by
        'absolute_correlation' in descending order. Empty when *df* has no
        numeric or boolean column.
    """
    result_columns = [
        'feature_1', 'feature_2', 'correlation', 'absolute_correlation',
    ]

    numeric_df = df.select_dtypes(include=['number', 'bool'])
    if numeric_df.shape[1] == 0:
        # Nothing to correlate: return an empty frame of the usual shape.
        return pd.DataFrame(columns=result_columns)

    correlation_matrix = numeric_df.corr()

    # Convert the square correlation matrix into one row per feature pair
    corr_df = correlation_matrix.stack().reset_index()
    corr_df.columns = result_columns[:3]

    # Remove "self correlations". The copy makes the result independent of
    # the unfiltered frame, so adding a column below is a plain assignment.
    no_self_correlation = corr_df['feature_1'] != corr_df['feature_2']
    corr_df = corr_df[no_self_correlation].copy()

    # Absolute correlation, used to rank pairs regardless of sign
    corr_df['absolute_correlation'] = corr_df['correlation'].abs()

    return corr_df.sort_values(by='absolute_correlation', ascending=False)

# Show the first 10 rows of the long-format correlation table, i.e. the
# ordered feature pairs with the largest absolute correlation
corr_matrix(df).head(10)

In [None]:
## The original dataset already has 36 columns, somewhat fewer once the
## high-percentage NA columns are left out. Before getting into the pipeline,
## let's exclude some more columns that are probably not needed.

## First, columns that are highly correlated (corr > .8)
cols_to_exclude.extend([
    'max_paid_inv_0_24m',
    'num_arch_ok_0_12m',
    'status_max_archived_0_24_months',
])

## Next, categorical columns that 'probably' don't add much value.
## We still have the 'merchant_group' column, which should be useful.
cols_to_exclude.extend(['merchant_category', 'name_in_email'])

## Pipeline

In [None]:
# The target itself must not be a feature; 'uuid' is dropped as well
# (presumably a row identifier -- confirm against the column description).
cols_to_exclude += ['default', 'uuid']

# Defining the features and the target
X = df.drop(columns=cols_to_exclude)
y = df['default']

# Train-Test split (80/20). A fixed random_state keeps the split, and every
# score computed from it, identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Print a summary of the training features (column dtypes, non-null counts)
X_train.info()

In [None]:
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.preprocessing import MinMaxScaler, RobustScaler


In [None]:
# Impute (column mean) then scale numerical values. Scaling is done with
# RobustScaler; the step keeps its original 'standard_scaler' name.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', RobustScaler())
])

# Encode categorical values; categories not seen during fit are ignored
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Column lists for the parallel "num_transformer" / "cat_transformer"
# branches. The names are read straight from the dtype selection: going
# through .describe() computed summary statistics only to discard them, and
# raised ValueError whenever a selection contained no columns.
numeric_cols = X.select_dtypes(include='number').columns.to_list()
cat_cols = X.select_dtypes(include=['object', 'bool']).columns.to_list()

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, numeric_cols),
    ('cat_transformer', cat_transformer, cat_cols),
])

# Add estimator
# NOTE(review): Ridge is a regressor, so pipeline.score() reports R^2 on the
# integer 'default' target -- confirm that a classifier is not intended.
pipeline = make_pipeline(preprocessor, Ridge())


In [None]:
# Display the preprocessing ColumnTransformer (numeric + categorical branches)
preprocessor

In [None]:
X_train_transformed = preprocessor.fit_transform(X_train)

# ColumnTransformer may return a scipy sparse matrix when the one-hot output
# makes the overall density low; pd.DataFrame needs a dense array here.
if hasattr(X_train_transformed, "toarray"):
    dense_values = X_train_transformed.toarray()
else:
    dense_values = X_train_transformed

print("Original training set")
display(X_train.head(5))

print("Preprocessed training set")
# Reuse the index of X_train so each preprocessed row can be matched to the
# original row shown above.
transformed_df = pd.DataFrame(
    dense_values,
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)

transformed_df.head(5)

In [None]:
# In principle, the scaling/encoding should not materially change the correlations between the numeric features
# corr_matrix(transformed_df)

In [None]:
# Display the full pipeline (preprocessor followed by the Ridge estimator)
pipeline

In [None]:
# Fit the whole pipeline (preprocessing + estimator) on the training split
pipeline.fit(X_train, y_train)

# Quick sanity check: predict for the first test row only
pipeline.predict(X_test.iloc[:1])

# Evaluate the fitted pipeline on the held-out test split
pipeline.score(X_test, y_test)