# Klarna Case Study

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv

## Data Retrieval

In [None]:
def get_col_info(col_info_path='CaseStudyCols.csv'):
    """Read the column description file and return a name -> dtype mapping.

    The file is parsed as ';'-delimited CSV. Its first row is treated as a
    header and skipped; for every remaining row the first field is used as
    the column name and the second field as its dtype string.

    Args:
        col_info_path: Path of the column description CSV. Defaults to
            'CaseStudyCols.csv' in the current working directory.

    Returns:
        dict mapping column name (str) to dtype name (str). Empty if the
        file contains no data rows.
    """
    with open(col_info_path, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # Blank or truncated lines have fewer than two fields and would
        # otherwise raise IndexError below, so they are ignored.
        rows = [row for row in reader if len(row) >= 2]

    return {row[0]: row[1] for row in rows}


In [None]:
dataSet = 'dataset.csv'

# Author's note on dtypes:
#   The columns 'default' and 'worst_status_active_inv' are actually ints,
#   but since they contain NA values they are loaded as objects for now.
#
#   The entries where default = NA are the entries we are supposed to
#   predict (validate set?).

# Column name -> dtype mapping read from the column description file.
column_dtypes = get_col_info()

# The dataset is ';'-separated; default NA markers are kept as missing values.
df = pd.read_csv(dataSet, sep=';', dtype=column_dtypes, keep_default_na=True)

## Data Processing & Wrangling

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Strip out the NA entries for the column 'default'.
# The rows with a missing 'default' are first saved to a separate dataframe
# (validate_df); df then keeps only the rows where 'default' is present.
#
# .copy() is applied to the selected frames (not to the boolean mask) so that
# both results are independent DataFrames; later in-place modifications then
# do not trigger pandas' SettingWithCopyWarning.
default_missing = df['default'].isna()

validate_df = df[default_missing].copy()

df = df[~default_missing].copy()

df.info()

In [None]:
# Explore the dataset, in particular look at the NA counts.
# has_na collects [column name, fraction of NA rows rounded to 2 decimals]
# for every column that contains at least one NA value.
n_rows = len(df)
has_na = []
for col in df.columns:
    na_count = df[col].isna().sum()
    # Test the raw count rather than the rounded fraction: a column with only
    # a handful of NA values rounds to 0.0 and would otherwise be left out.
    # This also avoids dividing by zero when df has no rows.
    if na_count > 0:
        has_na.append([col, round(na_count / n_rows, 2)])


# Plot out the columns with NA (their percentages) - no need to show it each time
# fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
# na_plot = sns.barplot(x=[n[0] for n in has_na], y=[n[1] for n in has_na],  ax=ax)
# na_plot.set_xticklabels(na_plot.get_xticklabels(), rotation=90, horizontalalignment='right')
# na_plot.set_title("% of values = NA");


In [None]:
# Print the frame summary again (columns, dtypes, non-null counts).
df.info()

In [None]:
# Columns whose recorded NA fraction exceeds one half are removed first.
mostly_missing = [entry[0] for entry in has_na if entry[1] > .5]
df.drop(columns=mostly_missing, inplace=True)

# Then remove the 'uuid' and 'default' columns and show what is left.
df.drop(columns=['uuid', 'default'], inplace=True)
df.info()

In [None]:
# Count the columns straight from the dtype selection. Going through
# .describe() computes summary statistics just to count columns, raises when
# the selection has no columns, and can report fewer columns than selected
# because describe() narrows mixed frames to a subset of dtypes.
numeric_cols = df.select_dtypes(include='number').columns
non_numeric_cols = df.select_dtypes(exclude='number').columns
print(f"There are {len(numeric_cols)} numeric columns now")
print(f"There are {len(non_numeric_cols)} NON numeric columns now")

In [None]:
# Subset of df holding only the columns with a numeric dtype.
numerics_df = df.select_dtypes(include='number')

In [None]:
# Print the variance (NA values skipped, rounded to 2 decimals) per numeric column.
for name in numerics_df.columns:
    variance = numerics_df[name].var(skipna=True)
    print(f"{name}:            {round(variance, 2)}")

In [None]:
# For each numeric column: summary statistics followed by its NA count.
for name in numerics_df.columns:
    series = numerics_df[name]
    print(series.describe())
    na_total = series.isna().sum()
    print(f"Number of NA values: {na_total}")
    print("=====================================\n")

In [None]:
def plotNumCols(col):
    """Draw a box plot and a value-frequency scatter plot for one column.

    A new 14x4 figure with two side-by-side axes is created: the left one
    shows a horizontal box plot of the values, the right one shows each
    distinct value against the number of times it occurs.

    Args:
        col: pandas Series to plot; its ``name`` is used as the title of
            both axes.
    """
    _, (box_ax, count_ax) = plt.subplots(1, 2, figsize=(14, 4))

    box_ax.set_title(col.name)
    # Pass the data by keyword: seaborn no longer accepts the series as a
    # positional x argument (a positional argument is interpreted as `data`).
    sns.boxplot(x=col, ax=box_ax)

    # value_counts() is indexed by the distinct values, not by row label, so
    # it cannot be paired with `col` itself (seaborn aligns Series on their
    # index). Plot the distinct values against their counts instead.
    counts = col.value_counts()
    count_ax.set_title(col.name)
    sns.scatterplot(x=counts.index, y=counts.values, ax=count_ax)
    count_ax.set_xlabel(col.name)
    count_ax.set_ylabel("count")


In [None]:
# Render the plots for every numeric column, one figure per column.
for name in numerics_df.columns:
    series = numerics_df[name]
    plotNumCols(series)