# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools


from eda.chart_utils import plot_categorical_variable
from feat.feature_builder import FeatureHandler

import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

# Data import

In [None]:
# Load the raw customer and transaction tables.
customers = pd.read_csv("/workspaces/hnb/data/customer_data.csv")
trans = pd.read_csv("/workspaces/hnb/data/transactions_data.csv")
# Parse the "YYYY-MM-DD" strings in a single vectorised call instead of calling
# strptime once per row; a string that does not match the format still raises.
trans["Date"] = pd.to_datetime(trans["Date"], format="%Y-%m-%d")

* The transactions data is free of missing values.
* The customers data has missing values in Education, and for users without a loyalty card there is no information in Loyalty Points.

In [None]:
# Preview the first three rows of the transactions table.
trans.head(n=3)

* It is possible to compute the percentage of incomplete transactions for each user.

In [None]:
# Group the "Incomplete Transaction" flag by customer once and derive both the
# per-customer sum of the flag and the per-customer row count from that grouping.
per_customer = trans[["Customer ID", "Incomplete Transaction"]].groupby("Customer ID")
num_intrans = per_customer.sum()
num_trans = per_customer.count()

# Sum divided by count: the per-customer share of flagged transactions.
c = num_intrans.div(num_trans)

# Attach ratio, sum and count to the customer table, in that order. All three
# frames carry a column called "Incomplete Transaction", so (assuming `customers`
# has no column of that name) the default merge suffixes leave the ratio in
# "Incomplete Transaction_x", the sum in "Incomplete Transaction_y" and the
# count in the un-suffixed column.
cc = (
    customers.merge(c, on="Customer ID")
    .merge(num_intrans, on="Customer ID")
    .merge(num_trans, on="Customer ID")
)

# Histogram of the per-customer ratio.
plt.hist(cc["Incomplete Transaction_x"], bins=30)
plt.show()


In [None]:
# Preview the first three rows of the customers table.
customers.head(n=3)

In [None]:
# Join every transaction with its customer's attributes on "Customer ID"
# (pandas' default inner join).
complete_df = pd.merge(trans, customers, on="Customer ID")


In [None]:
# Preview the first three rows of the merged table.
complete_df.head(n=3)

In [None]:
# Wrap the merged frame in the project's FeatureHandler and run its
# feature-building step; later cells read the result from `vh.df`.
# NOTE(review): `run_feat_buider` looks like a misspelling of "builder" —
# confirm the method name in feat.feature_builder before renaming anything.
vh = FeatureHandler(complete_df)
vh.run_feat_buider()

# Transaction variables
### Univariate transaction variables study

In [None]:
# Hand the target flag and the merged frame's row index to the project's
# categorical plotting helper.
target_values = vh.df["Incomplete Transaction"].values
row_index = complete_df.index.values
plot_categorical_variable(target_values, row_index)

In [None]:
# Pass the Education values together with the index of the SAME frame they come
# from. The values are read from `complete_df`; using `trans.index` instead would
# give arrays of different length whenever the merge drops or duplicates rows.
plot_categorical_variable(
    complete_df["Education"].values,
    complete_df.index.values,
    violin=False,
)

In [None]:
# Per-category counts and incomplete-transaction rate for one categorical variable.
variable = "Education"
by_level = complete_df.loc[:, [variable, "Incomplete Transaction", "Region"]].groupby([variable])

# Non-null row counts per category (Region is kept so the printed table is unchanged).
count_val = by_level.count()
# Sum only numeric columns. NOTE(review): Region is presumably text — if so, a plain
# .sum() concatenates the strings on pandas >= 2.0 and the division below raises
# TypeError; with numeric_only=True that column is simply NaN in the ratio.
sum_val = by_level.sum(numeric_only=True)

print(count_val)
# Share of flagged (incomplete) transactions per category.
sum_val / count_val

# Multivariate transaction variables study 

In [None]:
# Numeric feature columns from the fourth column of vh.df onwards, with the
# target flag removed; their names drive the plots in the following cells.
numeric_features = vh.df.iloc[:, 3:].select_dtypes(include=['number'])
df_numeric = numeric_features.drop(columns=['Incomplete Transaction'])
distribution_cols = list(df_numeric.columns)


In [None]:
# For every numeric feature draw one grid row with two panels: a box plot of the
# feature (left) and its KDE split by the target flag (right).
n_rows = df_numeric.shape[1]
fig = plt.figure(figsize=(17, 80))
for row, col in enumerate(distribution_cols):
    # Left panel: box plot over all rows. The axes is passed explicitly instead
    # of relying on matplotlib's "current axes" state.
    ax = fig.add_subplot(n_rows, 2, 2 * row + 1)
    sns.boxplot(x=col, data=vh.df, linewidth=3, color="skyblue", ax=ax)
    ax.set_title(col, fontweight='bold')
    ax.set_xlabel(None)

    # Right panel: one density per target value. `fill=` replaces the
    # deprecated `shade=` keyword of seaborn.kdeplot.
    ax = fig.add_subplot(n_rows, 2, 2 * row + 2)
    sns.kdeplot(
        vh.df.loc[vh.df["Incomplete Transaction"] == 0, col],
        fill=True, color="salmon", label='Yes purchase', ax=ax,
    )
    sns.kdeplot(
        vh.df.loc[vh.df["Incomplete Transaction"] == 1, col],
        fill=True, color="skyblue", label='No purchase', ax=ax,
    )
    ax.set_title(col, fontweight='bold')
    ax.legend(fontsize=15)

plt.tight_layout()
# The original referenced `plt.show` without calling it, so the call never ran.
plt.show()

### Analysis of multiple variables in joint plots

In [None]:
# Every ordered pair of numeric feature names (self-pairs included), in the
# same order a Cartesian product would yield them; then show how many there are.
combis = [(first, second) for first in distribution_cols for second in distribution_cols]
len(combis)


In [None]:
# Save one joint plot per ordered pair of distinct numeric features, coloured by
# the target flag. No figure is created up front: sns.jointplot builds its own,
# so the original's extra 17x80 figure was never drawn on and never closed.

# Gaussian jitter (sd 0.3), one value per row of vh.df, reused for every pair.
noise_x = np.random.normal(0, 0.3, vh.df.shape[0])
noise_y = np.random.normal(0, 0.3, vh.df.shape[0])

for a, b in combis:
    if a == b:
        continue
    # Read the data from vh.df: the column names in `combis` come from vh.df and
    # the jitter vectors are sized to it, so values, noise and hue always line up.
    sns.jointplot(
        x=vh.df[a] + noise_x,
        y=vh.df[b] + noise_y,
        hue=vh.df["Incomplete Transaction"],
    )
    plt.savefig(f"/workspaces/hnb/images/eda/combi_{a}_{b}")
    # Close the figure jointplot just created so open figures do not pile up.
    plt.close()


# Analysis of correlations

In [None]:
# Lift the column display limit so every column of the summary is shown,
# then display descriptive statistics of the engineered frame.
pd.options.display.max_columns = None
vh.df.describe()


In [None]:
from scipy import stats

# Point-biserial correlation between the binary target and each numeric column.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
target = "Incomplete Transaction"

newdf = vh.df.select_dtypes(include=numerics)
for i in newdf.columns:
    if i == target:
        # Correlating the target with itself carries no information.
        continue
    try:
        # The binary variable goes first, as in scipy's documented signature;
        # the coefficient itself is symmetric in its two arguments.
        result = stats.pointbiserialr(vh.df[target], vh.df[i])
    except ValueError as err:
        # Report the skipped variable instead of dropping it silently.
        print(f"variable: {i} skipped ({err})")
    else:
        print(f"variable: {i} with point biserial correlation coefficient->", result)

In [None]:
# Imported here as well so this cell does not depend on another cell having run.
from scipy import stats

# Chi-square test of independence between each object-dtype column and the target.
newdf = vh.df.select_dtypes(include="object")
for i in newdf.columns:
    try:
        crosstab = pd.crosstab(index=vh.df[i], columns=vh.df["Incomplete Transaction"])
        # Element 1 of the result is the p-value.
        p_value = stats.chi2_contingency(crosstab)[1]
    except ValueError as err:
        # Report the skipped variable instead of dropping it silently.
        print(f"variable: {i} skipped ({err})")
    else:
        print(f"variable: {i} with chi-square p-value->", p_value)