---

## Investigating Fraud Cases

In [None]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
# Load the training data from CSV; the path is relative to the working directory.
data_train_clean = pd.read_csv("data/data_train_clean.csv")

In [None]:
# Rows flagged as fraud. `.copy()` makes this an independent frame, so adding
# columns to it afterwards does not trigger pandas' SettingWithCopyWarning.
frauds = data_train_clean.query('FraudResult == 1').copy()

In [None]:
# Transaction value expressed in millions; used as the y-axis of later bar plots.
frauds['ValueInMio'] = frauds['Value'] / 1_000_000

In [None]:
# Preview the first rows of the fraud subset.
frauds.head()

In [None]:
!pip install plotly

In [None]:
# Non-null count of every column per product category, within the fraud subset.
frauds.groupby('ProductCategory').count()

In [None]:
# Share of the total transaction value that belongs to fraud rows, in percent.
sum_fraud = data_train_clean.loc[data_train_clean['FraudResult'] == 1, 'Value'].sum()
sum_fraud / data_train_clean['Value'].sum() * 100

In [None]:
# Share of transactions (counted by non-null TransactionId) that are fraud rows, in percent.
sum_fraud2 = data_train_clean.loc[data_train_clean['FraudResult'] == 1, 'TransactionId'].count()
sum_fraud2 / data_train_clean['TransactionId'].count() * 100

In [None]:
import plotly.express as px

# Pie chart: share of the total transaction value per fraud label.
# 'Value' is selected before aggregating so that only this column is summed;
# summing every column first is wasteful and may raise for columns that
# cannot be summed.
a = px.pie(
    data_train_clean.groupby('FraudResult')[['Value']].sum().reset_index(),
    values='Value',
    names='FraudResult',
    title='Volume of Fraud transactions',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
a.update_layout(title_x=0.5)

                                  
                
#a.write_image('graphs/Volume_of_fraud_transactions.png')

In [None]:
pip install -U kaleido

In [None]:

# Pie chart: share of the fraud value per DebitCredit label.
# 'Value' is selected before aggregating so that only this column is summed;
# summing every column first is wasteful and may raise for columns that
# cannot be summed.
a = px.pie(
    frauds.groupby('DebitCredit')[['Value']].sum().reset_index(),
    values='Value',
    names='DebitCredit',
    title='Frauds regarding Debit and Credit',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
a.update_layout(title_x=0.5)
#fig.savefig('../graphs/distribution-conditions.png', dpi=300, transparent=True, pad_inches=1)

In [None]:
# Non-null count of every column per product category, over all transactions.
data_train_clean.groupby('ProductCategory').count()

In [None]:
#frauds.groupby(['AccountId', 'CustomerId']).mean()

In [None]:
# Heatmap of the pairwise correlation coefficients within the fraud subset.
# The frame is restricted to numeric/boolean columns explicitly: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises instead.
plt.figure(figsize=(20, 10))
corr = frauds.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=True,
)
# Rotate the column labels so they do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
import matplotlib.patches as mpatches
# Legend handles (one colored patch per class) passed to plt.legend() by the
# grouped bar charts further down.
pop_a = mpatches.Patch(color='darkcyan', label='legal')
pop_b = mpatches.Patch(color='cyan', label='fraud')

In [None]:
# Number of fraud rows per weekday.
# `color` takes a single matplotlib color, not a list of colors.
g = sns.histplot(data=frauds, x="TransactionWeekday", color='darkcyan')
plt.suptitle('Frauds per Weekday', fontsize=16)
plt.ylabel('Number of frauds', fontsize=14)
plt.xlabel('Weekday', fontsize=14);

In [None]:
# Fraud value (in millions) per weekday.
# NOTE: sns.barplot aggregates with the mean by default, so each bar shows the
# average fraud value of that weekday, not the summed volume.
# A single bar color is passed via `color`; `palette` is meant for hue levels.
g = sns.barplot(data=frauds, x="TransactionWeekday", y='ValueInMio', color='darkcyan')
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Transaction per Weekday', fontsize=14)
plt.yticks(ticks=np.linspace(0, 3.5, 8))
plt.suptitle('Fraudvolume per Weekday', fontsize=16);


In [None]:
# Fraud rows whose ChannelId equals 2.
frauds.query('ChannelId == 2')

In [None]:
# Number of fraud rows per time of day.
# `color` takes a single matplotlib color, not a list of colors.
g = sns.histplot(data=frauds, x="DayTime", color='darkcyan')
plt.suptitle('Frauds per Daytime', fontsize=16)
plt.ylabel('Number of frauds', fontsize=14)
plt.xlabel('Daytime', fontsize=14);

In [None]:
# Fraud value (in millions) per time of day.
# NOTE: sns.barplot aggregates with the mean by default, so each bar shows the
# average fraud value of that time of day, not the summed volume.
# A single bar color is passed via `color`; `palette` is meant for hue levels.
g = sns.barplot(data=frauds, x="DayTime", y='ValueInMio', color='darkcyan')
plt.suptitle('Fraudvolume per Daytime', fontsize=16)
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Daytime', fontsize=14);

In [None]:
# Fraud subset, rebuilt from the full data. The derived 'ValueInMio' column is
# re-created here as well, so the earlier cells that plot it keep working when
# they are re-run after this cell.
frauds = (
    data_train_clean[data_train_clean["FraudResult"] == 1]
    .assign(ValueInMio=lambda df: df["Value"] / 1000000)
)

In [None]:
# Summed transaction value for every (fraud label, provider) combination.
provider_compare = (
    data_train_clean
    .groupby(["FraudResult", "ProviderId"])["Value"]
    .sum()
    .reset_index()
)
provider_compare

In [None]:
# Summed transaction value for every (fraud label, product category) combination.
product_compare = (
    data_train_clean
    .groupby(["FraudResult", "ProductCategory"])["Value"]
    .sum()
    .reset_index()
)
product_compare

In [None]:
# Summed transaction value for every (fraud label, pricing strategy) combination.
pricing_compare = (
    data_train_clean
    .groupby(["FraudResult", "PricingStrategy"])["Value"]
    .sum()
    .reset_index()
)
pricing_compare

In [None]:
# Summed transaction value for every (fraud label, channel) combination.
channel_compare = (
    data_train_clean
    .groupby(["FraudResult", "ChannelId"])["Value"]
    .sum()
    .reset_index()
)
channel_compare.head(10)

In [None]:
# Summed fraud value per product category.
# product_compare holds plain sums of 'Value', so they are divided by one million
# here to match the "in MIO" unit stated on the y-axis.
sns.barplot(
    x="ProductCategory",
    y="Value",
    data=product_compare.query('FraudResult ==1').assign(
        Value=lambda df: df["Value"] / 1_000_000
    ),
    palette=['darkcyan', 'darkslategrey', 'cyan', 'cadetblue'],
)
plt.suptitle('Fraudvolume per Product Category', fontsize=16)
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Product Category', fontsize=14);

In [None]:
# Summed transaction value per channel, split by fraud label.
# channel_compare holds plain sums of 'Value', so they are divided by one million
# here to match the "in MIO" unit stated on the y-axis.
# The palette has one color per hue level, matching the legend patches
# (darkcyan = legal, cyan = fraud).
sns.barplot(
    data=channel_compare.assign(Value=lambda df: df["Value"] / 1_000_000),
    x="ChannelId",
    y="Value",
    hue="FraudResult",
    palette=['darkcyan', 'cyan'],
)
# NOTE(review): assumes exactly five channels in this order — confirm against the data.
plt.xticks(ticks=range(5), labels=['web', 'android', 'IOS', 'checkout', 'pay later'])
plt.legend(handles=[pop_a, pop_b])
plt.suptitle('Fraudvolume per Channel', fontsize=16)
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Channel', fontsize=14);

In [None]:
# Number of fraud rows per channel.
# `color` takes a single matplotlib color, not a list of colors.
p = sns.histplot(data=frauds, x="ChannelId", color='darkcyan')
plt.suptitle('Frauds per Channel', fontsize=16)
plt.ylabel('Number of frauds', fontsize=14)
plt.xlabel('Channel', fontsize=14)
# Trailing semicolon suppresses the tick objects' repr in the cell output.
plt.xticks(ticks=[0, 1, 2, 3, 4]);
#plt.xticks( ticks = range(5) ,labels = ['web','android', 'IOS', 'checkout','pay later'] );

In [None]:
# Summed transaction value per provider, split by fraud label.
# provider_compare holds plain sums of 'Value', so they are divided by one million
# here to match the "in MIO" unit stated on the y-axis.
# The palette has one color per hue level, matching the legend patches
# (darkcyan = legal, cyan = fraud).
sns.barplot(
    data=provider_compare.assign(Value=lambda df: df["Value"] / 1_000_000),
    x="ProviderId",
    y="Value",
    hue="FraudResult",
    palette=['darkcyan', 'cyan'],
)
plt.legend(handles=[pop_a, pop_b])
plt.suptitle('Fraudvolume per Provider', fontsize=16)
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Provider', fontsize=14);

In [None]:
# Number of fraud rows per provider.
# `color` takes a single matplotlib color, not a list of colors.
p = sns.histplot(data=frauds, x="ProviderId", color='darkcyan')
plt.suptitle('Frauds per Provider', fontsize=16)
plt.ylabel('Number of frauds', fontsize=14)
plt.xlabel('Provider', fontsize=14);

In [None]:
# Summed transaction value per product category, split by fraud label.
# product_compare holds plain sums of 'Value', so they are divided by one million
# here to match the "in MIO" unit stated on the y-axis.
# The palette has one color per hue level, matching the legend patches
# (darkcyan = legal, cyan = fraud).
sns.barplot(
    data=product_compare.assign(Value=lambda df: df["Value"] / 1_000_000),
    x="ProductCategory",
    y="Value",
    hue="FraudResult",
    palette=['darkcyan', 'cyan'],
)
# Legend built from the colored patches instead of the raw 0/1 hue labels.
plt.legend(handles=[pop_a, pop_b])
plt.suptitle('Fraudvolume per Product Category', fontsize=16)
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Product Category', fontsize=14)
# Rotate and right-align the category labels in a single call.
plt.xticks(rotation=45, ha='right');

In [None]:
# Summed transaction value per pricing strategy, split by fraud label.
# pricing_compare holds plain sums of 'Value', so they are divided by one million
# here to match the "in MIO" unit stated on the y-axis.
# The palette has one color per hue level (0 and 1).
sns.barplot(
    data=pricing_compare.assign(Value=lambda df: df["Value"] / 1_000_000),
    x="PricingStrategy",
    y='Value',
    hue="FraudResult",
    palette=['darkcyan', 'cyan'],
)
plt.suptitle('Fraudvolume per Pricing Strategy', fontsize=16)
plt.ylabel('Value UGX in MIO', fontsize=14)
plt.xlabel('Pricing Strategy', fontsize=14)
# The bars sit at positions 0..k-1 of a categorical axis. Seaborn's own tick
# positions are kept so every label stays under its bar; forcing ticks to 1..4
# would shift the labels by one position.
plt.xticks(ha='center');

In [None]:
# Sum of FraudResult for each value of 'TransactionsToDate'
# (the number of fraud rows, given a 0/1 label).
TransactionsToDate = data_train_clean.groupby(["TransactionsToDate"], as_index=False)["FraudResult"].sum()
# Keep only the values with more than two fraud rows and display them.
TTDa = TransactionsToDate.query('FraudResult > 2')
TTDa

In [None]:
# Fraud counts for the 'TransactionsToDate' values kept in TTDa.
g = sns.barplot(
    x="TransactionsToDate",
    y="FraudResult",
    data=TTDa,
    palette=['darkcyan', 'cyan', 'cadetblue'],
)
plt.suptitle('Previous Transactions', fontsize=16)
g.set_xlabel("Transactions to Date", fontsize=14)
g.set_ylabel("Number of Frauds", fontsize=14);

In [None]:
# Number of fraud rows per value of 'TransactionInBatch'.
# `color` takes a single matplotlib color, not a list of colors.
batch = sns.histplot(data=frauds, x='TransactionInBatch', color='darkcyan')
plt.ylabel('Number of frauds', fontsize=14)
plt.xlabel('Transaction per Batch', fontsize=14)
plt.xticks(ticks=np.linspace(1, 4.0, 4), ha='center')
plt.suptitle('Frauds per Batchsize', fontsize=16);

Erkenntnisse:
- Insgesamt gibt es in frauds nur 5 Fälle mit DebitCredit = 1 (Credit).
- Frauds gibt es in allen Pricing-Strategien; in Strategie 1 nur mit DebitCredit = 0.
- Frauds gibt es an allen Wochentagen.
- Frauds treten gehäuft bei ProductId 15 auf.
- Frauds treten vor allem bei ChannelId 3 auf; alle 5 Credit-Frauds fallen in ChannelId 2 (ausschließlich Credits).
- Frauds gibt es bei allen ProviderIds; alle 5 Credit-Frauds fallen in ProviderId 4 (ausschließlich Credits).
- Frauds verteilen sich recht gleichmäßig auf die verschiedenen Tageszeiten (nachts etwas weniger).