In [28]:
import pandas as pd

# Load the dataset and drop the 'Time' column
creditcard = pd.read_csv('creditcard.csv')

# drop the 'Time' column
creditcard = creditcard.drop(['Time'], axis=1)

# normalize 'V1' to 'V28' between -1 and 1 with a loop
for i in range(1, 29):
    creditcard['V'+str(i)] = (creditcard['V'+str(i)] - creditcard['V'+str(i)].min()) / (creditcard['V'+str(i)].max() - creditcard['V'+str(i)].min()) * 2 - 1

# normalize 'Amount' between -1 and 1
creditcard['Amount'] = (creditcard['Amount'] - creditcard['Amount'].min()) / (creditcard['Amount'].max() - creditcard['Amount'].min()) * 2 - 1

# create quartiles for 'Amount'
amount_quartiles = pd.qcut(creditcard['Amount'], 15, labels=['0', '1', '2', '3', '4','5','6','7','8','9','10','11','12','13','14'])
amount_quartile_dummies = pd.get_dummies(amount_quartiles, prefix='Amount')

# concatenate the original dataset with the new columns
creditcard = pd.concat([creditcard, amount_quartile_dummies], axis=1)
creditcard = creditcard.drop(['Amount'], axis=1)

# create quartiles for 'V1' to 'V28'
for i in range(1, 29):
    v_quartiles = pd.qcut(creditcard['V'+str(i)], 15, labels=False)
    v_quartile_dummies = pd.get_dummies(v_quartiles, prefix='V'+str(i))
    creditcard = pd.concat([creditcard, v_quartile_dummies], axis=1)
    creditcard = creditcard.drop(['V'+str(i)], axis=1)

# replace class values with 'normal' and 'fraud'
creditcard['Class'] = creditcard['Class'].replace(0, 'normal')
creditcard['Class'] = creditcard['Class'].replace(1, 'fraud')
class_dummies = pd.get_dummies(creditcard['Class'], prefix='Class')
creditcard = pd.concat([creditcard, class_dummies], axis=1)
creditcard = creditcard.drop(['Class'], axis=1)

# display the modified dataset
creditcard

Unnamed: 0,Amount_0,Amount_1,Amount_2,Amount_3,Amount_4,Amount_5,Amount_6,Amount_7,Amount_8,Amount_9,...,V28_7.0,V28_8.0,V28_9.0,V28_10.0,V28_11.0,V28_12.0,V28_13.0,V28_14.0,Class_fraud,Class_normal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65448,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
65449,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
65450,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
65451,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [31]:
# calculate support of Dual itemset

#calculate 2-frequent item/normal
support_2 ={}
not_include = ['Class_fraud','Class_normal']
for i in range(len(creditcard.columns)):
    for j in range(i+1, len(creditcard.columns)):
        col_i = creditcard.columns[i]
        col_j = creditcard.columns[j]
        if (col_i not in not_include) and (col_j not in not_include):
          freq_rule = sum((creditcard[col_i] == 1) & (creditcard[col_j] == 1))
          if freq_rule != 0 :
            support_2[(col_i, col_j)] = freq_rule
    
support_2 = pd.DataFrame(support_2.items(), columns=['items', 'support'])

In [60]:
# Calculate the proportion of normal transactions in the creditcard dataset
prob_normal = sum((creditcard['Class_normal'] == 1))/ len(creditcard.index)

# Calculate the proportion of fraudulent transactions in the creditcard dataset
prob_fraud = sum((creditcard['Class_fraud'] == 1))/ len(creditcard.index)


In [61]:
# Sort the 'support_2' DataFrame by the 'support' column in descending order
support_2 = support_2.sort_values('support', ascending=False)

# Display the sorted 'support_2' DataFrame
support_2

Unnamed: 0,items,support
39262,"(V6_14, V24_14.0)",3734
34042,"(V5_14, V6_14)",2730
34312,"(V5_14, V24_14.0)",2613
5889,"(Amount_14, V2_0)",2562
6173,"(Amount_14, V20_14.0)",2388
...,...,...
11140,"(V1_11, V28_13.0)",1
11086,"(V1_11, V25_0.0)",1
10729,"(V1_10, V27_13.0)",1
10327,"(V1_9, V27_13.0)",1


In [100]:
# Select the top 1000 frequent itemsets from the 'support_2' DataFrame
freq_item_2 = support_2.head(1000)

In [101]:
# Define a function named 'confidence_normal' that takes a row of a DataFrame as input
def confidence_normal(row):
  # Extract the names of the items from the 'items' column of the input row
  names = row["items"]
  # Calculate the confidence for the itemset by counting the occurrences of the items and the class label in the 'creditcard' dataset
  confidence = sum((creditcard[names[0]] == 1) & (creditcard[names[1]] == 1) & (creditcard['Class_normal'] == 1)) / row['support']
  # Return the confidence value
  return confidence


In [102]:
# Calculate the confidence values for the itemsets in the 'freq_item_2' DataFrame using the 'confidence_normal' function
freq_item_2['confident'] = freq_item_2.apply(confidence_normal, axis=1)

# Calculate the interest values for the itemsets in the 'freq_item_2' DataFrame by subtracting the normal class probability from the confidence values
freq_item_2['intrest'] = freq_item_2['confident'] - prob_normal

# Display the updated 'freq_item_2' DataFrame
freq_item_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  freq_item_2['confident'] = freq_item_2.apply(confidence_normal, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  freq_item_2['intrest'] = freq_item_2['confident'] - prob_normal


Unnamed: 0,items,support,confident,intrest
39262,"(V6_14, V24_14.0)",3734,1.000000,0.002597
34042,"(V5_14, V6_14)",2730,1.000000,0.002597
34312,"(V5_14, V24_14.0)",2613,1.000000,0.002597
5889,"(Amount_14, V2_0)",2562,0.999610,0.002207
6173,"(Amount_14, V20_14.0)",2388,0.997906,0.000503
...,...,...,...,...
84995,"(V20_14.0, V28_12.0)",641,1.000000,0.002597
61206,"(V12_2.0, V14_13.0)",641,1.000000,0.002597
49956,"(V9_4, V27_13.0)",641,1.000000,0.002597
17932,"(V2_14, V12_14.0)",641,1.000000,0.002597


In [103]:
freq_item_2_5p = freq_item_2.sort_values('confident',ascending = False)
freq_item_2_5p.head()

Unnamed: 0,items,support,confident,intrest
39262,"(V6_14, V24_14.0)",3734,1.0,0.002597
12317,"(V1_14, V28_6.0)",734,1.0,0.002597
8751,"(V1_6, V3_12)",733,1.0,0.002597
76410,"(V16_14.0, V26_7.0)",733,1.0,0.002597
13119,"(V2_2, V4_1)",732,1.0,0.002597


In [69]:
# Filter the 'support_2' DataFrame to select the itemsets with a support value between 400 and 1000 (inclusive)
item_2_frad = support_2[(support_2['support'] >= 400) & (support_2['support'] <= 1000)]
item_2_frad


Unnamed: 0,items,support
34193,"(V5_14, V17_0.0)",999
18397,"(V3_0, V18_14.0)",997
18263,"(V3_0, V10_0)",996
12151,"(V1_14, V16_14.0)",996
11935,"(V1_14, V2_3)",995
...,...,...
70345,"(V14_12.0, V25_2.0)",400
14799,"(V2_6, V12_1.0)",400
79938,"(V18_6.0, V26_10.0)",400
10364,"(V1_10, V3_7)",400


In [70]:
# Define a function named 'confidence_fraud' that takes a row of a DataFrame as input
def confidence_fraud(row):
  # Extract the names of the items from the 'items' column of the input row
  names = row["items"]
  # Calculate the confidence for the itemset by counting the occurrences of the items and the fraud class label in the 'creditcard' dataset
  confidence = sum((creditcard[names[0]] == 1) & (creditcard[names[1]] == 1) & (creditcard['Class_fraud'] == 1)) / row['support']
  # Return the calculated confidence value
  return confidence

In [71]:
# Apply the 'confidence_fraud' function to each row of the 'item_2_frad' DataFrame to calculate the confidence for each itemset
item_2_frad['confident'] = item_2_frad.apply(confidence_fraud, axis=1)

# Calculate the interest for each itemset by subtracting the probability of fraud from the confidence value
item_2_frad['interest'] = item_2_frad['confident'] - prob_fraud

# Display the resulting DataFrame with the calculated confidence and interest values for each itemset
item_2_frad

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_2_frad['confident'] = item_2_frad.apply(confidence_fraud, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_2_frad['interest'] = item_2_frad['confident'] - prob_fraud


Unnamed: 0,items,support,confident,interest
34193,"(V5_14, V17_0.0)",999,0.001001,-0.001581
18397,"(V3_0, V18_14.0)",997,0.009027,0.006445
18263,"(V3_0, V10_0)",996,0.146586,0.144004
12151,"(V1_14, V16_14.0)",996,0.000000,-0.002582
11935,"(V1_14, V2_3)",995,0.000000,-0.002582
...,...,...,...,...
70345,"(V14_12.0, V25_2.0)",400,0.000000,-0.002582
14799,"(V2_6, V12_1.0)",400,0.000000,-0.002582
79938,"(V18_6.0, V26_10.0)",400,0.000000,-0.002582
10364,"(V1_10, V3_7)",400,0.000000,-0.002582


In [104]:
# Sort the 'item_2_frad' DataFrame by the 'confident' column in descending order
item_2_frad = item_2_frad.sort_values('confident', ascending=False)

# Display the resulting DataFrame with the itemsets sorted by confidence value in descending order
item_2_frad_5p = item_2_frad.head()
item_2_frad_5p

Unnamed: 0,items,support,confident,interest
28973,"(V4_14, V14_0.0)",402,0.343284,0.340702
28942,"(V4_14, V11_14.0)",419,0.319809,0.317227
60503,"(V11_14.0, V16_0.0)",449,0.300668,0.298086
52838,"(V10_0, V12_0.0)",488,0.29918,0.296598
39473,"(V7_0, V18_0.0)",400,0.27,0.267418


In [78]:
# Define a function to calculate the support for all 3-itemsets in the input DataFrame
def get_support_3(df):
    # Initialize an empty dictionary to store the support values for each 3-itemset
    support_3 = {}

    # Define a list of items to exclude from the 3-itemset combinations
    not_include = ['fraud', 'Normal']

    # Loop through all pairs of rows in the input DataFrame
    for i in range(df.shape[0]):
        items = set(df.iloc[i]['items'])
        for j in range(i+1, df.shape[0]):
            items_i = set(df.iloc[j]['items'])

            # If the two sets of items have at least one item in common, combine them into a 3-itemset
            if not items.isdisjoint(items_i):
                item_3 = tuple(items.union(items_i))

                # Calculate the support for the 3-itemset using the 'creditcard' DataFrame
                sup = sum((creditcard[item_3[0]] == 1) & (creditcard[item_3[1]] == 1) & (creditcard[item_3[2]] == 1))

                # If the support is not zero, add it to the 'support_3' dictionary
                if sup != 0:
                    support_3[item_3] = sup

    # Return the resulting dictionary of 3-itemset support values
    return support_3


In [79]:
support_3_normal=get_support_3(freq_item_2)

In [80]:
item_3_normal = pd.DataFrame(support_3_normal.items(), columns=['items', 'support'])
item_3_normal=item_3_normal.sort_values('support', ascending=False)
item_3_normal

Unnamed: 0,items,support
0,"(V5_14, V6_14, V24_14.0)",2526
56,"(V2_0, V20_14.0, Amount_14)",1839
2,"(V8_13, V6_14, V24_14.0)",1584
1,"(V3_0, V6_14, V24_14.0)",1544
57,"(V23_0.0, V2_0, Amount_14)",1528
...,...,...
7028,"(V23_0.0, V20_0.0, V1_8)",1
7439,"(V1_14, V27_5.0, V7_0)",1
6733,"(V2_4, V1_14, V7_0)",1
1392,"(V2_0, V23_14.0, V1_8)",1


In [89]:
def calculate_interest(df, prob_normal):
    df['confident'] = df.apply(lambda row: (sum((creditcard[row['items'][0]] == 1) & (creditcard[row['items'][1]] == 1) & (creditcard[row['items'][2]] == 1) &(creditcard['Class_normal'] ==1)))/ row['support'], axis=1)
    df['interest'] = df['confident'] - prob_normal
    return df

freq_item_3 = item_3_normal.iloc[:1000]
freq_item_3 = calculate_interest(freq_item_3, prob_normal)
freq_item_3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['confident'] = df.apply(lambda row: (sum((creditcard[row['items'][0]] == 1) & (creditcard[row['items'][1]] == 1) & (creditcard[row['items'][2]] == 1) &(creditcard['Class_normal'] ==1)))/ row['support'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interest'] = df['confident'] - prob_normal


Unnamed: 0,items,support,confident,interest
0,"(V5_14, V6_14, V24_14.0)",2526,1.000000,0.002597
56,"(V2_0, V20_14.0, Amount_14)",1839,0.999456,0.002054
2,"(V8_13, V6_14, V24_14.0)",1584,1.000000,0.002597
1,"(V3_0, V6_14, V24_14.0)",1544,1.000000,0.002597
57,"(V23_0.0, V2_0, Amount_14)",1528,1.000000,0.002597
...,...,...,...,...
4323,"(V9_14, V28_0.0, V1_0)",354,0.994350,-0.003052
3594,"(V7_14, V5_0, V6_13)",354,1.000000,0.002597
462,"(V8_13, V4_0, V24_14.0)",352,1.000000,0.002597
6474,"(V24_14.0, V7_1, V8_14)",352,1.000000,0.002597


In [106]:
freq_item_3 = freq_item_3.sort_values('confident', ascending=False)
freq_item_3_5p = freq_item_3.head()
freq_item_3_5p

Unnamed: 0,items,support,confident,interest
2883,"(V14_0.0, V2_14, V10_0)",356,0.373596,-0.623807
624,"(V3_0, V11_14.0, V2_14)",399,0.333333,-0.664069
4177,"(V11_14.0, V14_0.0, V6_0)",375,0.322667,-0.674736
4174,"(V14_0.0, V6_0, V10_0)",412,0.293689,-0.703713
2211,"(V3_0, V14_0.0, V11_14.0)",506,0.290514,-0.706889


In [107]:
# Select top 1000 frequent itemsets containing 2 items with Class_fraud transactions
item_2_frad = item_2_frad.iloc[:1000]

In [108]:
item_3_frad=get_support_3(item_2_frad)

In [112]:
item_3_frad = pd.DataFrame(item_3_frad.items(), columns=['items', 'support'])
item_3_frad=item_3_frad.sort_values('support', ascending=False)
item_3_frad

Unnamed: 0,items,support
4442,"(V3_0, V2_14, V6_0)",752
17328,"(V17_14.0, V3_0, V18_14.0)",747
4910,"(V8_14, V2_14, V5_0)",694
4675,"(V17_14.0, V3_0, V6_0)",668
16435,"(V17_14.0, V14_0.0, V18_14.0)",634
...,...,...
15071,"(V23_0.0, V20_0.0, V1_8)",1
2509,"(V1_10, V8_14, V4_14)",1
16193,"(V21_14.0, V4_13, V1_9)",1
2707,"(V1_10, V2_14, V4_14)",1


In [113]:
def calculate_interest(df, prob_fraud):
    df['confident'] = df.apply(lambda row: (sum((creditcard[row['items'][0]] == 1) & (creditcard[row['items'][1]] == 1) & (creditcard[row['items'][2]] == 1) &(creditcard['Class_fraud'] ==1)))/ row['support'], axis=1)
    df['interest'] = df['confident'] - prob_normal
    return df

item_3_frad = calculate_interest(freq_item_3, prob_fraud)
item_3_frad

Unnamed: 0,items,support,confident,interest
2883,"(V14_0.0, V2_14, V10_0)",356,0.373596,-0.623807
624,"(V3_0, V11_14.0, V2_14)",399,0.333333,-0.664069
4177,"(V11_14.0, V14_0.0, V6_0)",375,0.322667,-0.674736
4174,"(V14_0.0, V6_0, V10_0)",412,0.293689,-0.703713
2211,"(V3_0, V14_0.0, V11_14.0)",506,0.290514,-0.706889
...,...,...,...,...
3008,"(V17_14.0, V4_13, V6_0)",461,0.000000,-0.997403
1162,"(V9_14, V1_0, V27_14.0)",461,0.000000,-0.997403
146,"(V23_0.0, Amount_14, V21_13.0)",460,0.000000,-0.997403
154,"(V23_0.0, Amount_14, V28_12.0)",460,0.000000,-0.997403


In [115]:
item_3_frad =item_3_frad.sort_values('confident', ascending=False)
item_3_frad_5p = item_3_frad.head()
item_3_frad_5p

Unnamed: 0,items,support,confident,interest
2883,"(V14_0.0, V2_14, V10_0)",356,0.373596,-0.623807
624,"(V3_0, V11_14.0, V2_14)",399,0.333333,-0.664069
4177,"(V11_14.0, V14_0.0, V6_0)",375,0.322667,-0.674736
4174,"(V14_0.0, V6_0, V10_0)",412,0.293689,-0.703713
2211,"(V3_0, V14_0.0, V11_14.0)",506,0.290514,-0.706889


In [133]:
def calculate_precision(row):
    if (row['V14_0.0']==1) & (row['V2_14']==1) & (row['V10_0']==1):
        return 1
    elif (row['V3_0']==1) & (row['V11_14.0']==1) & (row['V2_14']==1):
        return 1
    elif (row['V11_14.0']==1) & (row['V14_0.0']==1) & (row['V6_0']==1):
        return 1
    elif (row['V14_0.0']==1) & (row['V6_0']==1) & (row['V10_0']==1):
        return 1
    elif (row['V3_0']==1) & (row['V14_0.0']==1) & (row['V11_14.0']==1):
        return 1
    else:
        return 0


In [144]:
creditcard['precision_fraud'] = creditcard.apply(calculate_precision, axis=1)

In [151]:
count = ((creditcard['Class_fraud'] == 1) & (creditcard['precision_fraud'] == 1)).sum()
percentage = (count / 492) * 100
percentage

30.284552845528456