In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the transactions CSV from the relative input directory into a DataFrame.
credit_card = pd.read_csv('../input/creditcard.csv')

In [None]:
# Preview the first rows of the loaded frame.
credit_card.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
credit_card.shape

In [None]:
# Descriptive statistics as reported by DataFrame.describe().
credit_card.describe()

In [None]:
# Distinct label values present in the Class column.
credit_card['Class'].unique()

In [None]:
# Number of rows labelled as fraud (Class == 1).
# Counting the boolean mask is vectorised and stays correct whatever numeric
# value the label has; the Python built-in sum() over the Series iterated it
# element by element and only yielded a count because the label equals 1.
total_fraud_count = int((credit_card.Class == 1).sum())

In [None]:
# Fraud rows as a percentage of all rows, rounded to 4 decimal places.
fraud_percent = round(total_fraud_count / len(credit_card) * 100, 4)
print('The total fraud happened is ', total_fraud_count, ' which is', fraud_percent, '%')

In [None]:
# Preview the first rows whose Class label is 1.
is_fraud = credit_card.Class == 1
credit_card.loc[is_fraud].head()

### Sample 100 transactions at random from the whole dataset (2 records from class 1 and 98 records from class 0)

In [None]:
# Draw a fixed-seed sample from each class (98 rows with Class == 0 and
# 2 rows with Class == 1), then shuffle the combined 100 rows with the same seed.
class_labels = credit_card.Class
non_fradaulent = credit_card.loc[class_labels == 0].sample(n=98, random_state=1)
fradaulent = credit_card.loc[class_labels == 1].sample(n=2, random_state=1)
new_credit_card = pd.concat([non_fradaulent, fradaulent]).sample(n=100, random_state=1)

In [None]:
# Preview the first rows of the shuffled sample.
new_credit_card.head()

### Our objective is to find if a transaction is a fraud or not
##### Since there are 32 features, it's better to plot a heatmap to see the correlation between the values of different features

In [None]:
# Pairwise correlation (DataFrame.corr default method) between all columns of the sample.
corr = new_credit_card.corr()

fig, ax = plt.subplots(figsize=(20, 15))
# Correlation coefficients span [-1, 1]. The colour scale has to cover that
# whole range and be centred on zero; with vmin=0 every negative correlation
# was clipped to the same colour as "no correlation" and could not be read
# from the plot.
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True, linewidths=.5, ax=ax,
            vmin=-1, vmax=1, center=0, cmap='coolwarm')
# Trailing semicolon suppresses the text repr of the returned object in the notebook.
ax.set_title('Correlation between columns of the sampled transactions');

#### The heatmap doesn't show good correlation between values of features.
#### Considering correlations above 0.50, we can see that V11 and Class are related, as are V10 and V28, and Amount and V7. But the correlation doesn't say much about fraudulent versus non-fraudulent transactions.

### Similarity between any two vectors

In [None]:
def cosineSimilarity(vi, vj):
    """Return the cosine similarity of two equal-length 1-D numeric vectors.

    Parameters
    ----------
    vi, vj : array-like
        The two vectors to compare; anything accepted by ``np.dot`` and
        ``np.linalg.norm``.

    Returns
    -------
    float
        ``dot(vi, vj) / (norm(vi) * norm(vj))``. When either vector has zero
        norm the ratio is undefined and ``0.0`` is returned.
    """
    norm_product = np.linalg.norm(vi) * np.linalg.norm(vj)
    if norm_product == 0:
        # 0/0 would yield nan (plus a RuntimeWarning); nan compares False
        # against everything, which makes any later sort by similarity
        # unreliable, so report "no similarity" instead.
        return 0.0
    return np.dot(vi, vj) / norm_product

In [None]:
class Similarity:
    """Result of comparing one reference transaction with another transaction.

    Attributes (stored exactly as passed in):
        inner_id_   -- identifier of the transaction compared against
        class_      -- class label of the reference transaction
        innerClass_ -- class label of the transaction compared against
        similarity_ -- similarity score of the pair
    """

    def __init__(self, inner_id_, class_, innerClass_, similarity_):
        self.inner_id_, self.class_ = inner_id_, class_
        self.innerClass_, self.similarity_ = innerClass_, similarity_

    def __str__(self):
        """Return a one-line summary; both class labels are rendered as integers."""
        pieces = [
            'Class ', str(int(self.class_)),
            ' matching with id : ', str(self.inner_id_),
            ' class ', str(int(self.innerClass_)),
            '. The similarity percent is ', str(self.similarity_),
        ]
        return ''.join(pieces)
    

In [None]:
# Materialise every sampled transaction once as
# (row id, feature vector, class label). Previously the inner iterrows() loop
# rebuilt each row Series once per outer row, and one operand was an ndarray
# while the other was a Series; now both operands are plain ndarrays.
# The feature vector is every value except the last one.
# NOTE(review): this assumes Class is the last column - confirm against the CSV layout.
transactions = [
    (id_, row.values[:-1], row.Class)
    for id_, row in new_credit_card.iterrows()
]

# Map each transaction id to its comparisons against every sampled
# transaction (the transaction itself included).
similarity_map = {
    id_: [
        Similarity(inner_id_, class_, inner_class_,
                   cosineSimilarity(features, inner_features))
        for inner_id_, inner_features, inner_class_ in transactions
    ]
    for id_, features, class_ in transactions
}
        

### Least Similarities

In [None]:
# For every transaction, list the 10 comparisons with the lowest similarity.
for transaction_id, matches in similarity_map.items():
    print('Given transaction id : ', transaction_id)
    # In-place ascending sort: the list stored in similarity_map is reordered.
    matches.sort(key=lambda match: match.similarity_)
    for match in matches[:10]:
        print(match)

### Most Similarities

In [None]:
# For every transaction, list the 10 comparisons with the highest similarity.
for transaction_id, matches in similarity_map.items():
    print('Given transaction id : ', transaction_id)
    # In-place descending sort: the list stored in similarity_map is reordered.
    matches.sort(key=lambda match: match.similarity_, reverse=True)
    for match in matches[:10]:
        print(match)