In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as ms

**Description:**

<img src = 'desc.png'/>

In [None]:
# Load the transactions table from a CSV path relative to the notebook's
# working directory, then preview the first rows as the cell output.
df = pd.read_csv('../input/creditcard.csv')
df.head()

In [None]:
# Draw missingno's matrix plot of `df` to check visually for missing values.
ms.matrix(df)

* Great! There are no missing values in the dataset.

In [None]:
# Partition the transactions by the `Class` label: rows labelled 1 go to
# `fraud`, rows labelled 0 go to `not_fraud`.
fraud = df.loc[df['Class'].eq(1)]
not_fraud = df.loc[df['Class'].eq(0)]

In [None]:
# All column names, in the order they appear in the frame.
col = df.columns.tolist()
# Columns to plot: everything except the first column and the last two.
# NOTE(review): presumably this keeps only the V* features (dropping Time,
# Amount and Class) - confirm against the CSV header.
vectors = col[1:-2]

In [None]:

# For every column in `vectors`, draw its distribution split by `Class` so the
# two classes can be compared feature by feature.
for feature in vectors:
    # `height` is the current name of the old `size` keyword, which recent
    # seaborn releases no longer accept.
    grid = sns.FacetGrid(df, hue='Class', height=4)
    grid.map(sns.distplot, feature)
    grid.add_legend()
    plt.title("Feature " + feature)
    plt.show()

#### Observations:

* Most of the feature distributions overlap when plotted separately for the two classes.
* The distribution plots of **V3, V4, V9, V10, V11, V12, V14, V16, V17, V18, V19** are not so much overlapping as the remaining features.





In [None]:
# Features whose class-conditional distributions overlapped the least.
# V14 and V19 are included so that this list agrees with the features named in
# the observations above and with the list used for the pair plot.
sel_col = ['V3', 'V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'Class']

for i in sel_col[:-1]:  # the trailing 'Class' entry is the label, not a feature
    # Plot order (fraud first, then not fraud) must match the legend order below.
    for subset in (fraud, not_fraud):
        counts, bin_edges = np.histogram(subset[i], bins=30, density=True)

        # Normalise the per-bin values so they sum to 1; their running sum is
        # then the empirical CDF over the bins.
        pdf = counts / counts.sum()
        cdf = np.cumsum(pdf)

        # Each value is plotted against the right edge of its bin.
        plt.plot(bin_edges[1:], pdf)
        plt.plot(bin_edges[1:], cdf)

    plt.legend(['fraud_pdf', 'fraud_cdf', 'not_fraud_pdf', 'not_fraud_cdf'])
    plt.title("Feature {0} distributions".format(i))
    plt.show()

**Observations:**
* Simple conditional models can be built from the above distribution plots by placing a few **constraints** on the various **features**.

In [None]:
# Pair plot of the selected features, coloured by class, saved to pairplot.png.
sel_col = ['V3', 'V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19','Class']
sel_data = df[sel_col]
print(sel_data.head())
# `height` is the current name of the old `size` keyword, which recent seaborn
# releases no longer accept.
g = sns.pairplot(sel_data, hue='Class', height=5)
g.savefig("pairplot.png")

**Observations:**

* The following pairwise scatter plots are more **clustered** than the remaining **pair scatter plot relationships.**

In [None]:
# Feature pairs to scatter-plot: each key is plotted on the x-axis against
# every feature in its list on the y-axis. Insertion order sets the plot order.
# NOTE(review): 'V5' lists itself as a partner (V5 vs V5) - confirm that is intended.
col_dic = {
    'V28': ['V18', 'V17'],
    'V27': ['V18', 'V17', 'V11', 'V12'],
    'V25': ['V17', 'V12'],
    'V24': ['V18', 'V17'],
    'V23': ['V19', 'V18', 'V17', 'V14', 'V12', 'V11'],
    'V22': ['V18', 'V17', 'V11'],
    'V21': ['V18', 'V16'],
    'V20': ['V18', 'V17', 'V11'],
    'V18': ['V23', 'V21', 'V20', 'V17', 'V16'],
    'V17': ['V18'],
    'V16': ['V18', 'V17', 'V12'],
    'V14': ['V23', 'V20', 'V10', 'V6'],
    'V12': ['V23'],
    'V11': ['V27', 'V23', 'V12'],
    'V8': ['V18', 'V17', 'V14', 'V11'],
    'V7': ['V18', 'V11'],
    'V6': ['V10', 'V9', 'V8', 'V7'],
    'V5': ['V6', 'V5'],
    'V4': ['V3'],
    'V2': ['V1'],
    'V1': ['V2'],
}

In [None]:
# Draw one class-coloured scatter plot per (x, y) feature pair in `col_dic`.
for x_col, y_cols in col_dic.items():
    for y_col in y_cols:
        # fit_reg=False keeps lmplot to a plain scatter (no regression line).
        # `height` is the current name of the old `size` keyword, which recent
        # seaborn releases no longer accept.
        sns.lmplot(x=x_col, y=y_col, data=df, hue='Class', height=5, fit_reg=False)
        plt.show()

**Observations:**
* These are the plots where the **fraud** observations tend to lie along a **line**.
* Most values of the **fraud** observations tend to lie around **zero**.

In [None]:
# Distribution of `Time`, split by class.
# `height` is the current name of the old `size` keyword, which recent seaborn
# releases no longer accept.
sns.FacetGrid(df, hue='Class', height=6) \
       .map(sns.distplot, 'Time') \
       .add_legend()
plt.show()

In [None]:
# Overlay, for fraud and then non-fraud transactions, the normalised 30-bin
# histogram of `Time` and its running sum (the CDF over the bins).
# The loop order fixes the line order that the legend labels rely on.
for subset in (fraud, not_fraud):
    counts, bin_edges = np.histogram(subset['Time'], bins=30, density=True)
    pdf = counts / sum(counts)
    cdf = np.cumsum(pdf)
    # Each value is plotted against the right edge of its bin.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf)

plt.legend(['fraud_pdf', 'fraud_cdf','not_fraud_pdf', 'not_fraud_cdf'])
plt.title('Time distributions')
plt.show();

In [None]:
# Distribution of `Amount`, split by class.
# `height` is the current name of the old `size` keyword, which recent seaborn
# releases no longer accept.
sns.FacetGrid(df, hue='Class', height=6) \
       .map(sns.distplot, 'Amount') \
       .add_legend()
plt.show()

In [None]:
# Overlay, for fraud and then non-fraud transactions, the normalised 30-bin
# histogram of `Amount` and its running sum (the CDF over the bins).
# The loop order fixes the line order that the legend labels rely on.
for subset in (fraud, not_fraud):
    counts, bin_edges = np.histogram(subset['Amount'], bins=30, density=True)
    pdf = counts / sum(counts)
    cdf = np.cumsum(pdf)
    # Each value is plotted against the right edge of its bin.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf)

plt.legend(['fraud_pdf', 'fraud_cdf','not_fraud_pdf', 'not_fraud_cdf'])
plt.title("Amount distributions")
plt.show();

**Observations:**

* The **CDF of the Time distribution** for **fraud** is always above the **not fraud CDF**; therefore we can conclude that the **fraud member** takes larger intervals of time than the **not fraud** member.
* The **fraud** members' transaction amount is less than the amount for **not fraud**, and its distribution drops to **zero** almost immediately.

In [None]:
# Report the shape tuple of the loaded dataset.
print("The shape of the dataset is: ", df.shape)

* The dataset has **284807** observations and **31** attributes.

In [None]:
# Use every column of the dataset as a component of the observation vectors.
# NOTE(review): this keeps the `Class` label among the components - confirm
# that is intended for the similarity metric computed later.
col = df.columns.tolist()
vec = col

In [None]:
# Select the columns listed in `vec`; this is the frame the metric is computed on.
df_vec = df[vec]

In [None]:
# Preview the first rows of the vector frame as the cell output.
df_vec.head()

* Since computing the metrics for every observation in the dataset is computationally expensive, I have chosen 30 random observations using the **np.random.randint** function of **NumPy**.

## Computing cosine similarity.

In [None]:
# Seed NumPy's global random generator so the random draws that follow are reproducible.
np.random.seed(101)

In [None]:
# Draw 30 random row positions from the dataset. The upper bound is taken from
# the frame itself instead of a hard-coded row count, so the positions stay
# valid if the dataset size changes.
# randint samples with replacement, so repeated positions are possible.
choice = np.random.randint(0, len(df), size = 30)
print("These are my randomly choosen observation indcies: ",choice)

* Let's use the randomly chosen observation indices as the columns of a new DataFrame for storing the **metrics**.

In [None]:
# Empty frame with one column per randomly chosen observation index; the
# columns are filled in by the metric loop. Displayed as the cell output.
met_df = pd.DataFrame(columns = choice)
met_df

* Created the metric Dataframe under the name **met_df** for storing the metrics.

In [None]:
# Iterating through all the randomly choosen Observations/Vectors.
for i in choice:
    '''
        i          - Loop variable.
        
        multiplier - The ith observation values in the dataset that is randomly choosen.
        
        drop_df    - The dataframe obtained by removing the ith observation.
        
        metric     - This metric is computed taking each row in the "drop_df" DataFrame and dot product with "multiplier" and divided
                     by product of length of "multiplier" and length of each "observation". This variable is a pandas series.
                 
        met_df     - This DataFrame consists of all the calculated metrics.
        
        The lambda function leveraged most of the work in the whole calculation.
    ''' 
    multiplier = list(df_vec.iloc[i])
    drop_df = df_vec.drop(df.index[i])
    metric = drop_df.apply(lambda x: np.dot(multiplier, x) / (len(multiplier) * len(x)), axis = 1) 
    met_df[i] = sorted(metric)

In [None]:
# Report the shape tuple of the metrics frame.
print("The shape of the met_df is: ",met_df.shape)

* The number of observations in the **met_df** DataFrame is one less than the actual number of observations in the data set due to the metrics calculated in the way of **one-many** relationship.
* Saving the **met_df** dataframe for the future use.

In [None]:
# Write the metrics frame to metrics.csv (path relative to the working directory).
met_df.to_csv('metrics.csv')

* Selecting the 10 lowest metric values for each sampled observation and storing them in the **lowest_10** DataFrame.

In [None]:
# Each met_df column was stored in ascending order, so the first 10 rows hold
# the 10 smallest metric values for every sampled observation.
lowest_10 = met_df.iloc[:10]

In [None]:
# Display the 10 lowest metric values per sampled observation as the cell output.
lowest_10

In [None]:
# For every sampled observation, print its `Class` value followed by its 10
# lowest metric values.
low_10 = list(lowest_10.columns)

for obs_idx in low_10:
    class_value = df.loc[obs_idx]['Class']
    lowest_values = lowest_10[obs_idx].values
    print("\n\nThe Class value of {0} is {1}".format(obs_idx, class_value))
    print("The lowest 10 values of {0} index observations, are {1} \n".format(obs_idx, lowest_values))
    

**Done!**