# Creates a subset of the data for webapp
- Shuffles dataset and split into trainingDF and groundtruthDF
- Split a small set of data
- Create a json call for each
- Aggregate results

In [2]:
import pandas as pd
import json

In [35]:
dataDF = pd.read_csv("/mnt/data/Fraud_Detection/creditcard.csv")
dataDF = dataDF.sample(frac=1).reset_index(drop=True)

## Creates big training data & ground truth data

In [79]:
# get numbers of rows and half of it
maxLimit=dataDF.shape[0]
halfLimit=int(maxLimit/2)
print("Max:"+str(maxLimit)+" Half:"+str(halfLimit))

# generates trainingDF and groundTruthDF CSV
trainingDF=dataDF.iloc[0:halfLimit]
filename="/mnt/data/Fraud_Detection/training.csv"
trainingDF.to_csv(filename, encoding='utf-8', index=False)

groundtruthDF=dataDF.iloc[halfLimit+1:maxLimit-1]
filename="/mnt/data/Fraud_Detection/groundtruth.csv"
groundtruthDF.to_csv(filename, encoding='utf-8', index=False)

Max:284807 Half:142403


### Number of records with fraudulent transactions in groundtruth

In [76]:
fraudCountT=trainingDF.loc[trainingDF['Class'] == 1].shape[0]
fraudCountGT=groundtruthDF.loc[groundtruthDF['Class'] == 1].shape[0]
print("#Frauds in Training:"+str(fraudCountT)+" #Frauds in GroundTruth:"+str(fraudCountGT))

#Frauds in Training:247 #Frauds in GroundTruth:245


### Creates tiny sets of testing data in /mnt/data/Fraud_Detection/webapp_sets/ from groundtruth
While we are at it, let's rename the column Time to Hour so it works with the model and remove the Class column as we are not supposed to know the result

In [104]:
# continues code here
def createRecordSet(fractionFraudulent,nblines):
    if nblines>=groundtruthDF.loc[groundtruthDF['Class'] == 1].shape[0]:
        print("Too many lines!")
        return(None)
    else:
        fractionLegit=nblines-int(fractionFraudulent)
        fraudDF=groundtruthDF.loc[groundtruthDF['Class'] == 1].iloc[0:fractionFraudulent]
        if fractionLegit>0:
            legitDF=groundtruthDF.loc[groundtruthDF['Class'] == 0].iloc[0:fractionLegit]
            returnDF=pd.concat([legitDF, fraudDF], ignore_index=True, sort=False)
        else:
            returnDF=fraudDF
        return(returnDF.sample(frac=1).reset_index(drop=True))

In [105]:
fractions=(0,1,10,50,100)
for x in fractions:
    myDF=createRecordSet(x,100)
    filename="/mnt/data/Fraud_Detection/webapp_sets/records_"+str(x)+"pc_fraud.csv"
    myDF.to_csv(filename, encoding='utf-8', index=False)

In [100]:
# old version for creating chuncks
#for i in range (0,4):
#    chunkDF=groundtruthDF.iloc[i*1000:(i+1)*1000,:-1].rename(columns={"Time":"Hour"})
#    filename="/mnt/data/Fraud_Detection/webapp_sets/records_set"+str(i)+".csv"
#    chunkDF.to_csv(filename, encoding='utf-8', index=False)
#chunkDF.head()

### Creates individual jsons from a tiny set to test API CAll

In [68]:
recordDF=pd.read_csv("/mnt/data/Fraud_Detection/webapp_sets/records_set1.csv")
for t in range (0,10):
    chunkDF=recordDF.iloc[t+1]+recordDF.iloc[t,0]
    result = chunkDF.to_json(orient="index")
    final='{ "data": '+result+" }"
    filename="/mnt/data/Fraud_Detection/webapp_sets/unique_record"+str(t)+".json"
    with open(filename, 'w') as f:
        f.write(final)
    print(final)

{ "data": {"Hour":234219.0,"V1":66572.7500779378,"V2":66570.5621951367,"V3":66569.0390461323,"V4":66571.3199887549,"V5":66571.4363763772,"V6":66570.1803675344,"V7":66571.7352913386,"V8":66570.5501346556,"V9":66571.4273481993,"V10":66570.7578501606,"V11":66570.3781747512,"V12":66571.972014748,"V13":66572.1418725439,"V14":66571.3805755447,"V15":66571.3805321802,"V16":66570.362150116,"V17":66570.6077367464,"V18":66570.5802268249,"V19":66570.9693042717,"V20":66571.192792776,"V21":66571.2175768152,"V22":66571.4486072827,"V23":66570.8730657123,"V24":66571.6551086659,"V25":66571.3693749729,"V26":66570.4355140653,"V27":66570.9644091608,"V28":66570.9793652803,"Amount":66750.7} }
{ "data": {"Hour":326226.0,"V1":167650.141152543,"V2":167648.0346421896,"V3":167646.2699505012,"V4":167648.1054032007,"V5":167648.5876703934,"V6":167647.3363827471,"V7":167648.2832953707,"V8":167647.6339339851,"V9":167648.5272256182,"V10":167647.9816705407,"V11":167646.5756103525,"V12":167648.4633828924,"V13":167649.169

In [70]:
groundtruthDF.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
142404,166446.0,2.033125,-0.219438,-1.647962,-0.076795,0.450596,-0.275463,0.048083,-0.118781,0.152556,...,0.246104,0.722522,0.000856,0.299494,0.086846,0.764053,-0.099704,-0.07454,26.9,0
142405,61221.0,-0.153375,0.956542,0.93035,0.700915,0.440249,-0.321076,0.71095,-0.048005,-0.422859,...,0.09057,0.256541,-0.252545,-0.471079,-0.185967,-0.279131,0.164216,0.153293,8.2,0
142406,129139.0,0.886001,-1.9342,-2.707576,0.805183,-0.093002,-0.967718,1.393796,-0.509613,-0.184278,...,0.482352,-0.00147,-0.561248,-0.279345,0.025408,0.370957,-0.230484,0.015434,628.2,0
142407,151225.0,-0.04448,1.177755,2.505564,4.418745,-0.172407,1.082779,-0.091466,0.187058,-0.917967,...,-0.008533,0.371899,-0.039626,-0.159318,-0.949063,0.22877,0.336033,0.044186,4.29,0
142408,123723.0,0.129189,0.919443,-0.583558,-0.764522,1.159843,-0.194323,0.791388,0.132294,-0.16784,...,-0.323167,-0.84025,0.037828,0.011545,-0.431753,0.130124,0.217175,0.06738,0.89,0


In [72]:
### index of fraudulent transactions

In [73]:
groundtruthDF.idxmax()

Time      215875
V1        274790
V2        163779
V3        250986
V4        241303
V5        225810
V6        161354
V7        161354
V8        191412
V9        215150
V10       210369
V11       278879
V12       258261
V13       161354
V14       270334
V15       161354
V16       161354
V17       161637
V18       238575
V19       194742
V20       238575
V21       199185
V22       180741
V23       179267
V24       161354
V25       178361
V26       142643
V27       161354
V28       279693
Amount    161354
Class     142667
dtype: int64