# Creates a subset of the data for webapp
- Shuffles the dataset and splits it into trainingDF and groundtruthDF
- Splits off a small set of data
- Creates a JSON call for each record
- Aggregates results

In [2]:
import pandas as pd
import json

In [108]:
# Load the full creditcard.csv dataset, then shuffle every row so that the
# positional split into training / ground-truth halves further down is random.
# A fixed random_state keeps the shuffle -- and therefore every file and count
# derived from it -- reproducible across "Restart Kernel -> Run All".
dataDF = pd.read_csv("/mnt/data/Fraud_Detection/creditcard.csv")
dataDF = dataDF.sample(frac=1, random_state=42).reset_index(drop=True)

## Creates big training data & ground truth data

In [109]:
# Total number of rows in the shuffled dataset and the midpoint used to split it
maxLimit=dataDF.shape[0]
halfLimit=maxLimit//2
print("Max:"+str(maxLimit)+" Half:"+str(halfLimit))

# First half of the shuffled rows -> training set, saved as CSV
trainingDF=dataDF.iloc[:halfLimit]
filename="/mnt/data/Fraud_Detection/training.csv"
trainingDF.to_csv(filename, encoding='utf-8', index=False)

# Second half -> ground-truth set, saved as CSV.
# iloc slices exclude their end bound, so the training slice stops just before
# row halfLimit; starting here at halfLimit and running to the end of the frame
# guarantees the two sets are disjoint and that no row is left out.
groundtruthDF=dataDF.iloc[halfLimit:]
filename="/mnt/data/Fraud_Detection/groundtruth.csv"
groundtruthDF.to_csv(filename, encoding='utf-8', index=False)

Max:284807 Half:142403


### Number of records with fraudulent transactions in training and groundtruth

In [110]:
# Count the rows labelled as fraud (Class == 1) in each half of the split
isFraudT = trainingDF['Class'] == 1
isFraudGT = groundtruthDF['Class'] == 1
fraudCountT = len(trainingDF[isFraudT])
fraudCountGT = len(groundtruthDF[isFraudGT])
print(f"#Frauds in Training:{fraudCountT} #Frauds in GroundTruth:{fraudCountGT}")

#Frauds in Training:248 #Frauds in GroundTruth:244


### Creates tiny sets of testing data in /mnt/data/Fraud_Detection/webapp_sets/ from groundtruth
While we are at it, let's rename the column Time to Hour so it works with the model and remove the Class column as we are not supposed to know the result

In [104]:
# continues code here
def createRecordSet(fractionFraudulent,nblines,sourceDF=None):
    """Build a shuffled sample with a chosen number of fraudulent rows.

    Rows are taken from the top of the fraud (Class == 1) and legit
    (Class == 0) subsets of the source frame, concatenated, then shuffled.

    Args:
        fractionFraudulent: number of fraudulent rows to include (a row
            count, not a ratio; it equals a percentage only when nblines
            is 100). Coerced with int().
        nblines: total number of rows wanted in the sample.
        sourceDF: frame to draw from; must have a 'Class' column. Defaults
            to the notebook-level groundtruthDF.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, or None (after printing
        "Too many lines!") when the source does not hold enough fraudulent
        or legit rows to satisfy the request. When fractionFraudulent is
        greater than or equal to nblines, only the fraudulent rows are
        returned.
    """
    if sourceDF is None:
        sourceDF=groundtruthDF

    nbFraud=int(fractionFraudulent)
    nbLegit=nblines-nbFraud
    fraudPool=sourceDF.loc[sourceDF['Class'] == 1]
    legitPool=sourceDF.loc[sourceDF['Class'] == 0]

    # Validate each class against its own pool: iloc would otherwise silently
    # return fewer rows than requested and skew the advertised fraud share.
    if nbFraud>fraudPool.shape[0] or nbLegit>legitPool.shape[0]:
        print("Too many lines!")
        return(None)

    fraudDF=fraudPool.iloc[0:nbFraud]
    if nbLegit>0:
        legitDF=legitPool.iloc[0:nbLegit]
        returnDF=pd.concat([legitDF, fraudDF], ignore_index=True, sort=False)
    else:
        returnDF=fraudDF
    # Shuffle so fraudulent rows are not grouped at the end of the sample
    return(returnDF.sample(frac=1).reset_index(drop=True))

In [105]:
# Number of fraudulent rows to put in each 100-row sample. Because every sample
# has exactly 100 rows, that count is also the fraud percentage in the filename.
# NOTE(review): the section text above says Time is renamed to Hour and Class is
# removed, but the frames written here keep both columns exactly as returned by
# createRecordSet -- confirm which behaviour the webapp expects.
fractions=(0,1,10,50,100)
for x in fractions:
    myDF=createRecordSet(x,100)
    if myDF is None:
        # createRecordSet returns None when it cannot build the requested
        # sample; report and skip instead of failing on None.to_csv
        print("Skipping records_"+str(x)+"pc_fraud.csv: no record set could be built")
        continue
    filename="/mnt/data/Fraud_Detection/webapp_sets/records_"+str(x)+"pc_fraud.csv"
    myDF.to_csv(filename, encoding='utf-8', index=False)

In [100]:
# old version for creating chunks
#for i in range (0,4):
#    chunkDF=groundtruthDF.iloc[i*1000:(i+1)*1000,:-1].rename(columns={"Time":"Hour"})
#    filename="/mnt/data/Fraud_Detection/webapp_sets/records_set"+str(i)+".csv"
#    chunkDF.to_csv(filename, encoding='utf-8', index=False)
#chunkDF.head()

### Creates individual JSON files from a tiny set to test the API call

In [120]:
# Build a 5-row sample containing 1 fraudulent record, then write every row to
# its own JSON file shaped as { "data": {<column>: <value>, ...} }.
recordDF=createRecordSet(1,5)
if recordDF is None:
    # createRecordSet returns None when it cannot build the requested sample
    raise ValueError("createRecordSet(1,5) returned no data; cannot create the JSON records")

# Drop the label by name rather than by position, so the payload never carries
# the answer even if the column order of the source data changes.
payloadDF=recordDF.drop(columns=['Class'])

# Iterate over the rows actually present instead of a hard-coded count
for t in range(payloadDF.shape[0]):
    chunkDF=payloadDF.iloc[t]
    result = chunkDF.to_json(orient="index")
    final='{ "data": '+result+" }"
    filename="/mnt/data/Fraud_Detection/webapp_sets/unique_record"+str(t)+".json"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(final)
recordDF

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,58785.0,-0.99123,0.383246,1.419725,-0.85161,-1.059652,-0.852024,-0.192014,0.280544,-1.591594,...,0.233441,0.427368,-0.056555,0.379458,0.359639,-0.072873,-0.161776,-0.027215,39.65,0
1,51155.0,-11.205461,7.914633,-13.987752,4.333341,-8.48497,-3.506561,-8.935243,7.704449,-2.336584,...,0.942593,-0.987848,-0.279446,-0.027299,0.644344,-0.263078,1.084023,0.211933,99.99,1
2,62346.0,1.049547,-0.275417,0.828154,0.571077,-0.639592,0.239669,-0.487189,0.201429,0.320254,...,0.196761,0.653981,-0.121094,0.100568,0.350442,0.613651,-0.003712,0.011687,51.0,0
3,122678.0,1.848558,-0.478917,-1.0068,0.646239,-0.37574,-0.882943,0.046305,-0.234386,0.765361,...,0.036503,0.046354,0.114275,-0.063654,-0.193171,0.259019,-0.062533,-0.04177,100.0,0
4,163385.0,2.130327,-0.603784,-1.039881,-1.662901,-0.350725,-0.663592,-0.397661,-0.190456,2.051923,...,-0.283994,-0.558878,0.231179,-0.995032,-0.248695,-0.500854,0.030387,-0.046598,8.22,0
