In [1]:
import random
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


In [2]:
# Read the five training sets from disk, one DataFrame per CSV file
train_files = (
    'data/train1.csv',
    'data/train2.csv',
    'data/train3.csv',
    'data/train4.csv',
    'data/train5.csv',
)
data1, data2, data3, data4, data5 = [pd.read_csv(csv_path) for csv_path in train_files]


In [12]:
# Gather the five training DataFrames into one list so later cells can loop over them
data = [
    data1,
    data2,
    data3,
    data4,
    data5,
]

In [13]:
# Separate every training set into its feature columns and its 'Fraud' label,
# collecting the [features, label] pairs in X_Y for the resampling cells below
feature_columns = [
    'Rndrng_NPI', 'Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr',
    'Rndrng_Prvdr_State_Abrvtn', 'HCPCS_Cd', 'Tot_Srvcs', 'Tot_Benes',
    'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Pymt_Amt',
]
X_Y = []
for train_set in data:
    features = train_set[feature_columns]
    label = train_set['Fraud']
    # Kept as a two-item list: later cells read it positionally (d[0], d[1])
    X_Y.append([features, label])
    # Show the class balance of this training set before any resampling
    print(Counter(label))

Counter({0: 599496, 1: 504})
Counter({0: 599496, 1: 504})
Counter({0: 599496, 1: 504})
Counter({0: 599496, 1: 504})
Counter({0: 599496, 1: 504})


In [26]:
# Random oversampling: for every target ratio, resample each training set and
# keep the results grouped per ratio in data_arr (data_arr[ratio_idx][set_idx])
ratios = [0.01,0.25,2/3,1]
data_arr = []
for i in ratios:
    print(f"Ratio: {i}")
    sample = []
    for features, label in X_Y:
        # The seed is derived from the ratio, so each ratio has its own fixed seed
        ros = RandomOverSampler(sampling_strategy=i, random_state=int(4012/i))
        X_res, y_res = ros.fit_resample(features, label)
        # Report the class distribution after resampling
        print(Counter(y_res))
        # Re-attach the label so the stored frame holds features and target together
        X_res['Fraud'] = y_res
        sample.append(X_res)
    data_arr.append(sample)

Ratio: 0.01
Counter({0: 599496, 1: 5994})
Counter({0: 599496, 1: 5994})
Counter({0: 599496, 1: 5994})
Counter({0: 599496, 1: 5994})
Counter({0: 599496, 1: 5994})
Ratio: 0.25
Counter({0: 599496, 1: 149874})
Counter({0: 599496, 1: 149874})
Counter({0: 599496, 1: 149874})
Counter({0: 599496, 1: 149874})
Counter({0: 599496, 1: 149874})
Ratio: 0.6666666666666666
Counter({0: 599496, 1: 399664})
Counter({0: 599496, 1: 399664})
Counter({0: 599496, 1: 399664})
Counter({0: 599496, 1: 399664})
Counter({0: 599496, 1: 399664})
Ratio: 1
Counter({1: 599496, 0: 599496})
Counter({1: 599496, 0: 599496})
Counter({1: 599496, 0: 599496})
Counter({1: 599496, 0: 599496})
Counter({1: 599496, 0: 599496})


In [30]:
# Saving the Oversampling data to csv: one file per (ratio, training set) pair,
# named data/ROS_<ratio>_<set number>.csv with set numbers starting at 1.
# The loop names deliberately avoid `data`, so the list of training sets built
# earlier in the notebook is not overwritten and earlier cells stay re-runnable.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column; pass index=False if the readers of these files do not expect it.
for ratio_value, resampled_sets in zip(ratios, data_arr):
    for set_number, resampled_set in enumerate(resampled_sets, start=1):
        resampled_set.to_csv(f'data/ROS_{ratio_value}_{set_number}.csv')

In [31]:
# Random undersampling: for every target ratio, resample each training set and
# keep the results grouped per ratio in data_arr (data_arr[ratio_idx][set_idx])
ratios = [0.01,0.25,2/3,1]
data_arr = []
for i in ratios:
    print(f"Ratio: {i}")
    sample = []
    for features, label in X_Y:
        # The seed is derived from the ratio, so each ratio has its own fixed seed
        rus = RandomUnderSampler(sampling_strategy=i, random_state=int(4012/i))
        X_res, y_res = rus.fit_resample(features, label)
        # Report the class distribution after resampling
        print(Counter(y_res))
        # Re-attach the label so the stored frame holds features and target together
        X_res['Fraud'] = y_res
        sample.append(X_res)
    data_arr.append(sample)

Ratio: 0.01
Counter({0: 50400, 1: 504})
Counter({0: 50400, 1: 504})
Counter({0: 50400, 1: 504})
Counter({0: 50400, 1: 504})
Counter({0: 50400, 1: 504})
Ratio: 0.25
Counter({0: 2016, 1: 504})
Counter({0: 2016, 1: 504})
Counter({0: 2016, 1: 504})
Counter({0: 2016, 1: 504})
Counter({0: 2016, 1: 504})
Ratio: 0.6666666666666666
Counter({0: 756, 1: 504})
Counter({0: 756, 1: 504})
Counter({0: 756, 1: 504})
Counter({0: 756, 1: 504})
Counter({0: 756, 1: 504})
Ratio: 1
Counter({0: 504, 1: 504})
Counter({0: 504, 1: 504})
Counter({0: 504, 1: 504})
Counter({0: 504, 1: 504})
Counter({0: 504, 1: 504})


In [32]:
# Saving the Undersampling data to csv: one file per (ratio, training set) pair,
# named data/RUS_<ratio>_<set number>.csv with set numbers starting at 1.
# The loop names deliberately avoid `data`, so the list of training sets built
# earlier in the notebook is not overwritten and earlier cells stay re-runnable.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column; pass index=False if the readers of these files do not expect it.
for ratio_value, resampled_sets in zip(ratios, data_arr):
    for set_number, resampled_set in enumerate(resampled_sets, start=1):
        resampled_set.to_csv(f'data/RUS_{ratio_value}_{set_number}.csv')

In [35]:
# Performing ROS-RUS: first randomly undersample the majority class down to
# roughly `i` times its original size, then randomly oversample the minority
# class until both classes are the same size (sampling_strategy=1).
# Results are grouped per ratio in data_arr (data_arr[ratio_idx][set_idx]).
data_arr = []
ratios = [0.1,0.25,0.5]
for i in ratios:
    sample = []
    print(f"Ratio: {i}")
    for features, label in X_Y:
        # Take the class sizes from the training set itself instead of hard-coding them,
        # so the target ratio stays correct if the input data changes.
        class_counts = Counter(label)
        n_minority = min(class_counts.values())
        n_majority = max(class_counts.values())
        # Minority/majority ratio that leaves about i * n_majority majority rows
        rus_strategy = n_minority / (i * n_majority)
        undersample = RandomUnderSampler(sampling_strategy=rus_strategy, random_state=int(4012/i))
        X_under, y_under = undersample.fit_resample(features, label)
        # Balance the reduced set by duplicating minority rows up to a 1:1 ratio
        oversample = RandomOverSampler(sampling_strategy=1, random_state=int(4012/i))
        X_over_under, y_over_under = oversample.fit_resample(X_under, y_under)
        # Report the class distribution after both resampling steps
        print(Counter(y_over_under))
        # Re-attach the label so the stored frame holds features and target together
        X_over_under['Fraud'] = y_over_under
        sample.append(X_over_under)
    data_arr.append(sample)


Ratio: 0.1
Counter({0: 59949, 1: 59949})
Counter({0: 59949, 1: 59949})
Counter({0: 59949, 1: 59949})
Counter({0: 59949, 1: 59949})
Counter({0: 59949, 1: 59949})
Ratio: 0.25
Counter({0: 149874, 1: 149874})
Counter({0: 149874, 1: 149874})
Counter({0: 149874, 1: 149874})
Counter({0: 149874, 1: 149874})
Counter({0: 149874, 1: 149874})
Ratio: 0.5
Counter({0: 299748, 1: 299748})
Counter({0: 299748, 1: 299748})
Counter({0: 299748, 1: 299748})
Counter({0: 299748, 1: 299748})
Counter({0: 299748, 1: 299748})


In [36]:
# Saving the ROS-RUS data to csv: one file per (ratio, training set) pair,
# named data/ROS_RUS_<ratio>_<set number>.csv with set numbers starting at 1.
# The loop names deliberately avoid `data`, so the list of training sets built
# earlier in the notebook is not overwritten and earlier cells stay re-runnable.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column; pass index=False if the readers of these files do not expect it.
for ratio_value, resampled_sets in zip(ratios, data_arr):
    for set_number, resampled_set in enumerate(resampled_sets, start=1):
        resampled_set.to_csv(f'data/ROS_RUS_{ratio_value}_{set_number}.csv')
