## Load Data

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
card= pd.read_excel('card transactions.xlsx')
card = card[card['Transtype']=='P'] #We only study the Transtype P here
card.head()

Unnamed: 0,Recnum,Cardnum,Date,Merchnum,Merch description,Merch state,Merch zip,Transtype,Amount,Fraud
0,1,5142190439,2010-01-01,5509006296254,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0
1,2,5142183973,2010-01-01,61003026333,SERVICE MERCHANDISE #81,MA,1803.0,P,31.42,0
2,3,5142131721,2010-01-01,4503082993600,OFFICE DEPOT #191,MD,20706.0,P,178.49,0
3,4,5142148452,2010-01-01,5509006296254,FEDEX SHP 12/28/09 AB#,TN,38118.0,P,3.62,0
4,5,5142190439,2010-01-01,5509006296254,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0


## Fill in Missing Values

In [2]:
card.info() #There are NAs in Merch state, Merch zip
## We don't want to study Merchnum, since this field and Merch description are both unique identifier for merchants
## while Merch description doesn't have NAs.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96398 entries, 0 to 96752
Data columns (total 10 columns):
Recnum               96398 non-null int64
Cardnum              96398 non-null int64
Date                 96398 non-null datetime64[ns]
Merchnum             93199 non-null object
Merch description    96398 non-null object
Merch state          95377 non-null object
Merch zip            92097 non-null float64
Transtype            96398 non-null object
Amount               96398 non-null float64
Fraud                96398 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 8.1+ MB


In [3]:
## Add values into zip by major corresponding values group by merch description, new field is zip_x
merNum_Des = card.dropna(axis=0).groupby('Merch description').agg({'Merch zip':lambda x:x.value_counts().index[0]})
newcard = card.merge(merNum_Des,right_index=True,left_on='Merch description',how='left')
newcard['Merch zip_x'].fillna(newcard['Merch zip_y'],inplace=True)

In [4]:
## Add values into zip by major corresponding values group by cardnum, new field is zip_x_x
merNum_Des = newcard.dropna(axis=0).groupby('Cardnum').agg({'Merch zip_x':lambda x:x.value_counts().index[0]})
newcard = newcard.merge(merNum_Des,right_index=True,left_on='Cardnum',how='left')
newcard['Merch zip_x_x'].fillna(newcard['Merch zip_x_y'],inplace=True)

In [5]:
## Add values into state by major corresponding values group by zip, new field is state_x
a = newcard[newcard['Merch zip_x_x'].notna()] 
b = a[a['Merch state'].notna()] ## b is data without NAs in zip_x_x and state
merNum_Des = b.groupby('Merch zip_x_x').agg({'Merch state':lambda x:x.value_counts().index[0]})
newcard = newcard.merge(merNum_Des,right_index=True,left_on='Merch zip_x_x',how='left')
newcard['Merch state_x'].fillna(newcard['Merch state_y'],inplace=True)
newcard.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96398 entries, 0 to 96752
Data columns (total 13 columns):
Recnum               96398 non-null int64
Cardnum              96398 non-null int64
Date                 96398 non-null datetime64[ns]
Merchnum             93199 non-null object
Merch description    96398 non-null object
Merch state_x        96290 non-null object
Merch zip_x_x        96344 non-null float64
Transtype            96398 non-null object
Amount               96398 non-null float64
Fraud                96398 non-null int64
Merch zip_y          93185 non-null float64
Merch zip_x_y        96334 non-null float64
Merch state_y        96277 non-null object
dtypes: datetime64[ns](1), float64(4), int64(3), object(5)
memory usage: 10.3+ MB


In [6]:
newcard[newcard['Merch state_x'].isna()].head()

Unnamed: 0,Recnum,Cardnum,Date,Merchnum,Merch description,Merch state_x,Merch zip_x_x,Transtype,Amount,Fraud,Merch zip_y,Merch zip_x_y,Merch state_y
3258,3259,5142153880,2010-01-14,582582822587,DIGITAL TECHNOLOGY CONTRA,,926.0,P,2340.0,0,,20746.0,
3262,3263,5142154098,2010-01-14,582582822587,DIGITAL TECHNOLOGY CONTRA,,926.0,P,2387.0,0,,20639.0,
3540,3541,5142154098,2010-01-17,582582822587,DIGITAL TECHNOLOGY CONTRA,,926.0,P,2300.0,0,,20639.0,
3642,3643,5142153880,2010-01-17,582582822587,DIGITAL TECHNOLOGY CONTRA,,926.0,P,2500.0,0,,20746.0,
4969,4970,5142194136,2010-01-24,597597721468,CRISTALIA ACQUISITION COR,,929.0,P,83.0,0,,90640.0,


In [7]:
## We searched the missing value in Zip on Google and find their corresponding State.
dict = {"907.0":"PR", "922.0":"PR", "920.0":"PR", "801.0":"USVI","31040.0":"GA", "41160.0":"KY", "934.0": "PR",
"902.0": "PR", "738.0": "PR", "90805.0": "CA", "76302.0": "TX", "914.0": "PR", "95461.0": "CA", "50823.0": "Other", 
'926.0': "PR", '929.0':"PR", '1400.0':"Other", '65132.0':"Other", '86899.0':"Other", '23080.0':"Other",
'60528.0':"Other", "48700.0": "CA", "680.0": "PR", "681.0": "PR", "623.0": "PR", "726.0": "PR", "936.0": "PR",
"791.0": "PR", "12108.0": "Other", "nan":'Other'}

In [8]:
## We used the previous dictionary to fill in missing fields in state.
ab = newcard[newcard['Merch state_x'].isna()]
ab['Merch zip_x_x']=ab['Merch zip_x_x'].astype('str')
for i in range(len(ab['Merch zip_x_x'])):
    ab['Merch state_x'].iloc[i]=dict[ab['Merch zip_x_x'].iloc[i]]

In [9]:
## We replace the values in newcard dataset with filled data
ac = newcard[newcard['Merch state_x'].notna()][['Merch state_x']]
ad = pd.concat([ab[['Merch state_x']],ac])
newcard['Merch state_x'] = ad

In [10]:
## No NAs in field state now
newcard[newcard['Merch state_x'].isna()]

Unnamed: 0,Recnum,Cardnum,Date,Merchnum,Merch description,Merch state_x,Merch zip_x_x,Transtype,Amount,Fraud,Merch zip_y,Merch zip_x_y,Merch state_y


In [11]:
newcard['Merch zip_x_x'].fillna(0,inplace=True)

In [12]:
# Select the useful columns and change the column name
fdata = newcard[["Recnum","Cardnum","Date","Merch description","Merch state_x","Merch zip_x_x","Transtype","Amount","Fraud"]]
fdata.columns = ["Recnum","Cardnum","Date","Merch description","Merch state","Merch zip","Transtype","Amount","Fraud"]

In [13]:
fdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96398 entries, 0 to 96752
Data columns (total 9 columns):
Recnum               96398 non-null int64
Cardnum              96398 non-null int64
Date                 96398 non-null datetime64[ns]
Merch description    96398 non-null object
Merch state          96398 non-null object
Merch zip            96398 non-null float64
Transtype            96398 non-null object
Amount               96398 non-null float64
Fraud                96398 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 7.4+ MB


# Variable Creation

## Amount Variable

In [132]:
%%time
fdata['Date']=pd.to_datetime(fdata['Date'])
tot=pd.DataFrame({'Date':[],'Amount':[], 'Cardnum':[]})
for cardnum in fdata['Cardnum'].unique():
    c=fdata[fdata['Cardnum']==cardnum]
    f=c.groupby('Date')['Amount'].sum()
    f1=f.rolling('7d').sum()
    f1=f1/7
    f1=pd.DataFrame(f1)
    f1.reset_index(inplace=True)
    f1['Cardnum'] = [cardnum]* len(f1)
    tot=pd.concat([tot,f1])

CPU times: user 17.4 s, sys: 109 ms, total: 17.5 s
Wall time: 17.5 s


In [227]:
md = pd.DataFrame(fdata['Merch description'].unique(),columns=['md'])
md.reset_index(inplace=True)

In [228]:
fdata=fdata.merge(md,right_on='md',left_on='Merch description',how='left')
fdata.head()

Unnamed: 0,Recnum,Cardnum,Date,Merch description,Merch state,Merch zip,Transtype,Amount,Fraud,index,md
0,1,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,0,FEDEX SHP 12/23/09 AB#
1,2,5142183973,2010-01-01,SERVICE MERCHANDISE #81,MA,1803.0,P,31.42,0,1,SERVICE MERCHANDISE #81
2,3,5142131721,2010-01-01,OFFICE DEPOT #191,MD,20706.0,P,178.49,0,2,OFFICE DEPOT #191
3,4,5142148452,2010-01-01,FEDEX SHP 12/28/09 AB#,TN,38118.0,P,3.62,0,3,FEDEX SHP 12/28/09 AB#
4,5,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,0,FEDEX SHP 12/23/09 AB#


In [193]:
%%time
desc='FEDEX SHP 12/23/09 AB#'

c=fdata1.ix[desc]
c.reset_index(inplace=True)
c=c[['Recnum','Merch description','Date']]
diff = []
for i,val in enumerate(c['Date']):
    if i == 0:
        diff.append(0)
    else:
        diff.append((c.Date.iloc[i] - c.Date.iloc[i-1]).days)
c['Diff']=diff
c

CPU times: user 42.9 ms, sys: 2.06 ms, total: 45 ms
Wall time: 43.6 ms


## Day since Variables

In [16]:
## by Card
dsc=pd.DataFrame({'Recnum':[],'Cardnum':[],'Date':[],'Diff':[]})
f1data=fdata[['Recnum','Cardnum','Date']]
for cardnum in f1data['Cardnum'].unique():
    c=f1data[f1data['Cardnum']==cardnum]
    c.reset_index(inplace=True)
    diff = []
    for i,val in enumerate(c['Date']):
        if i == 0:
            diff.append(365)
        else:
            diff.append((c.Date.iloc[i] - c.Date.iloc[i-1]).days)
    c['Diff']=diff
    dsc=pd.concat([dsc,c])
dsc['Recnum'] = dsc['Recnum'].astype('int')
dsc = dsc[['Cardnum', 'Date', 'Diff', 'Recnum']]

In [17]:
dsc=dsc[['Diff', 'Recnum']]
dsc.columns=['dscard','Recnum']
dsc

Unnamed: 0,dscard,Recnum
0,365.0,1
1,0.0,5
2,0.0,10
3,0.0,12
4,0.0,21
5,0.0,22
6,0.0,23
7,0.0,28
8,0.0,34
9,0.0,40


In [21]:
## By Merch
fdata.sort_values(['Merch description','Date'], inplace = True)
diff = []
f1 = fdata.reset_index()
for i in range(96398):
    if i == 0:
        diff.append(365)
    else:
        diff.append(max(0,(f1.Date.iloc[i] - f1.Date.iloc[i-1]).days))
f1['Diff']=diff
f2=f1.groupby('Merch description').first()
for rec in f2['Recnum']:
    f1['Diff'][f1['Recnum']==rec]=365

In [22]:
f1=f1[['Diff', 'Recnum']]
f1.columns=['dsmerch','Recnum']

In [23]:
## by Merchant & card
fdata.sort_values(['Merch description','Cardnum','Date'], inplace = True)
diff = []
f11 = fdata.reset_index()
for i in range(96398):
    if i == 0:
        diff.append(365)
    else:
        diff.append(max(0,(f11.Date.iloc[i] - f11.Date.iloc[i-1]).days))
f11['Diff']=diff
f12=f11.groupby(['Merch description','Cardnum']).first()
for rec in f12['Recnum']:
    f11['Diff'][f11['Recnum']==rec]=365

In [24]:
f11=f11[['Diff', 'Recnum']]
f11.columns=['dsmerchcard','Recnum']

In [25]:
## by zip & card
fdata.sort_values(['Merch zip','Cardnum','Date'], inplace = True)
diff = []
f21 = fdata.reset_index()
for i in range(96398):
    if i == 0:
        diff.append(365)
    else:
        diff.append(max(0,(f21.Date.iloc[i] - f21.Date.iloc[i-1]).days))
f21['Diff']=diff
f22=f21.groupby(['Merch zip','Cardnum']).first()
for rec in f22['Recnum']:
    f21['Diff'][f21['Recnum']==rec]=365

In [26]:
f21=f21[['Diff', 'Recnum']]
f21.columns=['dszipcard','Recnum']

In [27]:
## by state & card
fdata.sort_values(['Merch state','Cardnum','Date'], inplace = True)
diff = []
f31 = fdata.reset_index()
for i in range(96398):
    if i == 0:
        diff.append(365)
    else:
        diff.append(max(0,(f31.Date.iloc[i] - f31.Date.iloc[i-1]).days))
f31['Diff']=diff
f32=f31.groupby(['Merch state','Cardnum']).first()
for rec in f32['Recnum']:
    f31['Diff'][f31['Recnum']==rec]=365

In [28]:
f31=f31[['Diff', 'Recnum']]
f31.columns=['dsstatecard','Recnum']

In [29]:
fdata1=fdata.merge(dsc,right_on='Recnum',left_on='Recnum',how='left')
fdata2=fdata1.merge(f1,right_on='Recnum',left_on='Recnum',how='left')
fdata3=fdata2.merge(f11,right_on='Recnum',left_on='Recnum',how='left')
fdata4=fdata3.merge(f21,right_on='Recnum',left_on='Recnum',how='left')
fdata5=fdata4.merge(f31,right_on='Recnum',left_on='Recnum',how='left')

In [30]:
fdata5

Unnamed: 0,Recnum,Cardnum,Date,Merch description,Merch state,Merch zip,Transtype,Amount,Fraud,dscard,dsmerch,dsmerchcard,dszipcard,dsstatecard
0,89917,5142111372,2010-11-30,COMMUNICATION SYSTEMS,AB,55337.0,P,48.58,0,13.0,365,365,212,365
1,44964,5142117641,2010-06-15,AGRICULTURE FOOD & RUR,AB,80202.0,P,37.00,0,5.0,365,365,10,365
2,80133,5142251947,2010-10-06,TROUT UNLIMITED CANADA,AB,98101.0,P,225.00,0,36.0,365,365,36,365
3,83156,5142254784,2010-10-25,ENVIRONMENTAL SERVICES AS,AB,98032.0,P,527.00,0,4.0,365,365,21,365
4,50777,5142291592,2010-07-06,SEQUITER SOFTWARE INC,AB,17201.0,P,752.83,0,8.0,365,365,14,365
5,467,5142111572,2010-01-04,GREENSTAR INC,AK,99501.0,P,100.00,0,365.0,365,365,365,365
6,1126,5142111572,2010-01-07,ARCTIC OFFICE MACHINE,AK,99645.0,P,300.00,0,1.0,365,365,365,3
7,1932,5142111572,2010-01-11,ALASKA PACIFIC UNIVERS,AK,99508.0,P,800.00,0,4.0,365,365,365,4
8,2816,5142111572,2010-01-13,UAA BOOKSTORE,AK,99509.0,P,98.90,0,2.0,365,365,365,2
9,3808,5142111572,2010-01-18,SUNSET SERVICES,AK,99503.0,P,65.00,0,5.0,365,365,365,5


In [31]:
ffdata = fdata5.sort_values('Recnum')

In [32]:
ffdata = ffdata[["Recnum","Cardnum","Date","Merch description","Merch state","Merch zip","Transtype","Amount","Fraud",'dscard','dsmerch','dsmerchcard','dszipcard','dsstatecard']]
ffdata

Unnamed: 0,Recnum,Cardnum,Date,Merch description,Merch state,Merch zip,Transtype,Amount,Fraud,dscard,dsmerch,dsmerchcard,dszipcard,dsstatecard
75187,1,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,365.0,0,365,0,0
31149,2,5142183973,2010-01-01,SERVICE MERCHANDISE #81,MA,1803.0,P,31.42,0,365.0,365,365,365,365
33302,3,5142131721,2010-01-01,OFFICE DEPOT #191,MD,20706.0,P,178.49,0,365.0,365,365,365,365
71917,4,5142148452,2010-01-01,FEDEX SHP 12/28/09 AB#,TN,38118.0,P,3.62,0,365.0,365,365,0,0
75188,5,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,0.0,0,0,0,0
73780,6,5142149874,2010-01-01,FEDEX SHP 12/22/09 AB#,TN,38118.0,P,3.67,0,365.0,0,365,365,365
74932,7,5142189277,2010-01-01,FEDEX SHP 12/28/09 AB#,TN,38118.0,P,3.62,0,365.0,0,365,365,365
56335,8,5142191182,2010-01-01,MIAMI COMPUTER SUPPLY,OH,45429.0,P,230.32,0,365.0,365,365,365,365
20150,9,5142258629,2010-01-01,FISHER SCI ATL,GA,30091.0,P,62.11,0,365.0,365,365,365,365
75189,10,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,0.0,0,0,0,0


In [33]:
ffdata.to_csv('ffdata.csv')

In [34]:
from scipy.stats import ks_2samp

In [65]:
mydata=ffdata[['Fraud', 'dscard', 'dsmerch','dsmerchcard', 'dszipcard', 'dsstatecard']]
goods=mydata[mydata['Fraud']==0]
bads=mydata[mydata['Fraud']==1]

In [80]:
KS={'Fraud':[0], 'dscard':[0], 'dsmerch':[0], 'dsmerchcard':[0], 'dszipcard':[0],'dsstatecard':[0]}
KS=pd.DataFrame.from_dict(KS, orient='index')
KS.reset_index(inplace=True)

In [84]:
KS['ks']=[0,0,0,0,0,0]

In [85]:
i=0
for column in list(mydata.columns):
    KS['ks'][i] = ks_2samp(goods[column],bads[column])[0]
    i = i+1

In [86]:
KS

Unnamed: 0,index,0,ks
0,Fraud,0,1
1,dscard,0,0
2,dsmerch,0,0
3,dsmerchcard,0,0
4,dszipcard,0,0
5,dsstatecard,0,0


In [90]:
mydata.sort_values('dscard').reset_index().index

RangeIndex(start=0, stop=96398, step=1)

In [94]:
mydata['dscardscore']=mydata.sort_values('dscard').reset_index().index
mydata['dsmerchscore']=mydata.sort_values('dsmerch').reset_index().index
mydata['dsmerchcardscore']=mydata.sort_values('dsmerchcard').reset_index().index
mydata['dsstatecardscore']=mydata.sort_values('dsstatecard').reset_index().index
mydata['dszipcardscore']=mydata.sort_values('dszipcard').reset_index().index

In [111]:
mydata.sort_values('dszipcard',ascending=False).iloc[:2892,]['Fraud'].sum()

58

In [104]:
mydata['Fraud'].sum()

1059

In [113]:
mydata1

Unnamed: 0.1,Unnamed: 0,V1,Recnum,Cardnum,Date,Merch description,Merch state,Merch zip,Transtype,Amount,...,cars_mz14_median,cars_mz14_min,cars_mz14_total,cars_mz14_count,cars_mz30_mean,cars_mz30_max,cars_mz30_median,cars_mz30_min,cars_mz30_total,cars_mz30_count
0,1,0,1,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118,P,3.62,...,3.620,3.62,3.62,1,3.620000,3.62,3.620,3.62,3.62,1
1,2,1,2,5142183973,2010-01-01,SERVICE MERCHANDISE #81,MA,1803,P,31.42,...,31.420,31.42,31.42,1,31.420000,31.42,31.420,31.42,31.42,1
2,3,2,3,5142131721,2010-01-01,OFFICE DEPOT #191,MD,20706,P,178.49,...,178.490,178.49,178.49,1,178.490000,178.49,178.490,178.49,178.49,1
3,4,3,4,5142148452,2010-01-01,FEDEX SHP 12/28/09 AB#,TN,38118,P,3.62,...,3.620,3.62,3.62,1,3.620000,3.62,3.620,3.62,3.62,1
4,5,4,5,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118,P,3.62,...,3.620,3.62,7.24,2,3.620000,3.62,3.620,3.62,7.24,2
5,6,5,6,5142149874,2010-01-01,FEDEX SHP 12/22/09 AB#,TN,38118,P,3.67,...,3.670,3.67,3.67,1,3.670000,3.67,3.670,3.67,3.67,1
6,7,6,7,5142189277,2010-01-01,FEDEX SHP 12/28/09 AB#,TN,38118,P,3.62,...,3.620,3.62,3.62,1,3.620000,3.62,3.620,3.62,3.62,1
7,8,7,8,5142191182,2010-01-01,MIAMI COMPUTER SUPPLY,OH,45429,P,230.32,...,230.320,230.32,230.32,1,230.320000,230.32,230.320,230.32,230.32,1
8,9,8,9,5142258629,2010-01-01,FISHER SCI ATL,GA,30091,P,62.11,...,62.110,62.11,62.11,1,62.110000,62.11,62.110,62.11,62.11,1
9,10,9,10,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118,P,3.62,...,3.620,3.62,10.86,3,3.620000,3.62,3.620,3.62,10.86,3


In [117]:
mydata1.sort_values('cars_mz14_total',ascending=False).iloc[:2892,]['Fraud'].sum()

630

In [2]:
import pandas as pd
import numpy as np
mydata = pd.read_csv('var310.csv')
mydata1 = pd.read_csv('ffdata.csv')

In [4]:
mydata1.head()

Unnamed: 0.1,Unnamed: 0,Recnum,Cardnum,Date,Merch description,Merch state,Merch zip,Transtype,Amount,Fraud,dscard,dsmerch,dsmerchcard,dszipcard,dsstatecard
0,75187,1,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,365.0,0,365,0,0
1,31149,2,5142183973,2010-01-01,SERVICE MERCHANDISE #81,MA,1803.0,P,31.42,0,365.0,365,365,365,365
2,33302,3,5142131721,2010-01-01,OFFICE DEPOT #191,MD,20706.0,P,178.49,0,365.0,365,365,365,365
3,71917,4,5142148452,2010-01-01,FEDEX SHP 12/28/09 AB#,TN,38118.0,P,3.62,0,365.0,365,365,0,0
4,75188,5,5142190439,2010-01-01,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,0.0,0,0,0,0


In [6]:
flag1 = mydata.merge(mydata1, left_on="Recnum", right_on="Recnum",how='left')

In [8]:
flag1.columns

Index(['Unnamed: 0_x', 'V1', 'Recnum', 'Cardnum_x', 'Date_x',
       'Merch description_x', 'Merch state_x', 'Merch zip_x', 'Transtype_x',
       'Amount_x',
       ...
       'Merch state_y', 'Merch zip_y', 'Transtype_y', 'Amount_y', 'Fraud_y',
       'dscard', 'dsmerch', 'dsmerchcard', 'dszipcard', 'dsstatecard'],
      dtype='object', length=325)

In [14]:
flag2 = flag1[flag1['Amount_x']!=max(flag1['Amount_x'])]

In [15]:
flag2.shape

(96397, 325)

In [28]:
flag2['Date_x']=pd.to_datetime(flag2['Date_x'])
flag3 = flag2[flag2['Date_x']<= pd.to_datetime('2010-11-01')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
type(flag3['Date_x'][1])

pandas._libs.tslibs.timestamps.Timestamp

In [31]:
flag3.to_csv('flag3.csv')

In [39]:
flag3['random'] = np.random.random(size = len(flag3['V1']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
goods=flag3[flag3['Fraud_x']==0]
bads=flag3[flag3['Fraud_x']==1]

In [81]:
from scipy.stats import ks_2samp
KS=[]
i=0
l = list(flag3.columns)
fl = l[10:-15]+l[-6:]
for column in fl:
    KS.append( [ks_2samp(goods[column],bads[column])[0],column])
    i = i+1

In [82]:
len(KS)

307

In [83]:
from operator import itemgetter
KS_flag=pd.DataFrame(sorted(KS, key=itemgetter(0),reverse=True))

In [85]:
KS_flag1 = KS_flag.reset_index()

In [87]:
KS_flag1['index']=KS_flag1['index']+1

In [77]:
l = list(flag3.columns)
fl = l[10:-15]+l[-6:]
fdrdic = {}
for i in fl:
    a = flag3[['Fraud_y',i]]
    fdr=a.sort_values(i,ascending = False)['Fraud_y'].iloc[:2526,].sum()/880
    fdrdic[i] = fdr

In [80]:
len(fdrdic)

307

In [90]:
fdrdic

{'Fraud_x': 1.0,
 'Actual_avg_card_0': 0.05113636363636364,
 'Actual_max_card_0': 0.013636363636363636,
 'Actual_med_card_0': 0.06363636363636363,
 'Actual_tot_card_0': 0.010227272727272727,
 'Actual_avg_card_1': 0.04659090909090909,
 'Actual_max_card_1': 0.013636363636363636,
 'Actual_med_card_1': 0.045454545454545456,
 'Actual_tot_card_1': 0.00909090909090909,
 'Actual_avg_card_3': 0.05568181818181818,
 'Actual_max_card_3': 0.026136363636363635,
 'Actual_med_card_3': 0.05454545454545454,
 'Actual_tot_card_3': 0.005681818181818182,
 'Actual_avg_card_7': 0.07272727272727272,
 'Actual_max_card_7': 0.029545454545454545,
 'Actual_med_card_7': 0.07954545454545454,
 'Actual_tot_card_7': 0.011363636363636364,
 'Actual_avg_card_14': 0.07386363636363637,
 'Actual_max_card_14': 0.03068181818181818,
 'Actual_med_card_14': 0.07727272727272727,
 'Actual_tot_card_14': 0.0125,
 'Actual_avg_card_30': 0.0875,
 'Actual_max_card_30': 0.03636363636363636,
 'Actual_med_card_30': 0.1125,
 'Actual_tot_card_

In [95]:
fdrdf = pd.DataFrame.from_dict(fdrdic,orient = 'index')
fdrdf.columns=['FDR']

In [98]:
fdr_flag = fdrdf.sort_values(by='FDR',ascending=False).reset_index().reset_index()

In [103]:
fdr_flag['level_0']=fdr_flag['level_0']+1

In [105]:
#KS and FDR Score
KS_flag1.columns=['KS_RANK','KS','VARIABLE']
fdr_flag.columns=['FDR_RANK','VARIABLE','FDR']

In [112]:
KS_flag1.to_csv('KS.csv')
fdr_flag.to_csv('FDR.csv')