In [1]:
import numpy as np
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import time
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold, StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, precision_recall_curve

In [2]:
train_data = pd.read_csv('FeatureEngineeredTrain.csv')
test_data = pd.read_csv('FeatureEngineeredTest.csv')

In [3]:
train_data['PricingStrategy'] = train_data['PricingStrategy'].replace({0:1, 1:2, 2:3})
test_data['PricingStrategy'] = test_data['PricingStrategy'].replace({0:1, 1:2, 2:3})

In [4]:
train_data['Charges_per_Pricing'] = train_data['ExtraCharges']/train_data['PricingStrategy']
test_data['Charges_per_Pricing'] = test_data['ExtraCharges']/test_data['PricingStrategy']

In [5]:
def time_in_seconds(time_series):
    time = time_series.split(' ')[1].split(':')
    hr, mn, sec = int(time[0]), int(time[1]), int(time[2])
    return hr*3600 + mn*60 + sec

train_data['Time_in_Secs'] = train_data['Time'].map(time_in_seconds)
test_data['Time_in_Secs'] = test_data['Time'].map(time_in_seconds)

In [6]:
train_data['Cyclic_Time_x'] = train_data['Time_in_Secs'].map(lambda x: math.sin(2*math.pi*x/86400))
train_data['Cyclic_Time_y'] = train_data['Time_in_Secs'].map(lambda x: math.cos(2*math.pi*x/86400))

test_data['Cyclic_Time_x'] = test_data['Time_in_Secs'].map(lambda x: math.sin(2*math.pi*x/86400))
test_data['Cyclic_Time_y'] = test_data['Time_in_Secs'].map(lambda x: math.cos(2*math.pi*x/86400))

In [7]:
def week_of_trans(time_series):
    time = time_series.split('T')[0].split('-')
    day = int(time[2])
    
    if day in range(1, 8):
        return 'first_week'
    elif day in range(8, 15):
        return 'second_week'
    elif day in range(15, 22):
        return 'third_week'
    elif day in range(22, 29):
        return 'fourth_week'
    else:
        return 'fifth_week'
    
train_data['Week'] = train_data['TransactionStartTime'].map(week_of_trans)
test_data['Week'] = test_data['TransactionStartTime'].map(week_of_trans)

In [8]:
train_data['Amount_to_mean_CustomerId'] = train_data['Amount']/train_data.groupby('CustomerId')['Amount'].transform('mean')
train_data['Amount_to_std_CustomerId'] = train_data['Amount']/train_data.groupby('CustomerId')['Amount'].transform('std')

train_data['Amount_to_mean_ProviderId'] = train_data['Amount']/train_data.groupby('ProviderId')['Amount'].transform('mean')
train_data['Amount_to_std_ProviderId'] = train_data['Amount']/train_data.groupby('ProviderId')['Amount'].transform('std')

train_data['Amount_to_mean_PricingStrategy'] = train_data['Amount']/train_data.groupby('PricingStrategy')['Amount'].transform('mean')
train_data['Amount_to_std_PricingStrategy'] = train_data['Amount']/train_data.groupby('PricingStrategy')['Amount'].transform('std')

train_data['Amount_to_mean_ProductCategory'] = train_data['Amount']/train_data.groupby('ProductCategory')['Amount'].transform('mean')
train_data['Amount_to_std_ProductCategory'] = train_data['Amount']/train_data.groupby('ProductCategory')['Amount'].transform('std')

train_data['Amount_to_mean_ChannelId'] = train_data['Amount']/train_data.groupby('ChannelId')['Amount'].transform('mean')
train_data['Amount_to_std_ChannelId'] = train_data['Amount']/train_data.groupby('ChannelId')['Amount'].transform('std')

train_data['Amount_to_mean_ProductId'] = train_data['Amount']/train_data.groupby('ProductId')['Amount'].transform('mean')
train_data['Amount_to_std_ProductId'] = train_data['Amount']/train_data.groupby('ProductId')['Amount'].transform('std')

In [9]:
train_data['Amount_to_std_CustomerId'].head()

0    0.328699
1   -0.006574
2         inf
3    1.164874
4   -0.037509
Name: Amount_to_std_CustomerId, dtype: float64

In [10]:
test_data['Amount_to_mean_CustomerId'] = test_data['Amount']/test_data.groupby('CustomerId')['Amount'].transform('mean')
test_data['Amount_to_std_CustomerId'] = test_data['Amount']/test_data.groupby('CustomerId')['Amount'].transform('std')

test_data['Amount_to_mean_ProviderId'] = test_data['Amount']/test_data.groupby('ProviderId')['Amount'].transform('mean')
test_data['Amount_to_std_ProviderId'] = test_data['Amount']/test_data.groupby('ProviderId')['Amount'].transform('std')

test_data['Amount_to_mean_PricingStrategy'] = test_data['Amount']/test_data.groupby('PricingStrategy')['Amount'].transform('mean')
test_data['Amount_to_std_PricingStrategy'] = test_data['Amount']/test_data.groupby('PricingStrategy')['Amount'].transform('std')

test_data['Amount_to_mean_ProductCategory'] = test_data['Amount']/test_data.groupby('ProductCategory')['Amount'].transform('mean')
test_data['Amount_to_std_ProductCategory'] = test_data['Amount']/test_data.groupby('ProductCategory')['Amount'].transform('std')

test_data['Amount_to_mean_ChannelId'] = test_data['Amount']/test_data.groupby('ChannelId')['Amount'].transform('mean')
test_data['Amount_to_std_ChannelId'] = test_data['Amount']/test_data.groupby('ChannelId')['Amount'].transform('std')

test_data['Amount_to_mean_ProductId'] = test_data['Amount']/test_data.groupby('ProductId')['Amount'].transform('mean')
test_data['Amount_to_std_ProductId'] = test_data['Amount']/test_data.groupby('ProductId')['Amount'].transform('std')

In [11]:
train_data = train_data.replace([np.inf, -np.inf], np.nan)
test_data = test_data.replace([np.inf, -np.inf], np.nan)

In [12]:
train_data.isin([np.inf, -np.inf, np.nan]).all(axis='columns').value_counts()

False    95662
dtype: int64

In [13]:
original_train = train_data.copy()
original_test = test_data.copy()

In [14]:
null_cols = train_data.columns[np.where(train_data.isnull().sum()>0)]

for col in null_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [15]:
train_data['Time_in_Secs'].sample()

38230    46808
Name: Time_in_Secs, dtype: int64

In [16]:
train_data['Time'].head()

0    2018-11-15 02:18:49
1    2018-11-15 02:19:08
2    2018-11-15 02:44:21
3    2018-11-15 03:32:55
4    2018-11-15 03:34:21
Name: Time, dtype: object

In [17]:
train_data['Time'].dtype

dtype('O')

In [18]:
from scipy.special import i0

In [19]:
train_data['Time'] = pd.to_datetime(train_data['Time'])
test_data['Time'] = pd.to_datetime(test_data['Time'])

In [20]:
train_data['Time_in_Secs'] = train_data['Time_in_Secs'].map(lambda x: 2*math.pi*x/86400)
test_data['Time_in_Secs'] = test_data['Time_in_Secs'].map(lambda x: 2*math.pi*x/86400)

In [22]:
def mn(array):
    a = np.power(np.sum(np.cos(array)), 2)
    b = np.power(np.sum(np.sin(array)), 2)
    c = np.sum(np.cos(array))
    d = np.sum(np.sin(array))
    return 2*np.arctan(d/((np.sqrt(a+b) + c)))



def std(array):
    n = len(array)
    a = np.power((np.sum(np.sin(array)))/n, 2)
    b = np.power((np.sum(np.cos(array)))/n, 2)
    return np.sqrt(np.log(1/(a+b)))

In [26]:
def distribution(df):
    inv_std, time, mean, STD = df[0], df[1], df[2], df[3]
    return np.exp(inv_std*np.cos(time - mean*STD))/(2*np.pi*i0(inv_std))

In [29]:
#math.exp(inv_std*math.cos(t - mn*std))/(2*math.pi*i0(inv_std))

In [32]:
from scipy.stats import sem, t
from scipy import mean
confidence_95 = 0.95
confidence_90 = 0.90

In [33]:
def lower_interval_95(array):
    n = len(array)
    m = mean(array)
    std_err = sem(array)
    h = std_err * t.ppf((1 + confidence_95) / 2, n - 1)

    start = m - h
    return start

    
def upper_interval_95(array):
    n = len(array)
    m = mean(array)
    std_err = sem(array)
    h = std_err * t.ppf((1 + confidence_95) / 2, n - 1)

    end = m + h
    return end
    
    
    
def lower_interval_90(array):
    n = len(array)
    m = mean(array)
    std_err = sem(array)
    h = std_err * t.ppf((1 + confidence_90) / 2, n - 1)

    start = m - h
    return start

    
def upper_interval_90(array):
    n = len(array)
    m = mean(array)
    std_err = sem(array)
    h = std_err * t.ppf((1 + confidence_90) / 2, n - 1)

    end = m + h
    return end
    

In [41]:
agg_func_7days_95 = train_data.set_index('Time').groupby('CustomerId').rolling('7d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_7days_95 = agg_func_7days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_7days_0.95', 'std' : 'std_7days_0.95'})
train_data = pd.merge(train_data, agg_func_7days_95, on=['CustomerId', 'Time'], how='left')

train_data['std_7days_0.95'] = train_data['std_7days_0.95'].fillna(0)
train_data['inv_std_7days_0.95'] = 1/train_data['std_7days_0.95']
train_data['inv_std_7days_0.95'] = train_data['inv_std_7days_0.95'].replace([np.inf, -np.inf], np.nan)
train_data['inv_std_7days_0.95'] = train_data['inv_std_7days_0.95'].fillna(0)

train_data['vonmises_time_7days_0.95'] = train_data[['inv_std_7days_0.95', 'Time_in_Secs', 'mean_7days_0.95', 'std_7days_0.95']].apply(distribution, axis=1)
train_data['vonmises_time_7days_0.95'].fillna(train_data['vonmises_time_7days_0.95'].mean(), inplace=True)

low_7days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('7d')['vonmises_time_7days_0.95'].apply(lower_interval_95)
low_7days_95 = low_7days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_7days_95 = low_7days_95.rename(columns={'vonmises_time_7days_0.95' : 'lower_interval_7days_0.95'})
train_data = pd.merge(train_data, low_7days_95, on=['CustomerId', 'Time'], how='left')

high_7days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('7d')['vonmises_time_7days_0.95'].apply(upper_interval_95)
high_7days_95 = high_7days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_7days_95 = high_7days_95.rename(columns={'vonmises_time_7days_0.95' : 'upper_interval_7days_0.95'})
train_data = pd.merge(train_data, high_7days_95, on=['CustomerId', 'Time'], how='left')

train_data['lower_interval_7days_0.95'].fillna(train_data['lower_interval_7days_0.95'].mean(), inplace=True)
train_data['upper_interval_7days_0.95'].fillna(train_data['upper_interval_7days_0.95'].mean(), inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]
  keepdims=keepdims)


In [42]:
train_data['Within_CI95_7days'] = train_data[['vonmises_time_7days_0.95', 'lower_interval_7days_0.95', 'upper_interval_7days_0.95']].apply(check, axis=1)

In [45]:
train_data.drop(['mean_7days_0.95', 'std_7days_0.95', 'inv_std_7days_0.95', 'vonmises_time_7days_0.95', 'lower_interval_7days_0.95', 'upper_interval_7days_0.95'], axis=1, inplace=True)

In [46]:
train_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_std_ProviderId', 'Amount_to_mean_PricingStrategy',
       'Amount_to_std_PricingStrategy', 'Amount_to_mean_ProductCategory',
       'Amount_to_std_ProductCategory', 'Amount_to_mean_ChannelId',
       'Amount_to_std_ChannelId', 'Amount_to_mean_ProductId',
       'Amount_to_std_ProductId', 'Within_CI95_7days'],
      dtype='object', length=147)

In [52]:
train_data['Within_CI95_7days'].value_counts()#.values.sum()

Yes    47925
No     47737
Name: Within_CI95_7days, dtype: int64

In [49]:
agg_func_14days_95 = train_data.set_index('Time').groupby('CustomerId').rolling('14d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_14days_95 = agg_func_14days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_14days_0.95', 'std' : 'std_14days_0.95'})
train_data = pd.merge(train_data, agg_func_14days_95, on=['CustomerId', 'Time'], how='left')

train_data['std_14days_0.95'] = train_data['std_14days_0.95'].fillna(0)
train_data['inv_std_14days_0.95'] = 1/train_data['std_14days_0.95']
train_data['inv_std_14days_0.95'] = train_data['inv_std_14days_0.95'].replace([np.inf, -np.inf], np.nan)
train_data['inv_std_14days_0.95'] = train_data['inv_std_14days_0.95'].fillna(0)

train_data['vonmises_time_14days_0.95'] = train_data[['inv_std_14days_0.95', 'Time_in_Secs', 'mean_14days_0.95', 'std_14days_0.95']].apply(distribution, axis=1)
train_data['vonmises_time_14days_0.95'].fillna(train_data['vonmises_time_14days_0.95'].mean(), inplace=True)

low_14days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('14d')['vonmises_time_14days_0.95'].apply(lower_interval_95)
low_14days_95 = low_14days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_14days_95 = low_14days_95.rename(columns={'vonmises_time_14days_0.95' : 'lower_interval_14days_0.95'})
train_data = pd.merge(train_data, low_14days_95, on=['CustomerId', 'Time'], how='left')

high_14days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('14d')['vonmises_time_14days_0.95'].apply(upper_interval_95)
high_14days_95 = high_14days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_14days_95 = high_14days_95.rename(columns={'vonmises_time_14days_0.95' : 'upper_interval_14days_0.95'})
train_data = pd.merge(train_data, high_14days_95, on=['CustomerId', 'Time'], how='left')

train_data['lower_interval_14days_0.95'].fillna(train_data['lower_interval_14days_0.95'].mean(), inplace=True)
train_data['upper_interval_14days_0.95'].fillna(train_data['upper_interval_14days_0.95'].mean(), inplace=True)

train_data['Within_CI95_14days'] = train_data[['vonmises_time_14days_0.95', 'lower_interval_14days_0.95', 'upper_interval_14days_0.95']].apply(check, axis=1)

train_data.drop(['mean_14days_0.95', 'std_14days_0.95', 'inv_std_14days_0.95', 'vonmises_time_14days_0.95', 'lower_interval_14days_0.95', 'upper_interval_14days_0.95'], axis=1, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]
  keepdims=keepdims)


In [50]:
train_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_mean_PricingStrategy', 'Amount_to_std_PricingStrategy',
       'Amount_to_mean_ProductCategory', 'Amount_to_std_ProductCategory',
       'Amount_to_mean_ChannelId', 'Amount_to_std_ChannelId',
       'Amount_to_mean_ProductId', 'Amount_to_std_ProductId',
       'Within_CI95_7days', 'Within_CI95_14days'],
      dtype='object', length=148)

In [51]:
train_data['Within_CI95_14days'].value_counts()

No     55670
Yes    39992
Name: Within_CI95_14days, dtype: int64

In [53]:
agg_func_21days_95 = train_data.set_index('Time').groupby('CustomerId').rolling('21d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_21days_95 = agg_func_21days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_21days_0.95', 'std' : 'std_21days_0.95'})
train_data = pd.merge(train_data, agg_func_21days_95, on=['CustomerId', 'Time'], how='left')

train_data['std_21days_0.95'] = train_data['std_21days_0.95'].fillna(0)
train_data['inv_std_21days_0.95'] = 1/train_data['std_21days_0.95']
train_data['inv_std_21days_0.95'] = train_data['inv_std_21days_0.95'].replace([np.inf, -np.inf], np.nan)
train_data['inv_std_21days_0.95'] = train_data['inv_std_21days_0.95'].fillna(0)

train_data['vonmises_time_21days_0.95'] = train_data[['inv_std_21days_0.95', 'Time_in_Secs', 'mean_21days_0.95', 'std_21days_0.95']].apply(distribution, axis=1)
train_data['vonmises_time_21days_0.95'].fillna(train_data['vonmises_time_21days_0.95'].mean(), inplace=True)

low_21days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('21d')['vonmises_time_21days_0.95'].apply(lower_interval_95)
low_21days_95 = low_21days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_21days_95 = low_21days_95.rename(columns={'vonmises_time_21days_0.95' : 'lower_interval_21days_0.95'})
train_data = pd.merge(train_data, low_21days_95, on=['CustomerId', 'Time'], how='left')

high_21days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('21d')['vonmises_time_21days_0.95'].apply(upper_interval_95)
high_21days_95 = high_21days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_21days_95 = high_21days_95.rename(columns={'vonmises_time_21days_0.95' : 'upper_interval_21days_0.95'})
train_data = pd.merge(train_data, high_21days_95, on=['CustomerId', 'Time'], how='left')

train_data['lower_interval_21days_0.95'].fillna(train_data['lower_interval_21days_0.95'].mean(), inplace=True)
train_data['upper_interval_21days_0.95'].fillna(train_data['upper_interval_21days_0.95'].mean(), inplace=True)

train_data['Within_CI95_21days'] = train_data[['vonmises_time_21days_0.95', 'lower_interval_21days_0.95', 'upper_interval_21days_0.95']].apply(check, axis=1)

train_data.drop(['mean_21days_0.95', 'std_21days_0.95', 'inv_std_21days_0.95', 'vonmises_time_21days_0.95', 'lower_interval_21days_0.95', 'upper_interval_21days_0.95'], axis=1, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]
  keepdims=keepdims)


In [54]:
train_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_std_PricingStrategy', 'Amount_to_mean_ProductCategory',
       'Amount_to_std_ProductCategory', 'Amount_to_mean_ChannelId',
       'Amount_to_std_ChannelId', 'Amount_to_mean_ProductId',
       'Amount_to_std_ProductId', 'Within_CI95_7days', 'Within_CI95_14days',
       'Within_CI95_21days'],
      dtype='object', length=149)

In [55]:
train_data['Within_CI95_14days'].value_counts()

No     55670
Yes    39992
Name: Within_CI95_14days, dtype: int64

In [56]:
agg_func_28days_95 = train_data.set_index('Time').groupby('CustomerId').rolling('28d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_28days_95 = agg_func_28days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_28days_0.95', 'std' : 'std_28days_0.95'})
train_data = pd.merge(train_data, agg_func_28days_95, on=['CustomerId', 'Time'], how='left')

train_data['std_28days_0.95'] = train_data['std_28days_0.95'].fillna(0)
train_data['inv_std_28days_0.95'] = 1/train_data['std_28days_0.95']
train_data['inv_std_28days_0.95'] = train_data['inv_std_28days_0.95'].replace([np.inf, -np.inf], np.nan)
train_data['inv_std_28days_0.95'] = train_data['inv_std_28days_0.95'].fillna(0)

train_data['vonmises_time_28days_0.95'] = train_data[['inv_std_28days_0.95', 'Time_in_Secs', 'mean_28days_0.95', 'std_28days_0.95']].apply(distribution, axis=1)
train_data['vonmises_time_28days_0.95'].fillna(train_data['vonmises_time_28days_0.95'].mean(), inplace=True)

low_28days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('28d')['vonmises_time_28days_0.95'].apply(lower_interval_95)
low_28days_95 = low_28days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_28days_95 = low_28days_95.rename(columns={'vonmises_time_28days_0.95' : 'lower_interval_28days_0.95'})
train_data = pd.merge(train_data, low_28days_95, on=['CustomerId', 'Time'], how='left')

high_28days_95 = train_data.set_index('Time').groupby(['CustomerId']).rolling('28d')['vonmises_time_28days_0.95'].apply(upper_interval_95)
high_28days_95 = high_28days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_28days_95 = high_28days_95.rename(columns={'vonmises_time_28days_0.95' : 'upper_interval_28days_0.95'})
train_data = pd.merge(train_data, high_28days_95, on=['CustomerId', 'Time'], how='left')

train_data['lower_interval_28days_0.95'].fillna(train_data['lower_interval_28days_0.95'].mean(), inplace=True)
train_data['upper_interval_28days_0.95'].fillna(train_data['upper_interval_28days_0.95'].mean(), inplace=True)

train_data['Within_CI95_28days'] = train_data[['vonmises_time_28days_0.95', 'lower_interval_28days_0.95', 'upper_interval_28days_0.95']].apply(check, axis=1)

train_data.drop(['mean_28days_0.95', 'std_28days_0.95', 'inv_std_28days_0.95', 'vonmises_time_28days_0.95', 'lower_interval_28days_0.95', 'upper_interval_28days_0.95'], axis=1, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]


In [57]:
train_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_mean_ProductCategory', 'Amount_to_std_ProductCategory',
       'Amount_to_mean_ChannelId', 'Amount_to_std_ChannelId',
       'Amount_to_mean_ProductId', 'Amount_to_std_ProductId',
       'Within_CI95_7days', 'Within_CI95_14days', 'Within_CI95_21days',
       'Within_CI95_28days'],
      dtype='object', length=150)

In [58]:
train_data['Within_CI95_14days'].value_counts()

No     55670
Yes    39992
Name: Within_CI95_14days, dtype: int64

In [59]:
agg_func_7days_95 = test_data.set_index('Time').groupby('CustomerId').rolling('7d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_7days_95 = agg_func_7days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_7days_0.95', 'std' : 'std_7days_0.95'})
test_data = pd.merge(test_data, agg_func_7days_95, on=['CustomerId', 'Time'], how='left')

test_data['std_7days_0.95'] = test_data['std_7days_0.95'].fillna(0)
test_data['inv_std_7days_0.95'] = 1/test_data['std_7days_0.95']
test_data['inv_std_7days_0.95'] = test_data['inv_std_7days_0.95'].replace([np.inf, -np.inf], np.nan)
test_data['inv_std_7days_0.95'] = test_data['inv_std_7days_0.95'].fillna(0)

test_data['vonmises_time_7days_0.95'] = test_data[['inv_std_7days_0.95', 'Time_in_Secs', 'mean_7days_0.95', 'std_7days_0.95']].apply(distribution, axis=1)
test_data['vonmises_time_7days_0.95'].fillna(test_data['vonmises_time_7days_0.95'].mean(), inplace=True)

low_7days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('7d')['vonmises_time_7days_0.95'].apply(lower_interval_95)
low_7days_95 = low_7days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_7days_95 = low_7days_95.rename(columns={'vonmises_time_7days_0.95' : 'lower_interval_7days_0.95'})
test_data = pd.merge(test_data, low_7days_95, on=['CustomerId', 'Time'], how='left')

high_7days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('7d')['vonmises_time_7days_0.95'].apply(upper_interval_95)
high_7days_95 = high_7days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_7days_95 = high_7days_95.rename(columns={'vonmises_time_7days_0.95' : 'upper_interval_7days_0.95'})
test_data = pd.merge(test_data, high_7days_95, on=['CustomerId', 'Time'], how='left')

test_data['lower_interval_7days_0.95'].fillna(test_data['lower_interval_7days_0.95'].mean(), inplace=True)
test_data['upper_interval_7days_0.95'].fillna(test_data['upper_interval_7days_0.95'].mean(), inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]


In [60]:
test_data['Within_CI95_7days'] = test_data[['vonmises_time_7days_0.95', 'lower_interval_7days_0.95', 'upper_interval_7days_0.95']].apply(check, axis=1)
test_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_std_ChannelId', 'Amount_to_mean_ProductId',
       'Amount_to_std_ProductId', 'mean_7days_0.95', 'std_7days_0.95',
       'inv_std_7days_0.95', 'vonmises_time_7days_0.95',
       'lower_interval_7days_0.95', 'upper_interval_7days_0.95',
       'Within_CI95_7days'],
      dtype='object', length=152)

In [62]:
test_data.drop(['mean_7days_0.95', 'std_7days_0.95', 'inv_std_7days_0.95', 'vonmises_time_7days_0.95', 'lower_interval_7days_0.95', 'upper_interval_7days_0.95'], axis=1, inplace=True)
test_data['Within_CI95_7days'].value_counts()#.values.sum()

Yes    24076
No     20943
Name: Within_CI95_7days, dtype: int64

In [63]:
test_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_std_ProviderId', 'Amount_to_mean_PricingStrategy',
       'Amount_to_std_PricingStrategy', 'Amount_to_mean_ProductCategory',
       'Amount_to_std_ProductCategory', 'Amount_to_mean_ChannelId',
       'Amount_to_std_ChannelId', 'Amount_to_mean_ProductId',
       'Amount_to_std_ProductId', 'Within_CI95_7days'],
      dtype='object', length=146)

In [64]:
agg_func_14days_95 = test_data.set_index('Time').groupby('CustomerId').rolling('14d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_14days_95 = agg_func_14days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_14days_0.95', 'std' : 'std_14days_0.95'})
test_data = pd.merge(test_data, agg_func_14days_95, on=['CustomerId', 'Time'], how='left')

test_data['std_14days_0.95'] = test_data['std_14days_0.95'].fillna(0)
test_data['inv_std_14days_0.95'] = 1/test_data['std_14days_0.95']
test_data['inv_std_14days_0.95'] = test_data['inv_std_14days_0.95'].replace([np.inf, -np.inf], np.nan)
test_data['inv_std_14days_0.95'] = test_data['inv_std_14days_0.95'].fillna(0)

test_data['vonmises_time_14days_0.95'] = test_data[['inv_std_14days_0.95', 'Time_in_Secs', 'mean_14days_0.95', 'std_14days_0.95']].apply(distribution, axis=1)
test_data['vonmises_time_14days_0.95'].fillna(test_data['vonmises_time_14days_0.95'].mean(), inplace=True)

low_14days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('14d')['vonmises_time_14days_0.95'].apply(lower_interval_95)
low_14days_95 = low_14days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_14days_95 = low_14days_95.rename(columns={'vonmises_time_14days_0.95' : 'lower_interval_14days_0.95'})
test_data = pd.merge(test_data, low_14days_95, on=['CustomerId', 'Time'], how='left')

high_14days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('14d')['vonmises_time_14days_0.95'].apply(upper_interval_95)
high_14days_95 = high_14days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_14days_95 = high_14days_95.rename(columns={'vonmises_time_14days_0.95' : 'upper_interval_14days_0.95'})
test_data = pd.merge(test_data, high_14days_95, on=['CustomerId', 'Time'], how='left')

test_data['lower_interval_14days_0.95'].fillna(test_data['lower_interval_14days_0.95'].mean(), inplace=True)
test_data['upper_interval_14days_0.95'].fillna(test_data['upper_interval_14days_0.95'].mean(), inplace=True)

test_data['Within_CI95_14days'] = test_data[['vonmises_time_14days_0.95', 'lower_interval_14days_0.95', 'upper_interval_14days_0.95']].apply(check, axis=1)

test_data.drop(['mean_14days_0.95', 'std_14days_0.95', 'inv_std_14days_0.95', 'vonmises_time_14days_0.95', 'lower_interval_14days_0.95', 'upper_interval_14days_0.95'], axis=1, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]


In [65]:
test_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_mean_PricingStrategy', 'Amount_to_std_PricingStrategy',
       'Amount_to_mean_ProductCategory', 'Amount_to_std_ProductCategory',
       'Amount_to_mean_ChannelId', 'Amount_to_std_ChannelId',
       'Amount_to_mean_ProductId', 'Amount_to_std_ProductId',
       'Within_CI95_7days', 'Within_CI95_14days'],
      dtype='object', length=147)

In [67]:
test_data['Within_CI95_14days'].value_counts()

No     23516
Yes    21503
Name: Within_CI95_14days, dtype: int64

In [69]:
agg_func_21days_95 = test_data.set_index('Time').groupby('CustomerId').rolling('21d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_21days_95 = agg_func_21days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_21days_0.95', 'std' : 'std_21days_0.95'})
test_data = pd.merge(test_data, agg_func_21days_95, on=['CustomerId', 'Time'], how='left')

test_data['std_21days_0.95'] = test_data['std_21days_0.95'].fillna(0)
test_data['inv_std_21days_0.95'] = 1/test_data['std_21days_0.95']
test_data['inv_std_21days_0.95'] = test_data['inv_std_21days_0.95'].replace([np.inf, -np.inf], np.nan)
test_data['inv_std_21days_0.95'] = test_data['inv_std_21days_0.95'].fillna(0)

test_data['vonmises_time_21days_0.95'] = test_data[['inv_std_21days_0.95', 'Time_in_Secs', 'mean_21days_0.95', 'std_21days_0.95']].apply(distribution, axis=1)
test_data['vonmises_time_21days_0.95'].fillna(test_data['vonmises_time_21days_0.95'].mean(), inplace=True)

low_21days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('21d')['vonmises_time_21days_0.95'].apply(lower_interval_95)
low_21days_95 = low_21days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_21days_95 = low_21days_95.rename(columns={'vonmises_time_21days_0.95' : 'lower_interval_21days_0.95'})
test_data = pd.merge(test_data, low_21days_95, on=['CustomerId', 'Time'], how='left')

high_21days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('21d')['vonmises_time_21days_0.95'].apply(upper_interval_95)
high_21days_95 = high_21days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_21days_95 = high_21days_95.rename(columns={'vonmises_time_21days_0.95' : 'upper_interval_21days_0.95'})
test_data = pd.merge(test_data, high_21days_95, on=['CustomerId', 'Time'], how='left')

test_data['lower_interval_21days_0.95'].fillna(test_data['lower_interval_21days_0.95'].mean(), inplace=True)
test_data['upper_interval_21days_0.95'].fillna(test_data['upper_interval_21days_0.95'].mean(), inplace=True)

test_data['Within_CI95_21days'] = test_data[['vonmises_time_21days_0.95', 'lower_interval_21days_0.95', 'upper_interval_21days_0.95']].apply(check, axis=1)

test_data.drop(['mean_21days_0.95', 'std_21days_0.95', 'inv_std_21days_0.95', 'vonmises_time_21days_0.95', 'lower_interval_21days_0.95', 'upper_interval_21days_0.95'], axis=1, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]
  keepdims=keepdims)


In [70]:
test_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_std_PricingStrategy', 'Amount_to_mean_ProductCategory',
       'Amount_to_std_ProductCategory', 'Amount_to_mean_ChannelId',
       'Amount_to_std_ChannelId', 'Amount_to_mean_ProductId',
       'Amount_to_std_ProductId', 'Within_CI95_7days', 'Within_CI95_14days',
       'Within_CI95_21days'],
      dtype='object', length=148)

In [71]:
test_data['Within_CI95_21days'].value_counts()

No     23516
Yes    21503
Name: Within_CI95_14days, dtype: int64

In [72]:
agg_func_28days_95 = test_data.set_index('Time').groupby('CustomerId').rolling('28d')['Time_in_Secs'].agg([mn, std]).reset_index()
agg_func_28days_95 = agg_func_28days_95.drop_duplicates(subset=['CustomerId', 'Time']).rename(columns={'mn' : 'mean_28days_0.95', 'std' : 'std_28days_0.95'})
test_data = pd.merge(test_data, agg_func_28days_95, on=['CustomerId', 'Time'], how='left')

test_data['std_28days_0.95'] = test_data['std_28days_0.95'].fillna(0)
test_data['inv_std_28days_0.95'] = 1/test_data['std_28days_0.95']
test_data['inv_std_28days_0.95'] = test_data['inv_std_28days_0.95'].replace([np.inf, -np.inf], np.nan)
test_data['inv_std_28days_0.95'] = test_data['inv_std_28days_0.95'].fillna(0)

test_data['vonmises_time_28days_0.95'] = test_data[['inv_std_28days_0.95', 'Time_in_Secs', 'mean_28days_0.95', 'std_28days_0.95']].apply(distribution, axis=1)
test_data['vonmises_time_28days_0.95'].fillna(test_data['vonmises_time_28days_0.95'].mean(), inplace=True)

low_28days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('28d')['vonmises_time_28days_0.95'].apply(lower_interval_95)
low_28days_95 = low_28days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
low_28days_95 = low_28days_95.rename(columns={'vonmises_time_28days_0.95' : 'lower_interval_28days_0.95'})
test_data = pd.merge(test_data, low_28days_95, on=['CustomerId', 'Time'], how='left')

high_28days_95 = test_data.set_index('Time').groupby(['CustomerId']).rolling('28d')['vonmises_time_28days_0.95'].apply(upper_interval_95)
high_28days_95 = high_28days_95.reset_index().drop_duplicates(subset=['CustomerId', 'Time'])
high_28days_95 = high_28days_95.rename(columns={'vonmises_time_28days_0.95' : 'upper_interval_28days_0.95'})
test_data = pd.merge(test_data, high_28days_95, on=['CustomerId', 'Time'], how='left')

test_data['lower_interval_28days_0.95'].fillna(test_data['lower_interval_28days_0.95'].mean(), inplace=True)
test_data['upper_interval_28days_0.95'].fillna(test_data['upper_interval_28days_0.95'].mean(), inplace=True)

test_data['Within_CI95_28days'] = test_data[['vonmises_time_28days_0.95', 'lower_interval_28days_0.95', 'upper_interval_28days_0.95']].apply(check, axis=1)

test_data.drop(['mean_28days_0.95', 'std_28days_0.95', 'inv_std_28days_0.95', 'vonmises_time_28days_0.95', 'lower_interval_28days_0.95', 'upper_interval_28days_0.95'], axis=1, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]


In [73]:
test_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       ...
       'Amount_to_mean_ProductCategory', 'Amount_to_std_ProductCategory',
       'Amount_to_mean_ChannelId', 'Amount_to_std_ChannelId',
       'Amount_to_mean_ProductId', 'Amount_to_std_ProductId',
       'Within_CI95_7days', 'Within_CI95_14days', 'Within_CI95_21days',
       'Within_CI95_28days'],
      dtype='object', length=149)

In [74]:
test_data['Within_CI95_28days'].value_counts()

No     23994
Yes    21025
Name: Within_CI95_28days, dtype: int64

In [76]:
train_data.tail()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,...,Amount_to_mean_ProductCategory,Amount_to_std_ProductCategory,Amount_to_mean_ChannelId,Amount_to_std_ChannelId,Amount_to_mean_ProductId,Amount_to_std_ProductId,Within_CI95_7days,Within_CI95_14days,Within_CI95_21days,Within_CI95_28days
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,...,-0.087447,-0.005666,0.25641,-0.072763,1.106786,-0.556767,Yes,No,No,No
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,...,1.215131,0.043294,0.073433,0.006297,0.611768,0.03377,No,No,No,No
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,...,-0.001749,-0.000113,0.005128,-0.001455,0.022136,-0.011135,No,No,No,No
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,...,0.180276,0.071339,0.220299,0.018892,0.460657,0.496177,No,No,No,No
95661,TransactionId_35670,BatchId_29317,AccountId_4841,SubscriptionId_3829,CustomerId_1709,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,...,-0.005247,-0.00034,0.015385,-0.004366,0.066407,-0.033406,No,No,No,No


In [77]:
train_ids = train_data[['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']]
test_ids = test_data[['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']]

train_ids.to_csv('trainIds.csv', index=False)
test_ids.to_csv('testIds.csv', index=False)

In [3]:
train_data.drop(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 
                 'TransactionStartTime', 'Past_one_Week', 'Date'], 
                axis=1, inplace=True
               )

testTransId = test_data['TransactionId']
test_data.drop(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 
                 'TransactionStartTime', 'Past_one_Week', 'Date'], 
                axis=1, inplace=True
               )

In [79]:
train_data.drop('Time', axis=1, inplace=True)
test_data.drop('Time', axis=1, inplace=True)

In [80]:
cat_columns = ['Year']

for col in cat_columns:
    train_data[col] = train_data[col].astype(str)
    test_data[col] = test_data[col].astype(str)

In [81]:
train_data.drop('Hour', axis=1, inplace=True)
test_data.drop('Hour', axis=1, inplace=True)

In [82]:
train_data.shape

(95662, 140)

In [83]:
test_data.shape

(45019, 139)

In [84]:
train_data.select_dtypes(include='object').columns

Index(['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Year',
       'Month', 'Weekday', 'Weekday/Weekend', 'Holiday', 'Form', 'Suspicious',
       'Multiple_Sub_by_Customer', 'Multiple_Sub_by_Acc', 'Week',
       'Within_CI95_7days', 'Within_CI95_14days', 'Within_CI95_21days',
       'Within_CI95_28days'],
      dtype='object')

In [85]:
train_data.select_dtypes(exclude='object').columns

Index(['Amount', 'PricingStrategy', 'FraudResult', 'No. Customers per Acc',
       'ExtraCharges', 'Days_to_Nearest_Holiday', 'Prev_Amt',
       'Time_since_last_trans', 'Avg_amt_per_ProviderId_in_last_12hhrs',
       'sum_of_amt_per_ProviderId_in_last_12hhrs',
       ...
       'Amount_to_mean_ProviderId', 'Amount_to_std_ProviderId',
       'Amount_to_mean_PricingStrategy', 'Amount_to_std_PricingStrategy',
       'Amount_to_mean_ProductCategory', 'Amount_to_std_ProductCategory',
       'Amount_to_mean_ChannelId', 'Amount_to_std_ChannelId',
       'Amount_to_mean_ProductId', 'Amount_to_std_ProductId'],
      dtype='object', length=122)

In [86]:
fraudResult = train_data['FraudResult']

train_num = train_data.drop('FraudResult', axis=1).select_dtypes(exclude='object')
test_num = test_data.select_dtypes(exclude='object')

train_cat = train_data.select_dtypes(include='object')
test_cat = test_data.select_dtypes(include='object')

train_cat['label'] = 1
test_cat['label'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [87]:
combined = pd.concat([train_cat, test_cat], axis=0)#.reset_index(drop=True)

In [88]:
combined = pd.get_dummies(combined, columns=combined.columns[:-1], drop_first=True)

train_cat = combined[combined['label']==1]
test_cat = combined[combined['label']==0]

In [89]:
train_cat.drop('label', axis=1, inplace=True)
test_cat.drop('label', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [90]:
train_data = pd.concat([train_num, train_cat, fraudResult], axis=1)
test_data = pd.concat([test_num, test_cat], axis=1)

In [91]:
np.where(train_data.isnull().sum()>0)

(array([], dtype=int64),)

In [93]:
test_data.shape

(45019, 191)

In [94]:
train_data.to_csv('FinalTrain31Aug.csv', index=False)
test_data.to_csv('FinalTest31Aug.csv', index=False)

In [10]:
testTransId.to_csv('testTransId.csv', index=False)

  """Entry point for launching an IPython kernel.
