In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.linear_model import Ridge
import time
from sklearn import preprocessing
import warnings
import datetime
warnings.filterwarnings("ignore")
import gc
from tqdm import tqdm

from scipy.stats import describe
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32/float64, based on each column's observed minimum and
    maximum. Range checks are inclusive, so a column whose values touch a
    dtype's limits (for example -128..127) still receives that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on this same object.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns that fit the float16 range are stored as float16, which
    keeps far fewer significant digits than float32/float64.
    """
    # Only columns with one of these dtypes are considered for downcasting.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First (narrowest) integer type whose inclusive range fits wins.
            # If nothing matches (e.g. min/max are NaN for an empty column),
            # the column is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type fits
            # (this also covers all-NaN or infinite extrema).
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte starting footprint reports 0% rather than NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [None]:
# Card-level tables: `first_active_month` is parsed into datetimes on load.
date_columns = ["first_active_month"]
train = pd.read_csv("../input/train.csv", parse_dates=date_columns)
test = pd.read_csv("../input/test.csv", parse_dates=date_columns)

# Transaction-level tables, read with pandas' default type inference.
his_t = pd.read_csv("../input/historical_transactions.csv")
new_m_t = pd.read_csv("../input/new_merchant_transactions.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first 10 rows of the new-merchant transactions table.
new_m_t.head(10)

In [None]:
# Preview the first 10 rows of the historical transactions table.
his_t.head(10)

In [None]:
# Loyalty scores sorted ascending and plotted against their rank.
sorted_scores = np.sort(train["target"].values)
plt.scatter(range(len(sorted_scores)), sorted_scores)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("Scatter of Loyalty score")
plt.show()

In [None]:
# Summary statistics of the training table's numeric columns.
train.describe()

In [None]:
# Histogram of the loyalty score: 150 bins, no KDE overlay.
plt.figure(figsize=(12, 8))
ax = sns.distplot(train["target"].values, bins=150, kde=False, color="red")
ax.set_title("Histogram of Loyalty score")
ax.set_xlabel('Loyalty score', fontsize=12)
plt.show()



In [None]:
# Non-null counts per column among rows whose target is below -30.
train.loc[train["target"] < -30].count()

In [None]:
# Violin plot of the loyalty score on a white-grid theme.
sns.set_style("whitegrid")
target_values = train["target"].values
sns.violinplot(x=target_values)
plt.show()

In [None]:
# Distribution of feature_1: x-axis is the feature value, y-axis is how many
# training rows fall into each bin.
plt.figure(figsize=(12,5))
plt.hist(train.feature_1.values,bins=150)
plt.title('Histogram feature_1 counts')
plt.xlabel('feature_1')
plt.ylabel('Count')
plt.show()

In [None]:
# Distribution of feature_2: x-axis is the feature value, y-axis is how many
# training rows fall into each bin.
plt.figure(figsize=(12,5))
plt.hist(train.feature_2.values,bins=150)
plt.title('Histogram feature_2 counts')
plt.xlabel('feature_2')
plt.ylabel('Count')
plt.show()

In [None]:
# Distribution of feature_3: x-axis is the feature value, y-axis is how many
# training rows fall into each bin.
plt.figure(figsize=(12, 5))
plt.hist(train.feature_3.values, bins=200)
plt.title('Histogram feature_3 counts')
plt.xlabel('feature_3')
plt.ylabel('Count')
plt.show()

In [None]:
# Preview the first 10 rows of the training table.
train.head(10)

In [None]:
# Impute missing values in both transaction tables with fixed defaults.
# NOTE(review): the defaults are presumably the most frequent value of each
# column — confirm against the data.
fill_values = {
    'category_2': 1.0,
    'category_3': 'A',
    'merchant_id': 'M_ID_00a6ca8a8a',
}
for df in [his_t,new_m_t]:
    for col, value in fill_values.items():
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on a column selection, which is not guaranteed to write through to
        # the parent frame (pandas Copy-on-Write).
        df[col] = df[col].fillna(value)

In [None]:
def get_new_columns(name,aggs):
    """Flatten an aggregation spec into ``<name>_<column>_<agg>`` column names.

    Parameters
    ----------
    name : str
        Prefix put in front of every generated name.
    aggs : dict
        Maps a column name to the list of aggregation names applied to it.

    Returns
    -------
    list of str
        One name per (column, aggregation) pair, in the dict's iteration
        order and, within a column, in the order of its aggregation list.
    """
    flat_names = []
    for column, funcs in aggs.items():
        for func in funcs:
            flat_names.append('_'.join((name, column, func)))
    return flat_names

In [None]:
# Derive calendar features and numeric encodings on both transaction tables.
#
# One reference timestamp is taken up front so `month_diff` is measured from
# the same instant for both frames (the original called today() per frame).
# NOTE(review): `month_diff` depends on the wall-clock date of the run, so
# feature values are not reproducible across days.
# NOTE: the .map() calls are not idempotent — re-running this cell maps the
# already-encoded values to NaN.
reference_time = datetime.datetime.today()
for df in [his_t,new_m_t]:
    df["purchase_date"]=pd.to_datetime(df["purchase_date"])
    df["year"]=df["purchase_date"].dt.year
    # NOTE(review): `.dt.weekofyear` exists only in older pandas releases.
    df["weekofyear"]=df["purchase_date"].dt.weekofyear
    df["month"]=df["purchase_date"].dt.month
    df["dayofweek"]=df["purchase_date"].dt.dayofweek
    # Saturday (5) and Sunday (6) count as weekend.
    df['weekend'] = (df["purchase_date"].dt.weekday >=5).astype(int)
    df["hour"]=df["purchase_date"].dt.hour
    # Encode the flag/category strings as integers.
    df["authorized_flag"]=df["authorized_flag"].map({"Y":1,"N":0})
    df["category_1"]=df["category_1"].map({"Y":1,"N":0})
    df['category_3'] = df['category_3'].map({'A':0, 'B':1, 'C':2})
    # Whole 30-day periods elapsed since the purchase, offset by month_lag.
    df["month_diff"]=((reference_time - df['purchase_date']).dt.days)//30
    df["month_diff"]+= df['month_lag']


In [None]:
# Aggregation spec for historical transactions:
# column name -> statistics computed within each card_id group.
aggs = {}
for col in ['subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']
for col in ['month', 'hour', 'weekofyear', 'dayofweek', 'year']:
    aggs[col] = ['nunique', 'mean', 'min', 'max']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean', 'min', 'max', 'var']
aggs['authorized_flag'] = ['sum', 'mean', 'min', 'max']
aggs['weekend'] = ['sum', 'mean', 'min', 'max']
aggs['category_1'] = ['sum', 'mean', 'min', 'max']
aggs['category_2'] = ['sum', 'mean', 'min', 'max']
aggs['category_3'] = ['sum', 'mean', 'min', 'max']
aggs['card_id'] = ['size', 'count']

# Tag each transaction with the mean purchase amount of its category value,
# then average that tag per card. Only the mean is aggregated, so the
# per-category min/max/sum columns the original also built (and never read)
# are no longer materialised on the full transaction table.
for col in ['category_2','category_3']:
    his_t[col+'_mean'] = his_t.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

# Flat names are generated from `aggs` in its insertion order.
# NOTE(review): this assumes agg() returns columns in that same order.
new_columns = get_new_columns('hist',aggs)
hist_trans_group = his_t.groupby('card_id').agg(aggs)
hist_trans_group.columns = new_columns
hist_trans_group = hist_trans_group.reset_index(drop=False)

# Span between first and last purchase, that span per transaction, and days
# since the last purchase (relative to the time this cell runs).
hist_trans_group['hist_purchase_date_diff'] = (hist_trans_group['hist_purchase_date_max'] - hist_trans_group['hist_purchase_date_min']).dt.days
hist_trans_group['hist_purchase_date_average'] = hist_trans_group['hist_purchase_date_diff']/hist_trans_group['hist_card_id_size']
hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - hist_trans_group['hist_purchase_date_max']).dt.days

# Left joins keep every train/test card even when it has no aggregated rows.
train = train.merge(hist_trans_group,on='card_id',how='left')
test = test.merge(hist_trans_group,on='card_id',how='left')
del hist_trans_group
gc.collect()

In [None]:
# Aggregation spec for new-merchant transactions:
# column name -> statistics computed within each card_id group.
aggs = {}
for col in ['subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']
for col in ['month', 'hour', 'weekofyear', 'dayofweek', 'year']:
    aggs[col] = ['nunique', 'mean', 'min', 'max']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean',"max","min","var"]
aggs['weekend'] = ['sum', 'mean',"min","max"]
aggs['category_1'] = ['sum', 'mean',"min","max"]
aggs['category_2'] = ['sum', 'mean', 'min',"max"]
aggs['category_3'] = ['sum', 'mean', 'min',"max"]
aggs['card_id'] = ['size']

# Tag each transaction with the mean purchase amount of its category value,
# then average that tag per card. Only the mean is aggregated, so the
# per-category min/max/sum columns the original also built (and never read)
# are no longer materialised on the full transaction table.
for col in ['category_2','category_3']:
    new_m_t[col+'_mean'] = new_m_t.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

# Flat names are generated from `aggs` in its insertion order.
# NOTE(review): this assumes agg() returns columns in that same order.
new_columns = get_new_columns('new_hist',aggs)
new_m_trans_group = new_m_t.groupby('card_id').agg(aggs)
new_m_trans_group.columns = new_columns
new_m_trans_group = new_m_trans_group.reset_index(drop=False)

new_m_trans_group['new_hist_purchase_date_diff'] = (new_m_trans_group['new_hist_purchase_date_max'] - new_m_trans_group['new_hist_purchase_date_min']).dt.days
new_m_trans_group['new_hist_purchase_date_average'] = new_m_trans_group['new_hist_purchase_date_diff']/new_m_trans_group['new_hist_card_id_size']

# One reference timestamp for both "days since" features so they are measured
# from the same instant (relative to the time this cell runs).
reference_time = datetime.datetime.today()
new_m_trans_group['new_hist_purchase_date_uptonow'] = (reference_time - new_m_trans_group['new_hist_purchase_date_max']).dt.days
new_m_trans_group['new_hist_purchase_date_uptomin'] = (reference_time - new_m_trans_group['new_hist_purchase_date_min']).dt.days

# Left joins keep every train/test card even when it has no aggregated rows.
train = train.merge(new_m_trans_group,on='card_id',how='left')
test = test.merge(new_m_trans_group,on='card_id',how='left')
del new_m_trans_group
gc.collect()

In [None]:
# Drop both raw transaction tables and reclaim their memory, then preview
# the enriched training table.
del his_t, new_m_t
gc.collect()
train.head(5)

In [None]:
# Binary outlier flag: 1 where the target is below -30, otherwise 0.
train['outliers'] = (train['target'] < -30).astype(np.int64)
train['outliers'].value_counts()

In [None]:
# Card-level calendar features, purchase-timing offsets and combined totals.
#
# The reference timestamp is taken once so `elapsed_time` is measured from the
# same instant for train and test (the original called today() per frame).
# NOTE(review): `elapsed_time` still depends on the wall-clock date of the run.
reference_time = datetime.datetime.today()
date_feature_names = ['hist_purchase_date_max', 'hist_purchase_date_min',
                      'new_hist_purchase_date_max', 'new_hist_purchase_date_min']
for df in [train, test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['dayofweek'] = df['first_active_month'].dt.dayofweek
    # NOTE(review): `.dt.weekofyear` exists only in older pandas releases.
    df['weekofyear'] = df['first_active_month'].dt.weekofyear
    df['dayofyear'] = df['first_active_month'].dt.dayofyear
    df['quarter'] = df['first_active_month'].dt.quarter
    df['is_month_start'] = df['first_active_month'].dt.is_month_start
    df['month'] = df['first_active_month'].dt.month
    df['elapsed_time'] = (reference_time - df['first_active_month']).dt.days
    # Days from card activation to the first/last purchase in each table.
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['hist_last_buy'] = (df['hist_purchase_date_max'] - df['first_active_month']).dt.days
    df['new_hist_first_buy'] = (df['new_hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_hist_last_buy'] = (df['new_hist_purchase_date_max'] - df['first_active_month']).dt.days
    # Convert the datetime columns to float seconds (int64 nanoseconds * 1e-9).
    # NOTE(review): rows that are NaT after the left merges are cast to an
    # integer here; depending on the pandas version that yields a large
    # negative sentinel rather than NaN — confirm this is intended.
    for f in date_feature_names:
        df[f] = df[f].astype(np.int64) * 1e-9
    df['card_id_total'] = df['new_hist_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_hist_purchase_amount_sum']+df['hist_purchase_amount_sum']

# Replace each categorical feature value with the outlier rate observed for
# that value in train; the same mapping is applied to test.
for f in ['feature_1','feature_2','feature_3']:
    order_label = train.groupby([f])['outliers'].mean()
    train[f] = train[f].map(order_label)
    test[f] = test[f].map(order_label)

In [None]:
# Model inputs are every column except identifiers, the raw activation date,
# the target and the outlier flag.
excluded_columns = {'card_id', 'first_active_month', 'target', 'outliers'}
train_columns = [c for c in train.columns if c not in excluded_columns]
# Detach the target from the frame and keep it as a standalone Series.
target = train.pop('target')

In [None]:
#train_x=train[train_columns]
#test_x=test[train_columns]

#train[train_columns].to_csv('train_3.csv', index=False)
#test[train_columns].to_csv('test_3.csv', index=False)



In [None]:
# LightGBM regression parameters.
# `min_child_samples` was removed: it is an alias of `min_data_in_leaf`, and
# supplying both with different values (20 vs 30) is ambiguous.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof_lgb_3 = np.zeros(len(train))
predictions_lgb_3 = np.zeros(len(test))
start = time.time()
num_round = 10000  # upper bound on boosting rounds; early stopping ends sooner

# Folds are stratified on the outlier flag so each one holds a similar share
# of outlier rows.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=4590)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['outliers'].values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_lgb_3[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)

    # Accumulate into the buffer initialised above; the original added to an
    # undefined `predictions` name, which raised NameError.
    predictions_lgb_3 += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

# Alias kept so the submission cell, which reads `predictions`, keeps working.
predictions = predictions_lgb_3

np.save('oof_lgb_3', oof_lgb_3)
np.save('predictions_lgb_3', predictions_lgb_3)
# RMSE of the out-of-fold predictions against the true target.
print("CV score: {:<8.5f}".format(mean_squared_error(target, oof_lgb_3)**0.5))

In [None]:
# Fill the sample submission's `target` column with the test predictions and
# write it out.
# NOTE(review): assumes the sample submission rows are in the same order as
# `test` — confirm.
sample_submission = pd.read_csv('../input/sample_submission.csv').assign(target=predictions)
sample_submission.to_csv('stacker.csv', index=False)
