In [None]:
import pathlib
import time, datetime

import numpy as np
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
%matplotlib inline

In [None]:
def display_columns_description(dataframe):
    """Print the ``describe()`` summary of each column of ``dataframe``, one column at a time.

    Args:
        dataframe: pandas DataFrame whose columns are summarised.

    Returns:
        None. The summaries are written to stdout.
    """
    names = dataframe.columns.tolist()
    for name in names:
        summary = dataframe[name].describe()
        print("\n", name, " description:\n", summary)

In [None]:
def display_columns_info(dataframe):
    """Print the ``info()`` report of each column of ``dataframe``, one column at a time.

    Args:
        dataframe: pandas DataFrame whose columns are reported.

    Returns:
        None. The reports are written to stdout.
    """
    for column_name in dataframe.columns.tolist():
        print("\n", column_name, " info:")
        # info() writes its report itself and returns None, so it must not be
        # passed to print(). A one-column frame is used because DataFrame.info()
        # is available on every pandas version, unlike Series.info().
        dataframe[[column_name]].info()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32 (falling back to float64). The bounds are inclusive,
    so a column whose extremes sit exactly on a dtype limit still gets that dtype.
    Columns whose dtype is not listed in ``numerics`` are left untouched.

    Args:
        df: pandas DataFrame; its columns are reassigned in place.
        verbose: when true, print the resulting memory usage and the reduction.

    Returns:
        The same DataFrame object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    # NOTE: float16 keeps only about three significant decimal digits, so the
    # float downcast is lossy for values that need more precision.
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # NaN-only or out-of-range columns fail every comparison above.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
def binarize(df, columns=('authorized_flag', 'category_1')):
    """Replace 'Y'/'N' flags with 1/0 in the given columns of ``df``.

    Values that are already 1 or 0 are kept, so re-running the calling cell on
    a converted frame does not turn the columns into NaN. Any other value maps
    to NaN.

    Args:
        df: pandas DataFrame; the listed columns are reassigned in place.
        columns: names of the flag columns to convert.

    Returns:
        The same DataFrame object, after conversion.
    """
    flag_to_int = {'Y': 1, 'N': 0, 1: 1, 0: 0}
    for col in columns:
        df[col] = df[col].map(flag_to_int)
    return df

In [None]:
def convert_time(pandas_time):
    """Return the whole number of seconds between 00:00:00 and ``pandas_time``.

    Args:
        pandas_time: object exposing ``hour``, ``minute`` and ``second``
            attributes (for example a ``datetime.time``).

    Returns:
        int: ``hour`` * 3600 + ``minute`` * 60 + ``second``.
    """
    elapsed = datetime.timedelta(
        seconds=pandas_time.second,
        minutes=pandas_time.minute,
        hours=pandas_time.hour,
    )
    whole_seconds = int(elapsed.total_seconds())
    return whole_seconds

In [None]:
# Listing of the current directory, left disabled.
#!ls

In [None]:
# Show which files are present in the input directory.
!ls ../input/

In [None]:
# Read the historical transactions CSV into a DataFrame.
historical_transactions_df = pd.read_csv('../input/historical_transactions.csv')

In [None]:
# Report of the frame as loaded, before any downcasting.
historical_transactions_df.info()

In [None]:
# Downcast numeric columns; reduce_mem_usage changes the frame and returns it.
historical_transactions_df = reduce_mem_usage(historical_transactions_df)

In [None]:
# Same report after downcasting, for comparison with the one above.
historical_transactions_df.info()

In [None]:
# Read the merchants CSV.
merchants_df = pd.read_csv('../input/merchants.csv')

In [None]:
merchants_df = reduce_mem_usage(merchants_df)

In [None]:
# Read the new-merchant transactions CSV.
new_merchant_transactions_df = pd.read_csv('../input/new_merchant_transactions.csv')

In [None]:
new_merchant_transactions_df = reduce_mem_usage(new_merchant_transactions_df)

In [None]:
# Read the sample submission CSV (the variable name carries a typo: "sumbmission").
sample_sumbmission_df = pd.read_csv('../input/sample_submission.csv')

In [None]:
sample_sumbmission_df = reduce_mem_usage(sample_sumbmission_df)

In [None]:
# Read the training CSV.
train_df = pd.read_csv('../input/train.csv')

In [None]:
train_df = reduce_mem_usage(train_df)

In [None]:
# Read the test CSV.
test_df = pd.read_csv('../input/test.csv')

In [None]:
test_df = reduce_mem_usage(test_df)

In [None]:
# First look at the historical transactions frame.
historical_transactions_df.info()

In [None]:
#print("historical_transactions_df.card_id.describe()", historical_transactions_df.card_id.describe())
#print()
#print(historical_transactions_df.city_id.describe())
# Per-column describe() output, printed one column at a time.
display_columns_description(historical_transactions_df)

In [None]:
# describe() of the whole frame in a single table.
historical_transactions_df.describe()

In [None]:
# Per-column info() report, currently disabled.
#display_columns_info(historical_transactions_df)

In [None]:
historical_transactions_df.head()

In [None]:
# Same three views (info / describe / head) for the new-merchant transactions.
new_merchant_transactions_df.info()

In [None]:
new_merchant_transactions_df.describe()

In [None]:
new_merchant_transactions_df.head()

In [None]:
# Same three views for the merchants table.
merchants_df.info()

In [None]:
merchants_df.describe()

In [None]:
merchants_df.head()

In [None]:
# Same three views for the training table.
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.head()

In [None]:
# Group the training rows by their first_active_month value.
train_data_brouped_by_first_active_month = train_df.groupby('first_active_month')

In [None]:
# Non-null count of every remaining column per first_active_month group.
count_train_data_by_first_active_month = train_data_brouped_by_first_active_month.count()

In [None]:
count_train_data_by_first_active_month.shape

In [None]:
count_train_data_by_first_active_month.head()

In [None]:
# Order the groups by their card_id count (ascending, the sort_values default).
count_train_data_by_first_active_month = count_train_data_by_first_active_month.sort_values(by='card_id')

In [None]:
# With the ascending sort above, tail() shows the groups with the largest counts.
count_train_data_by_first_active_month.tail()

In [None]:
count_train_data_by_first_active_month

In [None]:
# The index holds the group keys, i.e. the first_active_month values.
count_train_data_by_first_active_month.index

In [None]:
# Preview of the index converted to datetimes; the result is only displayed here.
pd.to_datetime(count_train_data_by_first_active_month.index)

In [None]:
# Store the converted index as a regular column so it can be sorted and plotted.
count_train_data_by_first_active_month['first_active_month_as_datetime'] = pd.to_datetime(count_train_data_by_first_active_month.index)

In [None]:
count_train_data_by_first_active_month

In [None]:
# Chronological order, used by the plot that follows.
sorted_by_datetime_count_train_data = count_train_data_by_first_active_month.sort_values(by='first_active_month_as_datetime')

In [None]:
years = mdates.YearLocator()
months = mdates.MonthLocator()
years_fmt = mdates.DateFormatter('%Y')

In [None]:
fig, ax = plt.subplots(figsize=(16, 16))
ax.plot(
    sorted_by_datetime_count_train_data['first_active_month_as_datetime'],
    sorted_by_datetime_count_train_data['card_id']
)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(years_fmt)
ax.xaxis.set_minor_locator(months)
datemin = np.datetime64(sorted_by_datetime_count_train_data['first_active_month_as_datetime'][0], 'Y')
datemax = np.datetime64(sorted_by_datetime_count_train_data['first_active_month_as_datetime'][-1], 'Y') + np.timedelta64(1, 'Y')
ax.set_xlim(datemin, datemax)
ax.set_title("Cards activation count by date")
fig.autofmt_xdate()

In [None]:
train_features_and_target = train_df[['feature_1', 'feature_2', 'feature_3', 'target']]

In [None]:
plt.figure(figsize=(24, 24))
heatmap_fig = sns.heatmap(
    train_features_and_target.corr().round(2),
    annot=True, annot_kws={"size":24},
    cbar=False
)
for item in heatmap_fig.get_xticklabels():
    item.set_fontsize(24)
for item in heatmap_fig.get_yticklabels():
    item.set_fontsize(24)
for item in heatmap_fig.get_label():
    item.set_fontsize(24)

In [None]:
# Line plot of the selected columns against the row index.
train_features_and_target.plot()

In [None]:
# Scatter matrix of the same columns, drawn into a 16x16 figure.
figure = plt.figure(figsize=(16, 16))
axes = figure.add_subplot(111)
axes_scatter_matrix = pd.plotting.scatter_matrix(train_features_and_target, ax=axes, marker='H', s=25)
# The comprehensions below are used only for their side effect: setting the
# font size of tick labels and axis labels on every subplot to 12.
[plt.setp(item.yaxis.get_majorticklabels(), 'size', 12) for item in axes_scatter_matrix.ravel()]
[plt.setp(item.xaxis.get_majorticklabels(), 'size', 12) for item in axes_scatter_matrix.ravel()]
[plt.setp(item.yaxis.get_label(), 'size', 12) for item in axes_scatter_matrix.ravel()]
[plt.setp(item.xaxis.get_label(), 'size', 12) for item in axes_scatter_matrix.ravel()]

In [None]:
# Another look at the historical transactions before deriving date columns.
historical_transactions_df.describe()

In [None]:
historical_transactions_df.info()

In [None]:
historical_transactions_df.head()

In [None]:
# The cells below are disabled experiments: casting authorized_flag to str and
# checking the Python type of each value in the first row.
# historical_transactions_df['authorized_flag'].astype('str')

In [None]:
#historical_transactions_df.loc[:, 'authorized_flag'] = historical_transactions_df['authorized_flag'].astype(str)

In [None]:
#historical_transactions_df.info()

In [None]:
#authorized_flag = historical_transactions_df['authorized_flag'].astype(str)

In [None]:
#authorized_flag.dtypes

In [None]:
#historical_transactions_df_row_0 = historical_transactions_df.iloc[0, :]

In [None]:
#type(historical_transactions_df_row_0)

In [None]:
#for item in historical_transactions_df_row_0:
#    print(type(item))

Сгруппировать также и по времени без даты (т.е. узнать, в какое время суток чаще всего происходит purchase).
Сгруппировать также и по purchase amount, т.е. исследовать, как и что зависит от размера purchase.

In [None]:
historical_transactions_df['purchase_date_as_date'] = pd.to_datetime(historical_transactions_df['purchase_date']).dt.date

In [None]:
historical_transactions_df['purchase_time_as_time'] = pd.to_datetime(historical_transactions_df['purchase_date']).dt.time

In [None]:
historical_transactions_df['purchase_date_as_datetime'] = pd.to_datetime(historical_transactions_df['purchase_date'])

In [None]:
historical_transactions_df.head()

In [None]:
historical_transactions_df['purchase_datetime_as_seconds'] = pd.to_timedelta(
    historical_transactions_df['purchase_date_as_datetime']
).dt.total_seconds()

In [None]:
historical_transactions_df['purchase_date_as_seconds'] = pd.to_timedelta(
    pd.to_datetime(historical_transactions_df['purchase_date_as_date'])
).dt.total_seconds()

In [None]:
#historical_transactions_df['purchase_time_as_seconds'] = pd.to_timedelta(
#    pd.to_datetime(historical_transactions_df['purchase_time_as_time'])
#).dt.total_seconds()
#historical_transactions_df['purchase_time_as_seconds'] = pd.to_timedelta(
#    historical_transactions_df['purchase_time_as_time']
#).dt.total_seconds()
historical_transactions_df['purchase_time_as_seconds'] = historical_transactions_df['purchase_time_as_time'].apply(convert_time)
#purchase_time_as_time = historical_transactions_df['purchase_time_as_time']
#purchase_time_as_time_0 = purchase_time_as_time.iloc[0]
#purchase_time_as_time_0.

In [None]:
historical_transactions_df.head()

In [None]:
#purchase_time_as_seconds = purchase_time_as_time.apply(convert_time)

In [None]:
#purchase_time_as_time.head()

In [None]:
# Group the transactions by calendar day of purchase.
historical_transactions_df_grouped_by_purchase_date = historical_transactions_df.groupby('purchase_date_as_date')

In [None]:
# Non-null count of every remaining column per day.
count_historical_transactions_grouped_by_purchase_date = historical_transactions_df_grouped_by_purchase_date.count()

In [None]:
count_historical_transactions_grouped_by_purchase_date.head()

In [None]:
count_historical_transactions_grouped_by_purchase_date.tail()

In [None]:
# purchase_date_as_date is the group key here, i.e. the name of the index.
sorted_by_date_historical_transactions = count_historical_transactions_grouped_by_purchase_date.sort_values(by='purchase_date_as_date')

In [None]:
sorted_by_date_historical_transactions.head()

In [None]:
# Daily card_id count over time, with the same year/month tick helpers
# (years, months, years_fmt) that were defined for the activation plot.
fig, ax = plt.subplots(figsize=(16, 16))
ax.plot(
    sorted_by_date_historical_transactions.index,
    sorted_by_date_historical_transactions['card_id']
)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(years_fmt)
ax.xaxis.set_minor_locator(months)
# x-limits: from the year of the first index entry to one year past the year of the last.
datemin = np.datetime64(sorted_by_date_historical_transactions.index[0], 'Y')
datemax = np.datetime64(sorted_by_date_historical_transactions.index[-1], 'Y') + np.timedelta64(1, 'Y')
ax.set_xlim(datemin, datemax)
ax.set_title("Purchases by date")
fig.autofmt_xdate()

In [None]:
# Group the transactions by purchase_amount; each distinct amount becomes one group.
historical_transactions_df_grouped_by_purchase_amount = historical_transactions_df.groupby('purchase_amount')

In [None]:
# Non-null count of every remaining column per distinct amount.
count_historical_transactions_grouped_by_purchase_amount = historical_transactions_df_grouped_by_purchase_amount.count()

In [None]:
count_historical_transactions_grouped_by_purchase_amount.head()

In [None]:
# IPython help lookup for DataFrame.hist (interactive aid, not part of the analysis).
count_historical_transactions_grouped_by_purchase_amount.hist?

In [None]:
count_historical_transactions_grouped_by_purchase_amount.info()

In [None]:
# Number of distinct card_id values in the historical transactions.
historical_transactions_df.card_id.unique().shape

In [None]:
# Histogram of the distinct purchase_amount values (the group keys) that are
# at most 50. Each distinct amount is counted once, whatever its frequency.
fig, ax = plt.subplots(figsize=(16, 16))
ax.hist(
    count_historical_transactions_grouped_by_purchase_amount.index[
        count_historical_transactions_grouped_by_purchase_amount.index <= 50],
    bins=20
)
#ax.set_xlim(0, 400000)

In [None]:
# Transactions ordered by purchase_amount (ascending, the sort_values default).
historical_transactions_df_sorted_by_purchase_amount = historical_transactions_df.sort_values(by='purchase_amount')

In [None]:
# Disabled: subset of the sorted frame with purchase_amount below 200000.
#historical_transactions_df_sorted_by_purchase_amount_less_then_200000 = historical_transactions_df_sorted_by_purchase_amount[
    #historical_transactions_df_sorted_by_purchase_amount['purchase_amount'] < 200000]

In [None]:
# Disabled: histogram of that subset.
#fig, ax = plt.subplots(figsize=(16, 16))
#ax.hist(
#    historical_transactions_df_sorted_by_purchase_amount_less_then_200000,
#    bins=30
#)
#ax.set_xlim(0, 400000)

In [None]:
fig, ax = plt.subplots(figsize=(16, 16))
# The triple-quoted string below is a bare expression holding an earlier,
# disabled variant of the histogram; it is evaluated and discarded.
'''
ax.hist(
    count_historical_transactions_grouped_by_purchase_amount.index[
        ((count_historical_transactions_grouped_by_purchase_amount.index) > 50 and (count_historical_transactions_grouped_by_purchase_amount.index < 1000))],
    bins=20
)
'''
# Histogram of the distinct purchase_amount values below 30, with a log-scaled count axis.
ax.hist(
    count_historical_transactions_grouped_by_purchase_amount.index[
        count_historical_transactions_grouped_by_purchase_amount.index < 30],
    bins=20,
    log=True
)

In [None]:
# Largest and smallest distinct purchase_amount values.
count_historical_transactions_grouped_by_purchase_amount.index.max()

In [None]:
count_historical_transactions_grouped_by_purchase_amount.index.min()

In [None]:
sorted_purchase_amount = count_historical_transactions_grouped_by_purchase_amount.index.sort_values()

In [None]:
sorted_purchase_amount

In [None]:
# Group the transactions by card (the variable name says "cart", the key is card_id).
historical_transactions_df_grouped_by_cart_id = historical_transactions_df.groupby('card_id')

In [None]:
type(historical_transactions_df_grouped_by_cart_id )

In [None]:
# Two-column view used for the per-card aggregates below.
purchase_amount_historical_transaction = historical_transactions_df[['card_id', 'purchase_amount']]

In [None]:
purchase_amount_historical_transaction

In [None]:
purchase_amount_historical_transaction_grouped_by_card_id = purchase_amount_historical_transaction.groupby('card_id')

In [None]:
# Sum of purchase_amount per card.
sum_purchase_amount_historical_transaction_grouped_by_card_id = purchase_amount_historical_transaction_grouped_by_card_id.sum()

In [None]:
# Non-null count of purchase_amount per card.
count_purchase_amount_historical_transaction_grouped_by_card_id = purchase_amount_historical_transaction_grouped_by_card_id.count()

In [None]:
# Join both aggregates on card_id; the suffixes turn the two purchase_amount
# columns into purchase_amount_count and purchase_amount_sum.
card_id_purchase_amount_stats = pd.merge(
    count_purchase_amount_historical_transaction_grouped_by_card_id,
    sum_purchase_amount_historical_transaction_grouped_by_card_id,
    on='card_id',
    suffixes=('_count', '_sum')
)

In [None]:
card_id_purchase_amount_stats.head()

In [None]:
card_id_purchase_amount_stats.tail()

In [None]:
# Distinct per-card transaction counts, and how many distinct counts there are.
print(card_id_purchase_amount_stats['purchase_amount_count'].unique())
print(len(card_id_purchase_amount_stats['purchase_amount_count'].unique()))

In [None]:
# IPython help lookup for DataFrame.hist (interactive aid, not part of the analysis).
card_id_purchase_amount_stats.hist?

In [None]:
# scatter_matrix is provided by pandas.plotting (the same entry point used for
# the feature/target scatter matrix); the top-level pd.scatter_matrix alias
# is not available in current pandas releases.
pd.plotting.scatter_matrix(card_id_purchase_amount_stats, figsize=(24, 24))

Feature engineering
Попробовать линейные комбинации признаков, как "родных", так и сгенерированных в kernels, которые лежат в директории code. Сделать это посредством скриптов, а не наобум. Попробовать нелинейные комбинации?
Также исследовать всё "вручную": поискать ещё какие-либо закономерности, линейные и нелинейные, попробовать применить статистику, RandomForest и т.д. Использовать средние арифметические, средние геометрические, средние гармонические?
Кластеризация? Почитать различные handbooks с "рецептами" по feature engineering.

In [None]:
# Current state of both tables before merging.
historical_transactions_df.info()

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
#agg_func = {'purchase_amount': ['sum']}
historical_transactions_df.head()

In [None]:
# Convert the 'Y'/'N' flag columns handled by binarize to 1/0.
historical_transactions_df = binarize(historical_transactions_df)

In [None]:
historical_transactions_df.head()

In [None]:
# Drop the raw purchase_date column; the derived date/time columns stay.
#short_historical_transactions_df = historical_transactions_df[historical_transactions_df.columns.drop(['merchant_id', 'purchase_date'])]
short_historical_transactions_df = historical_transactions_df[historical_transactions_df.columns.drop(['purchase_date'])]

In [None]:
short_historical_transactions_df.head()

In [None]:
# Distinct card ids in the transactions and in the training table, for comparison.
uniques_sht_card_id = short_historical_transactions_df['card_id'].unique()

In [None]:
uniques_sht_card_id.shape

In [None]:
uniques_train_card_id = train_df['card_id'].unique()

In [None]:
uniques_train_card_id.shape

In [None]:
train_df.shape

In [None]:
short_historical_transactions_df.shape

In [None]:
# Inner join (the pd.merge default): only card ids present in both tables.
merged_short_historical_and_train_df = pd.merge(short_historical_transactions_df, train_df, on='card_id')

In [None]:
merged_short_historical_and_train_df.head()

In [None]:
merged_short_historical_and_train_df.shape

In [None]:
# Left join: every transaction row is kept, with or without a training row.
left_merged_short_historical_and_train_df = pd.merge(short_historical_transactions_df, train_df, on='card_id', how='left')

In [None]:
left_merged_short_historical_and_train_df.head()

In [None]:
left_merged_short_historical_and_train_df.shape

In [None]:
# Right join: every training card is kept, with or without transactions.
right_merged_short_historical_and_train_df = pd.merge(short_historical_transactions_df, train_df, on='card_id', how='right')

In [None]:
right_merged_short_historical_and_train_df.head()

In [None]:
right_merged_short_historical_and_train_df.shape

In [None]:
right_merged_short_historical_and_train_df['first_active_month_as_datetime'] = pd.to_datetime(right_merged_short_historical_and_train_df['first_active_month'])

In [None]:
right_merged_short_historical_and_train_df['first_active_month_as_seconds'] = pd.to_timedelta(
    right_merged_short_historical_and_train_df['first_active_month_as_date']
).total_seconds()

In [None]:
# Disabled experiments: converting purchase_date_as_date to seconds step by step.
#purchase_date_as_date = right_merged_short_historical_and_train_df['purchase_date_as_date']

In [None]:
#timedeltas = pd.to_timedelta(pd.to_datetime(purchase_date_as_date))

In [None]:
#timedeltas.head()

In [None]:
#totalseconds = timedeltas.dt.total_seconds()

In [None]:
#totalseconds

In [None]:
# Drop the original first_active_month column from the merged frame.
right_merged_short_historical_and_train_df = right_merged_short_historical_and_train_df[right_merged_short_historical_and_train_df.columns.drop('first_active_month')]

In [None]:
right_merged_short_historical_and_train_df.head()

In [None]:
nans_in_merged = right_merged_short_historical_and_train_df[right_merged_short_historical_and_train_df['target'] == np.NAN]

In [None]:
nans_in_merged.shape

In [None]:
targets = right_merged_short_historical_and_train_df['target'].values

In [None]:
features = right_merged_short_historical_and_train_df[right_merged_short_historical_and_train_df.columns.drop(['target'])].values

In [None]:
feature_extraction_test = SelectKBest(score_func=chi2, k=4)

In [None]:
extracted_features = feature_extraction_test.fit(features, targets)