In [40]:
import base64
import datetime
import time
import json
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import statsmodels as sm
from pylab import rcParams
from numpy import median
import matplotlib.dates as mdates

# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(1337)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling: larger seaborn fonts, 12x8 default figure size,
# white grid background and the "muted" colour palette.
sns.set(font_scale=1.5)
rcParams['figure.figsize'] = 12, 8
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

In [43]:
def _nan_on_error(compute):
    """Return ``compute()``, or NaN if it raises.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate (a bare ``except:`` would hide them too).
    """
    try:
        return compute()
    except Exception:
        return np.nan


def _trailing_run_length(values):
    """Count how many items at the end of ``values`` equal its last item."""
    run = 0
    for v in reversed(values):
        if v != values[-1]:
            break
        run += 1
    return run


def group_to_features(g, target, first_product_col=23, reference_year=2016):
    """Collapse one customer's measurement rows into a flat feature dict.

    The rows are sorted by ``measurement_date``; every column from position
    ``first_product_col`` onward is then blanked (set to NaN) on the
    chronologically last row before any feature is computed.
    NOTE(review): those trailing columns are presumably the per-product
    indicators and the last row the period being predicted -- confirm
    against the CSV schema.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single customer. Not modified; a sorted copy is used.
    target : object
        Stored unchanged under the ``'target'`` key.
    first_product_col : int, optional
        Positional index of the first trailing column (default 23, the
        value that was previously hard-coded).
    reference_year : int, optional
        Year used for ``age_at_first_contract`` (default 2016, the value
        that was previously hard-coded).

    Returns
    -------
    dict
        Scalar features. Date-derived features fall back to NaN when they
        cannot be computed. For every tracked column ``f`` (a fixed list
        plus the trailing columns) the dict holds ``f_last`` (last non-null
        value), ``f_changed`` (``'yes'``/``'no'``) and
        ``f_changed_measurements_ago`` (length of the trailing run of the
        last non-null value, NaN when the column never changed). Every
        trailing column ``p`` also gets ``p_sum``.
    """
    g = g.copy().sort_values('measurement_date')
    product_cols = list(g.columns[first_product_col:])

    # Blank the trailing columns on the last (most recent) row.
    # ``Series.where`` upcasts integer columns to float as needed, which
    # replaces ``DataFrame.set_value`` (removed in pandas 1.0).
    is_last_row = np.arange(len(g)) == len(g) - 1
    for p in product_cols:
        g[p] = g[p].where(~is_last_row)

    # Values reused by several features below; a failure here simply makes
    # every dependent feature NaN, as the per-feature try/except used to.
    last_measurement = _nan_on_error(lambda: g.measurement_date.max())
    first_contract = _nan_on_error(lambda: g.first_contract_date.max())

    d = {}

    d['customer_id'] = g.customer_id.max()
    d['target'] = target

    d['measurement_month_last'] = _nan_on_error(lambda: last_measurement.month)

    d['gender'] = g.gender.max()
    age = g.age.max()
    d['age'] = age

    d['first_contract_month'] = _nan_on_error(lambda: first_contract.month)
    d['age_at_first_contract'] = _nan_on_error(
        lambda: age - (reference_year - first_contract.year))
    d['diff_first_contract_month_and_last_month'] = _nan_on_error(
        lambda: (first_contract.month - last_measurement.month) % 12)
    # ``/`` is kept exactly as before: it floors to an int under Python 2
    # (which this notebook's ``print k`` statement implies) and yields a
    # float under Python 3.
    d['first_contract_months_ago'] = _nan_on_error(
        lambda: (last_measurement - first_contract).days / 30)

    tenure = g.customer_for_months.max()
    d['customer_for_months'] = tenure

    d['diff_first_customer_month_and_last_month'] = _nan_on_error(
        lambda: (last_measurement.month - tenure) % 12)
    d['diff_first_customer_month_and_first_contract_month'] = _nan_on_error(
        lambda: (first_contract.month - tenure) % 12)

    d['is_foreigner'] = g.is_foreigner.max()
    d['join_channel'] = g.join_channel.max()

    # Last non-null ``is_dead`` value, NaN when there is none.
    known_is_dead = g.is_dead.dropna().tolist()
    d['is_dead'] = known_is_dead[-1] if known_is_dead else np.nan

    # Last minus first row of household income (nulls are NOT dropped here).
    d['household_income_diff'] = _nan_on_error(
        lambda: g['household_income'].tolist()[-1] - g['household_income'].tolist()[0])

    lca_features = [
        'employment_index',
        'country_of_residence',
        'is_new_customer',
        'is_primary_customer',
        'customer_type',
        'customer_relation_type',
        'country_residence_bank_same',
        'is_spouse_of_an_employee',
        'province_code',
        'is_active_customer',
        'household_income',
        'customer_segment']

    for p in product_cols:
        lca_features.append(p)
        d[p + '_sum'] = g[p].sum()

    for f in lca_features:
        # Drop nulls once per column instead of once per loop iteration.
        observed = g[f].dropna().tolist()

        d[f + '_last'] = observed[-1] if observed else np.nan

        changed = len(set(observed)) > 1
        d[f + '_changed'] = 'yes' if changed else 'no'
        # The length of the trailing run of the last value equals the number
        # of measurements since the most recent change.
        d[f + '_changed_measurements_ago'] = (
            _trailing_run_length(observed) if changed else np.nan)

    return d

In [41]:
# Load the raw training extract (both date columns parsed as datetimes) and
# group its rows by customer.
# Grouping by the scalar column name rather than a one-element list keeps the
# group keys plain customer ids on every pandas version; with a list, newer
# pandas yields 1-tuples when iterating and expects a tuple in get_group().
dg = pd.read_csv(
    'tmp/raw_train_ind_viv_fin_ult1.csv',
    parse_dates=['measurement_date', 'first_contract_date'],
).groupby('customer_id')

In [42]:
# Print the key of every customer group. The parenthesised call prints the
# same text under Python 2 and Python 3 (``print k`` is Python 2 only).
for k, g in dg:
    print(k)

22225
45263
47386
49128
51089
58273
62001
72202
85015
86882
92988
101030
110092
114525
114534
116719
122718
125258
127156
135753
137923
157493
172361
182143
188618
199224
203032
211592
212806
215604
219222
233612
247509
250709
255674
261331
271897
277257
277945
283256
285221
286596
286725
287808
309386
312953
313750
316206
344758
345919
365101
386530
390129
395013
411469
416006
417379
418412
438707
448790
458204
462421
473873
476088
477198
490688
491165
511225
516799
522476
526809
546100
548296
571148
582638
584109
588885
604163
608668
614659
614763
622143
625279
629025
633379
644068
649223
653807
660345
664591
664664
669366
676467
677964
685808
690911
695702
708162
712616
716319
728688
733473
734055
734874
734932
757179
762318
780763
784320
784774
800104
819543
820535
831947
887040
887182
887394
888829
890419
910826
940826
952084
959075
967629
976556
997002
1008028
1011980
1016567
1022088
1022377
1027729
1045975
1055009
1061276
1065405
1073234
1073442
1074471
1077485
1080892
1084674
1

In [45]:
# Pull out all rows of one customer and display them transposed
# (one column per measurement row) for inspection.
g = dg.get_group(1396933)
g.T

Unnamed: 0,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035
customer_id,1396933,1396933,1396933,1396933,1396933,1396933,1396933,1396933,1396933,1396933,1396933,1396933,1396933
measurement_date,2015-06-28 00:00:00,2015-07-28 00:00:00,2015-08-28 00:00:00,2015-09-28 00:00:00,2015-10-28 00:00:00,2015-11-28 00:00:00,2015-12-28 00:00:00,2016-01-28 00:00:00,2016-02-28 00:00:00,2016-03-28 00:00:00,2016-04-28 00:00:00,2016-05-28 00:00:00,2016-06-28 00:00:00
employment_index,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee,not-employee
country_of_residence,ES,ES,ES,ES,ES,ES,ES,ES,ES,ES,ES,ES,ES
gender,female,female,female,female,female,female,female,female,female,female,female,female,female
age,25,25,25,25,25,25,25,25,25,26,26,26,26
first_contract_date,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00,2015-05-13 00:00:00
is_new_customer,yes,yes,yes,yes,yes,no,no,no,no,no,no,no,no
customer_for_months,2,2,3,4,5,6,7,8,9,10,11,12,13
is_primary_customer,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes


In [50]:
# Try the feature builder on only the first row of this customer's group,
# passing NaN as the target.
group_to_features(g.head(1), np.nan)

{'age': 25.0,
 'age_at_first_contract': 24.0,
 'country_of_residence_changed': 'no',
 'country_of_residence_changed_measurements_ago': nan,
 'country_of_residence_last': 'ES',
 'country_residence_bank_same_changed': 'no',
 'country_residence_bank_same_changed_measurements_ago': nan,
 'country_residence_bank_same_last': 'yes',
 'customer_for_months': 2.0,
 'customer_id': 1396933,
 'customer_relation_type_changed': 'no',
 'customer_relation_type_changed_measurements_ago': nan,
 'customer_relation_type_last': 'active',
 'customer_segment_changed': 'no',
 'customer_segment_changed_measurements_ago': nan,
 'customer_segment_last': 'individual',
 'customer_type_changed': 'no',
 'customer_type_changed_measurements_ago': nan,
 'customer_type_last': 'primary',
 'diff_first_contract_month_and_last_month': 11,
 'diff_first_customer_month_and_first_contract_month': 3.0,
 'diff_first_customer_month_and_last_month': 4.0,
 'employment_index_changed': 'no',
 'employment_index_changed_measurements_ago'

In [47]:
# Read the aggregated feature file and display it transposed.
# NOTE(review): the recorded output of this cell is an IOError (file does not
# exist). The raw extract is read from 'tmp/' earlier in the notebook --
# confirm whether this path needs the same prefix or the file must be
# generated first.
pd.read_csv('agg_train_ind_viv_fin_ult1.csv').T

IOError: File agg_train_ind_viv_fin_ult1.csv does not exist