In [14]:
import pandas as pd
import time
from collections import defaultdict, Counter
from itertools import product
import numpy as np

In [2]:
g = pd.read_csv("data/attendances_over_time.csv.gz", encoding='latin-1', 
                header=None, names=["cust_id","age_gr","mtype_sec", "n_attend"])

In [3]:
g.head()

Unnamed: 0,cust_id,age_gr,mtype_sec,n_attend
0,11168,50-54,RUNION,1
1,11171,50-54,RUNION,1
2,11179,60-64,FOLK,1
3,11179,60-64,AFL,5
4,11181,55-59,RUNION,24


In [4]:
g_purch_byage = g.groupby(["cust_id", "age_gr"])["mtype_sec"].apply(list)  # obtain a Series wit Multi Index

In [5]:
g_age_gr_each_customer = g.groupby(["cust_id"])["age_gr"].apply(set).apply(list).apply(sorted)  # note: sorting is done by groupby

In [6]:
cust_longterm = g_age_gr_each_customer.loc[g_age_gr_each_customer.apply(lambda x: True if len(x) > 1 else False)].index.values
cust_shortterm = g_age_gr_each_customer.loc[g_age_gr_each_customer.apply(lambda x: True if len(x) == 1 else False)].index.values

In [7]:
print("longterm customers: {}, shortterm cutomers: {}".format(len(cust_longterm), len(cust_shortterm)))

longterm customers: 1442937, shortterm cutomers: 4494603


Create feature for the shortterm customers. These will be plain binary features showing whother a customer purchesed a particular mtype.

In [8]:
feat_sht_customers = pd.get_dummies(g.loc[g["cust_id"].isin(cust_shortterm), ["cust_id", "mtype_sec","age_gr"]], columns=["mtype_sec"]).groupby(["cust_id", "age_gr"]).sum()
feat_sht_customers.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mtype_sec_5060S,mtype_sec_70SPOP,mtype_sec_70SROCK,mtype_sec_80SPOP,mtype_sec_80SROCK,mtype_sec_90SPOP,mtype_sec_90SROCK,mtype_sec_AFL,mtype_sec_ALTERN,mtype_sec_AWARDS,...,mtype_sec_RNB,mtype_sec_ROCK,mtype_sec_RUNION,mtype_sec_SCHOOL,mtype_sec_SENIOR,mtype_sec_SFCOMEDY,mtype_sec_SFMUSIC,mtype_sec_SOCCER,mtype_sec_STANDUPC,mtype_sec_TENNIS
cust_id,age_gr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
11168,50-54,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11171,50-54,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11179,60-64,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11181,55-59,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11197,50-54,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [9]:
feat_sht_customers.shape

(4494603, 45)

In [10]:
lt_customers = g.loc[g["cust_id"].isin(cust_longterm),:]
lt_customers.head()               

Unnamed: 0,cust_id,age_gr,mtype_sec,n_attend
5,11196,35-39,AFL,1
6,11196,30-34,ALTERN,1
29,11391,65-69,TENNIS,4
30,11391,60-64,TENNIS,8
31,11392,60-64,CRICKET,1


In [11]:
f = defaultdict(lambda: defaultdict(int))  # {cust_id: {"AFL->BALLET": 1, "TENNIS->COMEDY": 1}}

In [12]:
g_age_gr_each_customer_longterm = g_age_gr_each_customer.loc[g_age_gr_each_customer.index.isin(cust_longterm)]
g_age_gr_each_customer_longterm.head()

cust_id
11196    [30-34, 35-39]
11391    [60-64, 65-69]
11392    [55-59, 60-64]
11485    [45-49, 50-54]
11581    [50-54, 55-59]
Name: age_gr, dtype: object

In [15]:
from multiprocessing import Pool

def xf_subdf(d0):
    """
    here d0 is a DATA FRAME of the form
    cust_id
    11196    [30-34, 35-39]
    11391    [60-64, 65-69]
    """

    f = defaultdict(lambda: defaultdict(int))
    
    for r in d0.itertuples():
        # make "transition" feature
        f[r.cust_id] = {"->".join(tp): 1 for tp in product(*[g_purch_byage[(r.cust_id, ag)] 
                                                        for ag in d0[row.cust_id]])}
        # add age group 
        f[r.cust_id].update({"age_gr": g_age_gr_each_customer[r.cust_id][-1]})
    
    # create a data frame from dict and return it
    return pd.DataFrame.from_dict(f, orient='index').fillna(0)

t0 = time.time()

def xf_df(d1):
    
    df_split = np.array_split(d1, 10)
    
    pool = Pool(2)
    
    df = pd.concat(pool.map(xf_subdf, df_split))
    
    pool.close()
    pool.join()
    
    return df

f1 = xf_df(g_age_gr_each_customer_longterm.reset_index())

print("elapsed time: {:.0f} m {:.0f} s".format(*divmod(time.time()-t0, 60)))

NameError: name 'itertuples' is not defined

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder

In [None]:
X_train, X_test, y_train, y_test = train_test_split(d1.drop("age_gr", axis=1), d1["age_gr"], 
                                                    test_size=0.3, random_state=94, stratify=d1["age_gr"])

In [None]:
print("created training data - features {} and target {}".format(X_train.shape, y_train.shape))

In [None]:
gnb = BernoulliNB()
gnb.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

print("accuracy: {:.3f}".format(accuracy_score(y_test, svc.predict(X_test))))
scores = cross_val_score(svc, X_train, y_train, cv=5)


#print("accuracy: {:.3f}".format(accuracy_score(y_test, gnb.predict(X_test))))
# scores = cross_val_score(gnb, X_train, y_train, cv=5)
print("cross-validation scores: {}, {}, {}, {}, {}".format(*scores))