In [284]:
import numpy as np
import pandas as pd
from scipy import sparse
import re
import xgboost ; import lightgbm as lgb
from sklearn.metrics import roc_curve,auc

In [179]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
song = pd.read_csv('songs.csv')
members =  pd.read_csv('members.csv')

In [180]:
train = train.merge(song,on= 'song_id',how = 'left')
test = test.merge(song,on = 'song_id',how = 'left')

In [181]:
#user_bias, user_freq, song_bias, song_freq
tuf = train.groupby('msno')['target'].agg(['sum','count']) 
tuf = tuf.reset_index()
tuf.columns.values[1:3] = ['user_sum','user_count']
tuf['user_freq'] = (tuf.user_sum+1)/(tuf.user_count+2) #Laplacian smoothing
train = train.merge(tuf,on = 'msno',how = 'left')
train.loc[train.user_count == 1,'user_sum'] = 0 
train.loc[train.user_count == 1,'user_freq'] = 0.5 
train['is_new_user'] = train.user_count == 1

#prepare test's user bias
test = test.merge(tuf,on = 'msno',how = 'left')
test['is_new_user'] = pd.isnull(test.user_count)
test.loc[pd.isnull(test.user_count),'user_count'] = 1
test.loc[pd.isnull(test.user_sum),'user_sum'] = 0
test.loc[pd.isnull(test.user_freq),'user_freq'] = 0.5

#prepare songs' bias
tuf = train.groupby('song_id')['target'].agg(['sum','count']) #Laplacian smoothing
tuf = tuf.reset_index()
tuf.columns.values[1:3] = ['song_sum','song_count']
tuf['song_freq'] = (tuf.song_sum+1)/(tuf.song_count+2)
train = train.merge(tuf,on = 'song_id',how = 'left')
train.loc[train.song_count == 1,'song_sum'] = 0
train.loc[train.song_count == 1,'song_freq'] = 0.5 
train['is_new_song'] = train.song_count == 1

#prepare test's songs' bias
test = test.merge(tuf,on = 'song_id',how = 'left')
test['is_new_song'] = pd.isnull(test.song_count)
test.loc[pd.isnull(test.song_count),'song_count'] = 1
test.loc[pd.isnull(test.song_freq),'song_freq'] = 0.5
test.loc[pd.isnull(test.song_sum),'song_sum'] = 0

Artist Preparation: 
1. find the intersection of artist between the train and test
2. treat the compound artist seperately from single artist
3. if a single artist from test set hasn't appeared before, we can do nothing, however, if a compound artist has appeared, we can possibly estimate it from the single artist's contribution
4. add a dummy variable artist_estimated to show

In [182]:
def split(x):
    l = re.split("[+|&/]",x)
    l = [i.strip() for i in list(l)]
    return l

In [183]:
train.artist_name = train.artist_name.fillna("no_rec")
test.artist_name = test.artist_name.fillna("no_rec")

In [39]:
train_art = set()
for name in train.artist_name.unique():
    train_art.add(name)
    for sub in split(name):
        train_art.add(sub.strip())

In [40]:
test_art_diff = set()
for name in test.artist_name.unique():
    indic = [sub.strip() in train_art for sub in split(name)]
    if not any(indic):
        test_art_diff.add(name)
#19%的artist从来没有出现过

In [41]:
tt_csame = set()
for name in test.artist_name.unique():
     if len(split(name)) > 1 and name in train_art:
            tt_csame.add(name)

In [42]:
tt_single = set()
for name in train.artist_name.unique():
    if len(split(name)) == 1:
        tt_single.add(name)

In [43]:
tt_cdump = set()
for name in train.artist_name.unique():
    if len(split(name)) > 1 and not name in tt_csame:
        tt_cdump.add(name)

In [44]:
artdict = dict(zip(list(train_art),range(len(train_art))))
userdict = dict(zip(train.msno.unique(),range(train.msno.nunique())))
v = train.groupby(['msno','artist_name'])['target'].agg(['sum','count'])
v = v.reset_index()
v.columns.values[2:] = ['uasum','uacount']
uamat = sparse.lil_matrix((len(userdict),2*len(artdict)))
v['mind'] = v.msno.apply(lambda x: userdict[x])
v['amind'] = v.artist_name.apply(lambda x: artdict[x])
for i in v.index:
    uamat[v.mind[i],2*v.amind[i]] = v.uasum[i]
    uamat[v.mind[i],2*v.amind[i]+1] = v.uacount[i]
disdict = {}
for key, ind in artdict.items():
    l = [artdict[sub] for sub in split(key) if not sub in tt_single]
    if len(l) > 0:
        disdict[key] = (l,ind)
uamat = sparse.csc_matrix(uamat)
for key, ind in disdict.items():
    for sub in ind[0]:
        uamat[:,2*sub] += uamat[:,2*ind[1]]
        uamat[:,2*sub+1] += uamat[:,2*ind[1]+1]



In [46]:
#uamat.data = uamat.data.astype('int64')
#np.savez("uamat.csv",data = uamat.data,indices=uamat.indices,indptr =uamat.indptr, shape=uamat.shape)


array([  212,   319,   335, ..., 27897, 27966, 30423], dtype=int32)

In [184]:
#prepare train artist feature
oart = train.groupby('artist_name')['target'].agg(['sum','count'])
oart = oart.reset_index()
oart.columns.values[1:3] = ['art_sum','art_count']
oart['art_freq'] = (oart.art_sum+1)/(oart.art_count+2)
train = train.merge(oart,on='artist_name',how = 'left')
train.loc[train.art_count == 1,'art_sum'] = 0
train.loc[train.art_count == 1,'art_freq'] = 0.5
train['is_new_art'] = train.art_count == 1
traindump = train[train.artist_name.isin(list(tt_cdump))]
train['is_art_dump'] = 0
uasum = uamat.sum(axis = 0)
for name,group in traindump.groupby("artist_name"):
    subind = np.array([artdict[sub.strip()] for sub in split(name)])
    gasum = uasum[0,2*subind].sum()/len(subind)
    gacount = uasum[0,2*subind+1].sum()/len(subind)
    for ind in group.index:
        train.set_value(ind,'art_sum',gasum)
        train.set_value(ind,'art_count',gacount)
        train.set_value(ind,'art_freq',(gasum+1)/(gacount+2))
        train.set_value(ind,'is_art_dump',1)

In [185]:
#prepare test artist feature
test = test.merge(oart,on='artist_name',how = 'left')
test['is_art_dump'] = 0
test['is_new_art'] = 0
testnan = test[pd.isnull(test.art_sum)]
test.loc[pd.isnull(test.art_sum),'is_new_art'] = 1
for name, group in testnan.groupby("artist_name"):
    subind = np.array([artdict[sub.strip()] for sub in split(name) if sub.strip() in train_art])
    if subind.size == 0:
        for ind in group.index:
            test.set_value(ind,'art_sum',0)
            test.set_value(ind,'art_count',1)
            test.set_value(ind,'art_freq',0.5)
    else:
        gasum = uasum[0,2*subind].sum()/len(subind)
        gacount = uasum[0,2*subind+1].sum()/len(subind)
        for ind in group.index:
            train.set_value(ind,'art_sum',gasum)
            test.set_value(ind,'art_count',gacount)
            test.set_value(ind,'art_freq',(gasum+1)/(gacount+2))
            test.set_value(ind,'is_art_dump',1)

In [188]:
#prepare train/artist feature
v['uafreq'] = (v.uasum+1)/(v.uacount+2)
train = train.merge(v,on = ['msno','artist_name'],how = 'left')
train.loc[train.uacount == 1,'uafreq'] = 0.5
train.loc[train.uacount == 1,'uasum'] = 0
for name,group in train[train.artist_name.isin(list(tt_cdump))].groupby('artist_name'):
    subcolin = np.array([artdict[sub.strip()] for sub in split(name)])
    vsum = uamat[:,2*subcolin][group.mind,:].sum(axis =1)
    vcount = uamat[:,2*subcolin+1][group.mind,:].sum(axis = 1)
    gi = group.index ; gs = group.shape[0]
    for i in range(gs):
        train.set_value(gi[i],'uasum',vsum[i]/len(subcolin))
        train.set_value(gi[i],'uacount',vcount[i]/len(subcolin))
        train.set_value(gi[i],'uafreq',(vsum[i]+len(subcolin))/(vcount[i]+2*len(subcolin)) 
                        if vcount[i] > len(subcolin) else 0.5)
train = train.drop(['mind','amind'],1)

In [189]:
#prepare test/artist feature
trainmsno = train.msno.unique()
test = test.merge(v,on = ['msno','artist_name'],how = 'left')
testnan = test[pd.isnull(test.uacount)]
testnu = testnan[~testnan.msno.isin(trainmsno)]
for name,group in testnu.groupby(['msno','artist_name']):
    subcolin = np.array([artdict[sub.strip()] for sub in split(name[1]) if sub.strip() in train_art])
    indic = len(subcolin) > 0 
    gasum = uasum[0,2*subcolin].sum()/len(subcolin) if indic else 1
    gacount = uasum[0,2*subcolin+1].sum()/len(subcolin) if indic else 1
    overallfreq = (gasum + len(subcolin))/(gacount + 2*len(subcolin)) if indic else 0.5
    for ind in group.index:
        test.set_value(ind,'uasum', gasum if indic else 0)
        test.set_value(ind,'uacount',gacount if indic else 1)
        test.set_value(ind,'uafreq',overallfreq)
testrem = test[pd.isnull(test.uafreq)]
testrem['mind']= testrem.msno.apply(lambda x: userdict[x])
for name,group in testrem.groupby('artist_name'):
    subcolin = np.array([artdict[sub.strip()] for sub in split(name) if sub.strip() in train_art])
    if len(subcolin) > 0:
        vsum = uamat[:,2*subcolin][group.mind,:].sum(axis = 1)
        vcount = uamat[:,2*subcolin+1][group.mind,:].sum(axis =1)
        gi = group.index
        for i in range(group.shape[0]):
            test.set_value(gi[i],'uafreq',(vsum[i]+len(subcolin))/(vcount[i] + 2*len(subcolin))
                           if vcount[i] > len(subcolin) else 0.5)
            test.set_value(gi[i],'uacount',vcount[i]/len(subcolin))
            test.set_value(gi[i],'uasum',vsum[i]/len(subcolin))
    else:
        for ind in group.index:
            test.set_value(ind,'uasum',0)
            test.set_value(ind,'uacount',1)
            test.set_value(ind,'uafreq',0.5)
test = test.drop(['mind','amind'],1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Genre Preparation: 
1. find the intersection of artist between the train and test
2. treat the compound artist seperately from single artist
3. if a single artist from test set hasn't appeared before, we can do nothing, however, if a compound artist has appeared, we can possibly estimate it from the single artist's contribution
4. add a dummy variable genre_estimated to show

In [190]:
train.genre_ids = train.genre_ids.fillna('-1')
test.genre_ids = test.genre_ids.fillna('-1')

In [146]:
train_genre = set()
for genre in train.genre_ids:
    train_genre.add(genre)
    if not pd.isnull(genre) and "|" in genre:
        for sub in genre.split("|"):
            train_genre.add(sub)
test_genre_diff = set()  #只有五个单个genre没有
for name in test.genre_ids.unique():
    if not pd.isnull(name) and not "|" in name and not name in train_genre:
        test_genre_diff.add(name)
    if not pd.isnull(name) and "|" in name:
        indic = [x in train_genre for x in name.split("|")]
        if not any(indic):
            test_genre_diff.add(name)

In [147]:
train_gsame = set()
for genre in test.genre_ids.unique():
    if not pd.isnull(genre) and "|" in genre and genre in train_genre:
        train_gsame.add(genre)

In [149]:
train_gdump = set()
for genre in train.genre_ids.unique():
    if not pd.isnull(genre) and "|" in genre and not genre in train_gsame:
        train_gdump.add(genre)

In [150]:
train_gsingle = [x for x in train.genre_ids.unique() if not pd.isnull(x) and not "|" in x]
#test_newsingle = [x for x in test.genre_ids.unique() if not pd.isnull(x) and not "|" in x and not x in train_gsingle and x in train_genre]
test_newsingle = set()
for x in test.genre_ids.unique():
    if not pd.isnull(x) and "|" in x:
        indc = [sub for sub in x.split("|") if sub in train_gsingle]
        if len(indc) == 0:
            test_newsingle.add(sub)

In [151]:
test_compnew = set()
for genre in test.genre_ids.unique():
    if not pd.isnull(genre) and "|" in genre and not genre in train_genre:
        for sub in genre.split("|"):
            if not sub in train_gsingle and sub in train_genre:
                test_compnew.add(genre) 
#sub= 275/864/857

In [152]:
#prepare user,genre dictionary
genredict = dict(zip(list(train_genre),range(len(train_genre))))
ugmat = sparse.lil_matrix((len(userdict),2*len(genredict)))
v = train.groupby(['msno','genre_ids'])['target'].agg(['sum','count'])
v = v.reset_index()
v.columns.values[2:] = ['ugsum','ugcount']
v['mind'] = v.msno.apply(lambda x: userdict[x])
v['gmind'] = v.genre_ids.apply(lambda x: genredict[x])
for i in v.index:
    ugmat[v.mind[i],2*v.gmind[i]] = v.ugsum[i]
    ugmat[v.mind[i],2*v.gmind[i]+1] = v.ugcount[i]
ugmat = sparse.csc_matrix(ugmat)
for dump in train.genre_ids.unique():
    subp = [dumpart for dumpart in dump.split("|") if dumpart not in train_gsingle]
    for sub in subp:
        ugmat[:,2*genredict[sub]] += ugmat[:,2*genredict[dump]]
        ugmat[:,2*genredict[sub]+1] += ugmat[:,2*genredict[dump]+1]



In [191]:
#prepare the train features on genre
ogenre = train.groupby('genre_ids')['target'].agg(['sum','count'])
ogenre = ogenre.reset_index()
ogenre.columns.values[1:3] = ['genre_sum','genre_count']
ogenre['genre_freq'] = (ogenre.genre_sum+1)/(ogenre.genre_count+2)
train.genre_ids = train.genre_ids.astype('str')
ogenre.genre_ids = ogenre.genre_ids.astype('str')
train = train.merge(ogenre,on='genre_ids',how = 'left')
train['is_new_genre'] = train.genre_count == 1
train.loc[train.genre_count ==1, 'genre_sum'] = 0
train.loc[train.genre_count ==1, 'genre_freq'] = 0.5
traindump = train[train.genre_ids.isin(list(train_gdump))]
train['is_genre_dump'] = 0
ugsum = ugmat.sum(axis = 0)
for name,group in traindump.groupby("genre_ids"):
    subind = np.array([genredict[sub] for sub in name.split("|")])
    gasum = ugsum[0,2*subind].sum()/len(subind)
    gacount = ugsum[0,2*subind+1].sum()/len(subind)
    for ind in group.index:
        train.set_value(ind,'genre_sum',gasum)
        train.set_value(ind,'genre_count',gacount)
        train.set_value(ind,'genre_freq',(gasum+1)/(gacount+2))
        train.set_value(ind,'is_genre_dump',1)

In [192]:
#prepare the test features on genre
test = test.merge(ogenre,on='genre_ids',how = 'left')
test['is_new_genre'] = 0
test['is_genre_dump'] = 0
test.loc[pd.isnull(test.genre_sum),'is_new_genre'] = 1
testnan = test[pd.isnull(test.genre_sum)]
for name, group in testnan.groupby("genre_ids"):
    if not "|" in name:
        for ind in group.index:
            test.set_value(ind,'genre_count',0)
            test.set_value(ind,'genre_freq',0.5)
    else:
        subind = np.array([genredict[sub] for sub in name.split("|") if sub in train_genre])
        gasum = ugsum[0,2*subind].sum()/len(subind)
        gacount = ugsum[0,2*subind+1].sum()/len(subind)
        for ind in group.index:
            test.set_value(ind,'genre_sum',gasum)
            test.set_value(ind,'genre_count',gacount)
            test.set_value(ind,'genre_freq',(gasum+1)/(gacount+2))
            test.set_value(ind,'is_genre_dump',1)
#test = test.drop('genre_sum',1)

In [195]:
#prepare the train feature on genre/user pair
v['ugfreq'] = (v.ugsum+1)/(v.ugcount+2)
train = train.merge(v,on = ['msno','genre_ids'],how = 'left')
train.loc[train.uacount == 1,'uafreq'] = 0.5
train.loc[train.uacount == 1,'uasum'] = 0
for name,group in train[train.genre_ids.isin(list(train_gdump))].groupby('genre_ids'):
    subcolin = np.array([genredict[sub.strip()] for sub in name.split("|")])
    vsum = ugmat[:,2*subcolin][group.mind,:].sum(axis =1)
    vcount = ugmat[:,2*subcolin+1][group.mind,:].sum(axis = 1)
    gi = group.index ; gs = group.shape[0]
    for i in range(gs):
        train.set_value(gi[i],'ugfreq',(vsum[i]+len(subcolin))/(vcount[i] + 2*len(subcolin))
                        if vcount[i] > len(subcolin) else 0.5)
        train.set_value(gi[i],'ugcount',vcount[i]/len(subcolin))
        train.set_value(gi[i],'ugsum',vsum[i]/len(subcolin))
train = train.drop(['mind','gmind'],1)

In [196]:
#prepare the test feature on genre/user pair
trainmsno = train.msno.unique()
test = test.merge(v,on = ['msno','genre_ids'],how = 'left')
testnan = test[pd.isnull(test.ugcount)]
testnu = testnan[~testnan.msno.isin(trainmsno)]
for name,group in testnu.groupby(['msno','genre_ids']):
    subcolin = np.array([genredict[sub.strip()] for sub in name[1].split("|") if sub in train_genre])
    indic = len(subcolin) > 0
    gasum = ugsum[0,2*subcolin].sum()/len(subcolin) if indic else 1
    gacount = ugsum[0,2*subcolin+1].sum()/len(subcolin) if indic else 1
    overallfreq = (gasum + len(subcolin))/(gacount + 2*len(subcolin)) if indic else 0.5
    for ind in group.index:
        test.set_value(ind,'ugcount',gacount if indic else 1)
        test.set_value(ind,'ugsum',gasum if indic else 0)
        test.set_value(ind,'ugfreq',overallfreq)
testrem = test[pd.isnull(test.ugfreq)]
testrem['mind']= testrem.msno.apply(lambda x: userdict[x])
for name,group in testrem.groupby('genre_ids'):
    subcolin = np.array([genredict[sub.strip()] for sub in name.split("|") if sub in train_genre])
    if len(subcolin) > 0:
        vsum = ugmat[:,2*subcolin][group.mind,:].sum(axis = 1)
        vcount = ugmat[:,2*subcolin+1][group.mind,:].sum(axis =1)
        gi = group.index
        for i in range(group.shape[0]):
            test.set_value(gi[i],'ugfreq',(vsum[i]+len(subcolin))/(vcount[i] + 2*len(subcolin))
                           if vcount[i] > len(subcolin) else 0.5)
            test.set_value(gi[i],'ugcount',vcount[i]/len(subcolin))
            test.set_value(gi[i],'ugsum',vsum[i]/len(subcolin))
    else:
        for ind in group.index:
            test.set_value(ind,'ugsum',0)
            test.set_value(ind,'ugcount',1)
            test.set_value(ind,'ugfreq',0.5)
test = test.drop(['mind','gmind'],1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#compute the cosine similarity based songs score

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
rowin = train['msno'].unique()
colin = train['song_id'].unique()
spm = sparse.lil_matrix((len(rowin),len(colin)))
dictuser = dict(zip(rowin,range(len(rowin))))
dictsong = dict(zip(colin,range(len(colin))))
for ind in train.index:
    userp = dictuser[train.msno[ind]]
    songp = dictsong[train.song_id[ind]]
    spm[userp,songp] = (1 if train.target[ind] == 1 else -1)
u_simi_song= cosine_similarity(spm)
#spm = sparse.csr_matrix(spm)

In [7]:
train['usbs'] = 0.0
train['usbs_is_estimate'] = 0
train['uind'] = train['msno'].apply(lambda x: dictuser[x])
train['rating'] = train['target'].apply(lambda x: 1 if x == 1 else -1)
train_gr = train.groupby('song_id')
for name,group in train_gr:
    num = group.shape[0]
    if num > 1:
        for ind in group.index:
            sims = u_simi_song[group.uind[ind],group.uind]
            ra = (np.dot(sims,group.rating)-group.rating[ind])/(num-1)
            train.set_value(ind,'usbs',ra)
#     else:
#         train.set_value(group.index[0],'usbs',group.uafreq)
#         train.set_value(group.index[0],'usbs_is_estimate',1)

In [6]:
#test usbs
test['usbs'] = 0.0
test['usbs_is_estimate'] = 0
for ind in test[~test.song_id.isin(train.song_id.unique()) & ~test.msno.isin(train.msno.unique())].index:
    test.set_value(ind,'usbs',0.5)
    test.set_value(ind,'usbs_is_estimate',1)
for ind in test[~test.song_id.isin(train.song_id.unique()) & test.msno.isin(train.msno.unique())].index:
    test.set_value(ind,'usbs',test.uafreq[ind])
    test.set_value(ind,'usbs_is_estimate',1)
for ind in test[test.song_id.isin(train.song_id.unique()) & ~test.msno.isin(train.msno.unique())].index:
    test.set_value(ind,'usbs',test.song_freq[ind])
    test.set_value(ind,'usbs_is_estimate',1)
testall = test[test.song_id.isin(train.song_id.unique()) & test.msno.isin(train.msno.unique())]
testall['uind'] = testall.msno.apply(lambda x: dictuser[x])
for name,group in testall.groupby('song_id'):
    trainind = train.loc[train.song_id == name,'uind']
    for ind in group.index:
        sims = u_simi_song[group.uind[ind],trainind]
        ra = np.dot(sims,group.rating)/trainind.size
        test.set_value(ind,'usbs',ra)

In [20]:
testall = test[test.song_id.isin(train.song_id.unique()) & test.msno.isin(train.msno.unique())]
testall['uind'] = testall.msno.apply(lambda x: dictuser[x])
for name,group in testall.groupby('song_id'):
    trainind = train.loc[train.song_id == name,['uind','rating']]
    for ind in group.index:
        sims = u_simi_song[group.uind[ind],trainind.uind]
        ra = np.dot(sims,trainind.rating)/trainind.shape[0]
        test.set_value(ind,'usbs',ra)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


KeyboardInterrupt: 

In [26]:
for key in testall.msno.unique():
    

(2556790, 13)

In [197]:
#member clean
#How many days has they been active
members['rtime'] = members['registration_init_time'].apply(
    lambda x : pd.to_datetime(str(x), format='%Y%m%d'))
members['etime'] = members['expiration_date'].apply(
    lambda x : pd.to_datetime(str(x), format='%Y%m%d'))
members['numactivedays'] = (members['etime'] - members['rtime']).dt.days
min_day = members['rtime'].min()
members['day_id_res'] = (members['rtime'] - min_day).dt.days+1
members['day_id_exp'] = (members['etime'] - min_day).dt.days+1
members['r_year'] = members['rtime'].dt.year
members['r_month'] = members['rtime'].dt.month
members['r_day'] = members['rtime'].dt.day
members['e_year'] = members['etime'].dt.year
members['e_month'] = members['etime'].dt.month
members['e_day'] = members['etime'].dt.day

In [198]:
#cleaned age and cleaned_gender
members['is_male'] = members.gender == 'male'
members['is_female'] = members.gender == 'female'
members['age_mtrue'] = members.bd.apply(lambda x: 1 if x > 10 and x < 80 else 0)
weighted_age = round(members.bd[members.age_mtrue == 1].mean())
members['age_clean'] = members.bd.apply(lambda x: x if x >10 and x < 80 else weighted_age)

In [199]:
train = train.merge(members,on = 'msno', how = 'left')
test = test.merge(members,on = 'msno', how = 'left')

In [200]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

In [201]:
songs_extra = pd.read_csv('song_extra_info.csv')
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

train = train.merge(songs_extra, on='song_id', how='left')
test = test.merge(songs_extra, on='song_id', how='left')

In [60]:
train = train.drop(['uind','rating'],1)
ttrain = train.drop(['usbs','usbs_is_estimate'],1)
ttest = test.drop('id',1)

In [224]:
ttrain = train
ttest = test.drop('id',1)

In [225]:
columns_keep = ['user_count', 'user_freq', 'song_count','song_freq', 'art_count', 'art_freq', 'is_art_dump', 'uacount','uafreq', 
'genre_count', 'genre_freq', 'is_genre_dump', 'ugcount','ugfreq','song_length','is_male','is_female','age_mtrue',
'age_clean','song_year','numactivedays','source_system_tab','source_type','registered_via','source_screen_name','city',
'day_id_res','day_id_exp']

In [226]:
ttest = test[columns_keep]
ttrain = train[columns_keep+['target']]

In [227]:
ttest['song_length'] = round(ttest['song_length']/60000,2)
ttrain['song_length'] = round(ttrain['song_length']/60000,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [228]:
def dataexpand(column,ttrain,ttest):
    stored = pd.get_dummies(pd.concat([ttrain[column],ttest[column]]))
    stored.columns = [column+'_'+str(i) for i in stored.columns.values]
    ttrain = pd.concat([ttrain,stored.iloc[:ttrain.shape[0],:]],axis = 1)
    ttest = pd.concat([ttest,stored.iloc[ttrain.shape[0]:,:]],axis = 1)
    ttrain = ttrain.drop(column,1)
    ttest = ttest.drop(column,1)
    return ttrain,ttest

In [229]:
for column in ['source_system_tab','source_type','registered_via','source_screen_name','city']:
    print(column)
    ttrain,ttest = dataexpand(column,ttrain,ttest)

source_system_tab
source_type
registered_via
source_screen_name
city


In [243]:
for column in ttrain.columns:
    if any(pd.isnull(ttrain[column])):
        print(column)

In [233]:
for column in ['song_length','song_year']:
    ttrain['nan_'+column] = pd.isnull(ttrain[column])
    ttest['nan_'+column] = pd.isnull(ttest[column])
    ml = pd.concat([ttrain[column],ttest[column]]).mean()
    print(ml)
    ttrain.loc[:,column] = ttrain.loc[:,column].fillna(ml)
    ttest.loc[:,column] = ttest.loc[:,column].fillna(ml)

4.07982894421
2011.2265781


In [235]:
from sklearn.preprocessing import scale
for column in ['user_count','song_count','art_count','uacount','genre_count','ugcount','song_length','age_clean','song_year',
              'numactivedays','day_id_res','day_id_exp']:
    print(column)
    nmax = max(ttrain[column].max(),ttest[column].max())
    nmin = min(ttrain[column].min(),ttest[column].min())
    diff = nmax - nmin
    ttrain[column] = (ttrain[column] - nmin)/diff
    ttest[column] = (ttest[column] - nmin)/diff

user_count
song_count
art_count
uacount
genre_count
ugcount
song_length
age_clean
song_year
numactivedays
day_id_res
day_id_exp


In [236]:
ranind = np.random.permutation(range(ttrain.shape[0]))
thres = round(ttrain.shape[0]*0.75)
m = ttrain.drop('target',1).iloc[ranind[:thres],:]
n = ttrain.drop('target',1).iloc[ranind[thres:],:]

In [242]:
any(pd.isnull(ttrain))

True

In [264]:
from sklearn.linear_model import LogisticRegression

for pe in [0.05,0.1,0.5,1]:
    print(pe)
    model = LogisticRegression(C = pe,solver = 'sag')
    model.fit(m,train.target[ranind[:thres]])
    v = model.predict_proba(n)[:,1]
    fpr, tpr, thresholds = roc_curve(train.target[ranind[thres:]],v)
    roc_auc = auc(fpr,tpr)
    print(roc_auc)

0.01


NameError: name 'roc_curve' is not defined

In [261]:
from sklearn.linear_model import LogisticRegression
for pe in [1]:
    print(pe)
    model = LogisticRegression(penalty = 'l1',C = pe,solver = 'saga')
    model.fit(m,train.target[ranind[:thres]])
    v = model.predict_proba(n)[:,1]
    fpr, tpr, thresholds = roc_curve(train.target[ranind[thres:]],v)
    roc_auc = auc(fpr,tpr)
    print(roc_auc)

1




0.869685176733


In [263]:
c = model.predict_proba(ttest)

In [270]:
model.fit(ttrain.drop('target',1),ttrain.target)
c2 = model.predict(ttest)[:,1]



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

In [281]:
(c2 - l).min()

-0.71015704503128485

In [273]:
ttest.columns.values[(model.coef_ > 0.01)[0]]

array(['user_freq', 'song_count', 'song_freq', 'uafreq', 'ugfreq',
       'source_system_tab_my library', 'source_system_tab_search',
       'source_type_local-playlist',
       'source_screen_name_Local playlist more'], dtype=object)

In [274]:
sel_column = ['user_count', 'user_freq', 'song_count', 'song_freq', 'uacount','uafreq', 'ugcount', 'ugfreq', 'is_female', 'age_clean',
'song_year', 'day_id_res', 'day_id_exp', 'source_type_artist','source_type_local-library', 'source_type_local-playlist',
'source_type_topic-article-playlist', 'registered_via_7','registered_via_13', 'source_screen_name_Discover New','source_system_tab_my library', 
'source_system_tab_search','source_screen_name_My library_Search', 'source_screen_name_Search','nan_year']

In [244]:
from sklearn import svm
rbf_svc = svm.SVC(kernel='polynomial',C = 0.5)
model.fit(m,train.target[ranind[:thres]])
v = model.predict_proba(n)[:,1]
fpr, tpr, thresholds = roc_curve(train.target[ranind[thres:]],v)
roc_auc = auc(fpr,tpr)

NameError: name 'model' is not defined

In [297]:
dropcol = ['user_count', 'user_freq', 'song_count', 'song_freq', 'art_count',
       'art_freq', 'is_art_dump', 'uacount', 'uafreq', 'genre_count',
       'genre_freq', 'is_genre_dump', 'ugcount', 'ugfreq']

In [358]:
c = model.predict_proba(ttest)[:,1]

In [360]:
c[:100]

array([ 0.50000561,  0.50001856,  0.49999564,  0.49999965,  0.49999438,
        0.49999967,  0.5       ,  0.5000073 ,  0.50000121,  0.50001479,
        0.50002172,  0.49999818,  0.50000394,  0.49999356,  0.49999914,
        0.49999667,  0.49999415,  0.4999934 ,  0.49999966,  0.49999728,
        0.5000314 ,  0.5000055 ,  0.50001058,  0.50000741,  0.500025  ,
        0.50002613,  0.50002501,  0.50002463,  0.50002502,  0.49999744,
        0.50002591,  0.5000273 ,  0.50002568,  0.50002513,  0.50000334,
        0.49998411,  0.50002093,  0.49998393,  0.50002068,  0.50001901,
        0.50000894,  0.5000036 ,  0.50000896,  0.50001199,  0.50000892,
        0.50001401,  0.49997602,  0.500006  ,  0.50000642,  0.50000606,
        0.50000149,  0.50001165,  0.50001373,  0.50000753,  0.50001481,
        0.50000877,  0.5000032 ,  0.49999691,  0.49999309,  0.50002371,
        0.50000816,  0.50001859,  0.49999894,  0.50000013,  0.49998628,
        0.50000797,  0.49999656,  0.50000695,  0.50000855,  0.50

In [307]:
model = (C = pe,solver = 'sag')
model.fit(m[dropcol],train.target[ranind[:thres]])
v = model.predict_proba(n[dropcol])[:,1]
fpr, tpr, thresholds = roc_curve(train.target[ranind[thres:]],v)
roc_auc = auc(fpr,tpr)

0.79007948025688468

In [255]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(criterion = 'entropy',max_depth=8)

In [256]:
clf.fit(m,train.target[ranind[:thres]])
v = clf.predict_proba(n)[:,1]
fpr, tpr, thresholds = roc_curve(train.target[ranind[thres:]],v)
roc_auc = auc(fpr,tpr)

In [258]:
l = clf.predict_proba(ttest)[:,1]

In [257]:
roc_auc

0.86096054951812229

In [370]:
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
plt.scatter(train.art_freq,train.target)

<matplotlib.collections.PathCollection at 0x39db7c358>

In [None]:
from sklearn import ensemble
gbes = ensemble.GradientBoostingClassifier(n_estimators=n_estimators,
                                           validation_fraction=0.2,
                                           n_iter_no_change=5, tol=0.01,
                                           random_state=0)


In [303]:
subm = pd.DataFrame()
subm['id'] = test.id
subm['target'] = p_test_1
subm.to_csv("hope.csv.gz",compression = 'gzip',index = False)

In [220]:
subm = pd.DataFrame()
subm['id'] = test.id
subm['target'] = 1-l
subm.to_csv("hope.csv.gz",compression = 'gzip',index = False)

In [182]:
for column in ttrain.columns:
    if any(pd.isnull(ttrain[column])):
        print(column)

song_length
song_year


In [187]:
ttrain['nan_length'] = pd.isnull(train.song_length)
ttrain['nan_year'] = pd.isnull(train.song_year)

In [283]:
print ("Train test and validation sets")
for col in train.columns:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')


X_train = train.drop(['target'], axis=1)
y_train = train['target'].values


X_test = test.drop(['id'], axis=1)
ids = test['id'].values

X_train = X_train.drop(['rtime','etime'],1)
X_test = X_test.drop(['rtime','etime'],1)

# del train, test; gc.collect();

d_train_final = lgb.Dataset(X_train, y_train)
watchlist_final = lgb.Dataset(X_train, y_train)
print('Processed data...')

Train test and validation sets


NameError: name 'lgb' is not defined

In [293]:
d_train_final = lgb.Dataset(X_train, y_train)
watchlist_final = lgb.Dataset(X_train, y_train)

In [291]:
X_train.song_year

AttributeError: 'Series' object has no attribute 'type'

In [304]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'bagging_fraction': 0.75,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 8,
        'num_rounds': 20,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)



[5]	valid_0's auc: 0.884988
[10]	valid_0's auc: 0.889976
[15]	valid_0's auc: 0.892889
[20]	valid_0's auc: 0.895368
CPU times: user 1min 41s, sys: 876 ms, total: 1min 42s
Wall time: 37 s


In [309]:
model_f1.feature_importance(importance_type='split')

array([58, 54,  2, 24, 23,  0,  5, 12,  3,  3,  0,  1,  6,  8,  0, 33, 15,
       60,  0, 15,  2, 15,  0,  0, 51, 45, 73,  1,  0,  1,  0,  0, 27, 23,
       40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0])

In [300]:
print('Making predictions')
p_test_1 = model_f1.predict(X_test)

Making predictions


In [302]:
p_test_1[:10]

array([ 0.34792644,  0.45328587,  0.10034968,  0.10826129,  0.04545351,
        0.13946447,  0.30898531,  0.87795143,  0.2658625 ,  0.13572496])