In [17]:
import pandas as pd

In [18]:
songs = pd.read_csv('data/songs.csv')
members = pd.read_csv('data/members.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

## Dataset

In [7]:
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


In [8]:
songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [6]:
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


## Models

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [20]:
train_df, test_df = train_test_split(train)

### Popularity baseline

In [47]:
X = train_df[['song_id', 'target']].groupby('song_id').mean()
X['target'] = X.target.apply(lambda r: 1 if r > 0.5 else 0)

In [54]:
test_df = test_df.merge(X, left_on='song_id', right_index=True)

In [56]:
precision_recall_fscore_support(test_df.target_x, test_df.target_y)

(array([ 0.59383738,  0.58082497]),
 array([ 0.50902062,  0.66148667]),
 array([ 0.54816753,  0.61853719]),
 array([884972, 910168]))

### Better model

In [92]:
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

In [69]:
train_sample = train.sample(1000)

In [70]:
X = pd.merge(train_sample, songs, left_on='song_id', right_on='song_id').drop('target', axis=1)
y = train_sample['target']

In [71]:
def serie_to_categorical(serie):
    h = FeatureHasher(input_type='string', n_features=serie.nunique())
    return h.transform(serie.tolist())

In [76]:
X.source_system_tab.fillna('unknown', inplace=True)
X.source_screen_name.fillna('unknown', inplace=True)
X.source_type.fillna('unknown', inplace=True)
# X.genre_ids = X.genre_ids.astype(int) # TODO: parse genre_ids

In [75]:
M = hstack([
            serie_to_categorical(X.msno),
            serie_to_categorical(X.song_id),
            serie_to_categorical(X.source_type),
            serie_to_categorical(X.source_screen_name),
            serie_to_categorical(X.source_system_tab),
            serie_to_categorical(X.source_screen_name),
            serie_to_categorical(X.source_type),
            serie_to_categorical(X.artist_name),
            csr_matrix(X.language.values).T,
            # csr_matrix(X.genre_ids.values).T
           ])

In [77]:
M

<1000x2440 sparse matrix of type '<type 'numpy.float64'>'
	with 106886 stored elements in COOrdinate format>

In [90]:
lr = LogisticRegression()
lr.fit(M,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [93]:
precision_recall_fscore_support(y, lr.predict(M))

(array([ 0.71458774,  0.71157495]),
 array([ 0.68979592,  0.73529412]),
 array([ 0.701973  ,  0.72324012]),
 array([490, 510]))