In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Dataset

In [4]:
members = pd.read_csv('data/members.csv')
songs = pd.read_csv('data/songs.csv')

In [5]:
train = pd.read_csv('data/train.csv').sample(5000)

In [6]:
def clean_df(df):
    df = df.merge(songs, on='song_id')
    df['song_length'] = pd.cut(df['song_length'],
                               3,
                               labels=['short', 'regular', 'long'])

    for col in ['source_system_tab','source_screen_name','source_type',
                'artist_name','composer','lyricist', 'genre_ids']:
        df[col].fillna('', inplace=True)
        
    return df

In [7]:
train = clean_df(train)

In [8]:
train_df, test_df = train_test_split(train)

In [9]:
train_df.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language
4414,w1ZqQL8G5SGzk0njrRc2J/eO15ZTkZrFXw6nKcS3Qq4=,fBayxIefRsoVZlB29G6vjIHa5h1B1qJyaGI0kwrVERc=,my library,Local playlist more,local-playlist,0,short,465,陳零九 (Nine Chen),陳零九,陳零九,3.0
4173,saSeBaJPCD8I/jKIWru5oSyqgHIPw1Lizc3blZKOANM=,caXQoad48M9jS7s0V227KI5i0HnARffYKOxe0fYVtrM=,my library,Local playlist more,local-library,1,short,359,Coldplay,Steven Tyler| Joseph Perry| Desmond Child,,52.0
4696,Ue5tWTU5pxrj7IwOcs7d88M6ICnUdSpFSr7FtUlcyrc=,uuv/oI/dfOrHOHwDN9co6xyS484UCmel5TPfqOWCiU8=,my library,Local playlist more,local-library,0,short,465,Rihanna,Alexander Grant| Skylar Grey| Marshall Mathers,,52.0
3031,OBhCuGr9/Vif35ztzJ8XD0oJhZP1tFxBhmGfsvlZcEc=,MbUrDuSZUHl97oQwk/GxAC+AIOxQd0wMpeBI4BU4PT8=,my library,Local playlist more,local-library,1,short,465,那英,Hsu Kuang-Yi,Hsu Kuang-Yi,3.0
934,xk73CNHhT4j+oE8yxR438NApz29Rxl9MVWnapgjqQ+M=,A81HcHeN2CTw6r/4lTdcjY3hBU6Tg3ZKKBs0D1C79cQ=,discover,Discover Genre,online-playlist,0,short,2122,Lisa Ono (小野リサ),Gonzalo Curiel,,-1.0


In [10]:
msno_vectorizer = CountVectorizer()
song_id_vectorizer = CountVectorizer()
source_system_tab_vectorizer = CountVectorizer()
source_screen_name_vectorizer = CountVectorizer()
source_type_vectorizer = CountVectorizer()
song_length_vectorizer = CountVectorizer()
genre_ids_vectorizer = CountVectorizer()
artist_name_vectorizer = CountVectorizer()
composer_vectorizer = CountVectorizer()
lyricist_vectorizer = CountVectorizer()
language_vectorizer = CountVectorizer()

In [13]:
msno_vectorizer.fit(train_df['msno'])
song_id_vectorizer.fit(train_df['song_id'])
source_system_tab_vectorizer.fit(train_df['source_system_tab'])
source_screen_name_vectorizer.fit(train_df['source_screen_name'])
source_type_vectorizer.fit(train_df['source_type'])
song_length_vectorizer.fit(train_df['song_length'])
genre_ids_vectorizer.fit(train_df['genre_ids'].astype(str))
artist_name_vectorizer.fit(train_df['artist_name'])
composer_vectorizer.fit(train_df['composer'])
lyricist_vectorizer.fit(train_df['lyricist'])
language_vectorizer.fit(train_df['language'].astype(str))

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
def make_X_and_y(df):
    X = hstack([msno_vectorizer.transform(df['msno']),
                song_id_vectorizer.transform(df['song_id']),
                source_system_tab_vectorizer.transform(df['source_system_tab']),
                source_screen_name_vectorizer.transform(df['source_screen_name']),
                source_type_vectorizer.transform(df['source_type']),
                song_length_vectorizer.transform(df['song_length']),
                genre_ids_vectorizer.transform(df['genre_ids'].astype(str)),
                artist_name_vectorizer.transform(df['artist_name']),
                composer_vectorizer.transform(df['composer']),
                lyricist_vectorizer.transform(df['lyricist']),
                language_vectorizer.transform(df['language'].astype(str)),])
    y = df['target'].values
    
    return X,y

In [15]:
X_train, y_train = make_X_and_y(train_df)
X_test, y_test = make_X_and_y(test_df)

In [16]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3750, 20190), (3750,), (1250, 20190), (1250,))

# Models

## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
lr = LogisticRegression(C=10)

In [19]:
lr.fit(X_train, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
accuracy_score(y_pred=lr.predict(X_test),
               y_true=y_test)

0.59360000000000002

## Neural Network

In [21]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [24]:
model = Sequential()
model.add(Dense(64,
                activation='sigmoid',
                input_dim=X_train.shape[1]))
model.add(Dropout(0.4))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [25]:
model.fit(X_train.todense(),
          y_train,
          epochs=5,
          validation_data=(X_test.todense(),y_test))

Train on 3750 samples, validate on 1250 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11f04f790>