# Install necessary libraries:
    conda install scikit-learn
    conda install -c anaconda pandas
    conda install seaborn
    conda install -c anaconda numpy
    conda install -c conda-forge matplotlib 


In [1]:
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics, ensemble
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt


In [2]:
data=pd.read_csv('train.csv')
len(data)

7377418

In [3]:
data=data.sample(frac=0.4)
len(data)

2950967

In [4]:
songs=pd.read_csv('songs.csv')
data=pd.merge(data,songs,on='song_id',how='left')
del songs

In [5]:
members=pd.read_csv('members.csv')
data=pd.merge(data,members,on='msno',how='left')
del members

In [6]:
##check the NA in the record
data.isnull().sum()/data.isnull().count()*100

msno                       0.000000
song_id                    0.000000
source_system_tab          0.338330
source_screen_name         5.634153
source_type                0.291972
target                     0.000000
song_length                0.001355
genre_ids                  1.603000
artist_name                0.001355
composer                  22.694527
lyricist                  43.038062
language                   0.001898
city                       0.000000
bd                         0.000000
gender                    40.145180
registered_via             0.000000
registration_init_time     0.000000
expiration_date            0.000000
dtype: float64

In [7]:
##replace NA
for i in data.select_dtypes(include=['object']).columns:
    data[i][data[i].isnull()]='unknown'
data=data.fillna(value=0)
##check the NA in the record again
data.isnull().sum()/data.isnull().count()*100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


msno                      0.0
song_id                   0.0
source_system_tab         0.0
source_screen_name        0.0
source_type               0.0
target                    0.0
song_length               0.0
genre_ids                 0.0
artist_name               0.0
composer                  0.0
lyricist                  0.0
language                  0.0
city                      0.0
bd                        0.0
gender                    0.0
registered_via            0.0
registration_init_time    0.0
expiration_date           0.0
dtype: float64

In [8]:
##format and read date
data.registration_init_time=pd.to_datetime(data.registration_init_time, format='%Y%m%d',errors='ignore')
data['registration_init_time_year']=data['registration_init_time'].dt.year
data['registration_init_time_month']=data['registration_init_time'].dt.month
data['registration_init_time_day']=data['registration_init_time'].dt.day

data.expiration_date=pd.to_datetime(data.expiration_date, format='%Y%m%d',errors='ignore')
data['expiration_date_year']=data['expiration_date'].dt.year
data['expiration_date_month']=data['expiration_date'].dt.month
data['expiration_date_day']=data['expiration_date'].dt.day

data['registration_init_time']=data['registration_init_time'].astype('category')
data['expiration_date']=data['expiration_date'].astype('category')

for col in data.select_dtypes(include=['object']).columns:
    data[col]=data[col].astype('category')
    
for col in data.select_dtypes(include=['category']).columns:
    data[col]=data[col].cat.codes

In [9]:
#implementing PCA

from sklearn.decomposition import PCA

In [10]:
pca=PCA(n_components=2)
pca.fit(data[data.columns[data.columns != 'target']])

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [12]:
#random forest classifier, exclude the column of "target"
model=ensemble.RandomForestClassifier(n_estimators=250,max_depth=25)
model.fit(data[data.columns[data.columns != 'target']],data.target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
data_plot=pd.DataFrame({'features':data.columns[data.columns != 'target'],
                        'importances':model.feature_importances_})
data_plot=data_plot.sort_values('importances',ascending=False)

In [16]:
#drop columns with less importance
data=data.drop(data_plot.features[data_plot.importances < 0.05].tolist(),1)

In [17]:
data.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'artist_name', 'composer',
       'registration_init_time', 'expiration_date'],
      dtype='object')

# droped columns with low importance
genre_ids                 
lyricist                  
language                  
city                      
bd                        
gender                    
registered_via            

In [18]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=4)

model = ensemble.RandomForestClassifier(n_estimators=250, max_depth=25)
for train_indices, val_indices in kf.split(data):
    model.fit(data.drop(['target'],axis=1).loc[train_indices,:],data.loc[train_indices,'target'])

# Test with the optimized classifier

In [20]:
data_test=pd.read_csv('test.csv')
songs=pd.read_csv('songs.csv')
data_test=pd.merge(data_test,songs,on='song_id',how='left')
del songs
members=pd.read_csv('members.csv')
data_test=pd.merge(data_test,members,on='msno',how='left')
del members
data_test.isnull().sum()/data_test.isnull().count()*100
##replace NA
for i in data_test.select_dtypes(include=['object']).columns:
    data_test[i][data_test[i].isnull()]='unknown'
data_test=data_test.fillna(value=0)
##format and read date
data_test.registration_init_time=pd.to_datetime(data_test.registration_init_time, format='%Y%m%d',errors='ignore')
data_test.expiration_date=pd.to_datetime(data_test.expiration_date, format='%Y%m%d',errors='ignore')

data_test['registration_init_time']=data_test['registration_init_time'].astype('category')
data_test['expiration_date']=data_test['expiration_date'].astype('category')
#Object data to category
for col in data_test.select_dtypes(include=['object']).columns:
    data_test[col]=data_test[col].astype('category')
    
for col in data_test.select_dtypes(include=['category']).columns:
    data_test[col]=data_test[col].cat.codes
#drop clomuns with low importances
data_test=data_test.drop(['genre_ids','lyricist','language','city','bd','gender','registered_via'],1)


data_test.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Index(['id', 'msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'song_length', 'artist_name', 'composer',
       'registration_init_time', 'expiration_date'],
      dtype='object')

In [21]:
data.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'artist_name', 'composer',
       'registration_init_time', 'expiration_date'],
      dtype='object')

In [23]:
predictions =np.zeros(shape=[len(data_test)])
predictions += model.predict(data_test.drop(['id'],axis=1))
predictions=predictions/4

submission=pd.read_csv('sample_submission.csv')
submission.target=predictions
submission.to_csv('submission.csv',index=False)


# Report

In [None]:
#roc curve
from sklearn import metrics
from sklearn.metrics import roc_curve,auc
test=data[int(data['target'].count()*(1-0.4)):data['target'].count()]
data=data[0:int(data['target'].count()*(1-0.4))]
x=test.drop(['target'],axis=1)
predict_roc=model.predict(x)

fpr,tpr,threshold=roc_curve(test['target'],predict_roc)
roc_auc=auc(fpr,tpr)
roc_auc
plt.plot(fpr,tpr)
plt.show()

In [None]:
from sklearn.metrics import classification_report
targets=['0','1']
X=data['target'].values
print(classification_report(X,predict_roc,target_names=targets))