### Import Modules

In [None]:
"""
Importing modules
"""

import numpy as np
import datetime
import re
import math
import random
import pickle as pkl
from IPython import get_ipython
#get_ipython().run_line_magic('pylab inline', 'config InlineBackend.figure_formats = ['retina']'

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#sns.set()
import scipy.stats as stats
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, KFold, learning_curve, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, accuracy_score, f1_score, confusion_matrix, fbeta_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb

from collections import defaultdict

### Reduce Memory Usage

In [None]:
def reduce_mem_usage(props):
    """
    Downcast the numeric columns of a DataFrame to the smallest dtype that fits.

    Columns whose values are all whole numbers are converted to the narrowest
    signed/unsigned integer dtype that holds their range; every other numeric
    column is converted to float32. Integer dtypes cannot hold NaN, so missing
    values are first replaced by ``min - 1`` of the column. Non-numeric columns
    are left untouched. Progress and memory statistics are printed.

    Parameter props: DataFrame to shrink (its columns are replaced in place)
    Precondition: props is a Pandas DataFrame

    Returns: the same DataFrame
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        # Only numeric columns can be downcast; strings, dates, categories are skipped.
        if not pd.api.types.is_numeric_dtype(props[col].dtype):
            continue

        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        values = props[col]

        # Integer does not support NA, therefore, NA needs to be filled.
        if values.isna().any():
            NAlist.append(col)
            values = values.fillna(values.min() - 1)

        # Range is taken AFTER filling so the sentinel (min - 1) is accounted
        # for; otherwise a -1 sentinel could be cast to an unsigned dtype.
        mx = values.max()
        mn = values.min()

        # A column is integer-like only if EVERY value is a whole number.
        # (Summing the differences would let +0.5 and -0.5 cancel out.)
        IsInt = False
        if np.isfinite(values).all():
            IsInt = bool((values == values.astype(np.int64)).all())

        if IsInt:
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    props[col] = values.astype(candidate)
                    break
            else:
                # No integer dtype fits (e.g. empty column): keep the values as they are.
                props[col] = values
        else:
            # Make float datatypes 32 bit
            props[col] = values.astype(np.float32)

        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    print("Columns with missing values filled in: ",NAlist)
    return props

### Load Files from csv into Dataframe

In [None]:
def loadfiles12(data_dir = '/home/ubuntu/Project_3/kkbox-music-recommendation-challenge'):
    """
    Load the five challenge csv files into pandas DataFrames.

    Parameter data_dir: Directory containing songs.csv, song_extra_info.csv,
    members.csv, train.csv and test.csv. Defaults to the original project path.
    Precondition: data_dir is a path string

    Returns: df_songs, df_songs_extra, df_members, df_train, df_test
    """
    import os

    file_names = ('songs.csv', 'song_extra_info.csv', 'members.csv', 'train.csv', 'test.csv')
    df_songs, df_songs_extra, df_members, df_train, df_test = [
        pd.read_csv(os.path.join(data_dir, file_name), low_memory = True)
        for file_name in file_names
    ]

    return df_songs, df_songs_extra, df_members, df_train, df_test

### Data Wrangling

In [None]:
def labelencoding(df):
    """
    Replace each categorical/text feature by integer codes.

    A fresh LabelEncoder is fitted per column, so codes are not comparable
    across columns. The DataFrame is modified in place and returned.
    """
    encoded_features = ('msno', 'song_id', 'gender', 'language','Registration', 'Expiration Date',
                        'genre_ids', 'artist_name', 'composer', 'lyricist', 'name',
                        'source_system_tab', 'source_screen_name','source_type')
    for column in encoded_features:
        encoder = preprocessing.LabelEncoder()
        df[column] = encoder.fit(df[column]).transform(df[column])
    return df

In [None]:
def members_convrt(df_members):
    """
    Convert the members' registration and expiration dates to datetimes and
    add the membership duration.

    The raw 'registration_init_time' and 'expiration_date' columns (YYYYMMDD)
    are replaced by 'Registration' and 'Expiration Date'. A new column
    'Registered Timeframe (days)' holds the number of days between the two.
    The 'bd' (age) column is not touched here; see bdconvert for that.

    Parameter df_members: Members dataframe (modified in place)

    Returns: Members DataFrame
    """
    df_members['Registration'] = pd.to_datetime(df_members['registration_init_time'], format = '%Y%m%d')
    df_members['Expiration Date'] = pd.to_datetime(df_members['expiration_date'], format = '%Y%m%d')
    df_members.drop(columns = ['registration_init_time','expiration_date'], inplace = True)

    # Compute registered timeframe (vectorised instead of a per-row apply)
    membership_span = df_members['Expiration Date'] - df_members['Registration']
    df_members['Registered Timeframe (days)'] = membership_span.dt.days

    return df_members


In [None]:
def genderconvert(df):
    """
    Replace missing values in the 'gender' column by -1 (in place).

    The column is reassigned instead of using a chained inplace call, which
    does not write back to the DataFrame under pandas Copy-on-Write.
    """
    df['gender'] = df['gender'].fillna(-1)

In [None]:
def bdconvert(df_members, df_train, df_songs):
    """
    Convert 0's in 'bd' column (age) into the mean of people listening to same
    genres and using same way of registration.

    The members are first joined to the training rows and the songs' genres,
    so the result has one row per (msno, song_id) pair. For every
    (registered_via, genre_ids) group the mean of the non-zero ages is
    computed and written into the rows of that group whose age is 0. Groups
    without any non-zero age keep their 0's. The result is pickled to
    'members.pkl' and also returned.

    Parameter df_members: Members dataframe with 'msno', 'bd', 'registered_via'
    Parameter df_train: Training dataframe with 'msno' and 'song_id'
    Parameter df_songs: Songs dataframe with 'song_id' and 'genre_ids'

    Returns: the merged members DataFrame with filled ages
    """
    df_members = df_train[['msno', 'song_id']].merge(df_members, on = 'msno')
    df_members = df_members.merge(df_songs[['song_id', 'genre_ids']], on = 'song_id')

    # Means are fractional, so the age column has to be float before filling.
    df_members['bd'] = df_members['bd'].astype('float64')

    # Mean age per group, ignoring the 0 placeholders; broadcast to every row.
    known_age = df_members['bd'].where(df_members['bd'] != 0)
    group_mean = known_age.groupby(
        [df_members['registered_via'], df_members['genre_ids']]).transform('mean')

    # Fill every group (previously only the last group was written back).
    to_fill = (df_members['bd'] == 0) & group_mean.notna()
    df_members.loc[to_fill, 'bd'] = group_mean[to_fill]

    with open("members.pkl", "wb") as file:
        pkl.dump(df_members, file)
    return df_members

In [None]:
#def registrationtoday(df_eda):
    #timestamp = df_eda['Registration'].sort_values(ascending = False).iloc[0]
    #df_eda['Registration_to_today'] = (pd.to_datetime(timestamp) -pd.to_datetime(df_eda['Registration'])).apply(lambda x: x.days)
    #with open('msno_age.pkl',"wb")as file:
        #pkl.dump(df_members[['msno', 'bd']],file)

In [None]:
def active(df_eda):
    """
    Compute the time a user has been active in the music streaming service.

    'Registration_to_today' is the number of days between a user's
    registration and the latest registration date in the data (used as
    "today"). 'Active Timeframe' is the registered timeframe, capped at
    'Registration_to_today' so that memberships expiring in the future do not
    count days that have not happened yet.

    NOTE(review): the original statement was a chained assignment that always
    copied 'Registered Timeframe (days)' unchanged; the cap implemented here
    is the presumed intent - confirm.

    Parameter df_eda: DataFrame with 'Registration' and 'Registered Timeframe (days)'

    Returns: the same DataFrame with the two new columns
    """
    registration = pd.to_datetime(df_eda['Registration'])
    latest_registration = registration.max()
    df_eda['Registration_to_today'] = (latest_registration - registration).dt.days
    df_eda['Active Timeframe'] = np.minimum(df_eda['Registered Timeframe (days)'],
                                            df_eda['Registration_to_today'])
    return df_eda

In [None]:
def timestamp(df_eda):
    """
    Add an estimated timestamp to each play and pickle the result.

    The plays of a user are spread evenly over the user's active timeframe:
    the k-th play (k = 0, 1, ...) of a user gets
    Registration + k * days_between_songs.

    Parameter df_eda: ignored - the data is always loaded from
    'msno_active_timeframe_eda.pkl' (kept for interface compatibility).

    Writes the full DataFrame to 'msno_timestamp.pkl'. Returns nothing.
    """
    # NOTE: pickle files must come from a trusted source.
    with open("msno_active_timeframe_eda.pkl", "rb") as file:
        df_eda = pkl.load(file)

    plays_per_user = df_eda.groupby(['msno'])['msno'].transform('count')
    df_eda['days_between_songs'] = (df_eda['Active Timeframe'] / plays_per_user).round(0)

    # Number of earlier plays of the same user, in row order (replaces the
    # quadratic row-by-row count).
    earlier_plays = df_eda.groupby('msno').cumcount()
    offset = pd.to_timedelta(df_eda['days_between_songs'] * earlier_plays, unit = 'D')
    df_eda['Timestamp'] = pd.to_datetime(df_eda['Registration']) + offset

    with open('msno_timestamp.pkl',"wb") as file:
        pkl.dump(df_eda,file)

In [None]:
#active(df_eda)
#timestamp(df_eda)

In [None]:
def genrecvrt(df):
    """
    Reduce multi-valued 'genre_ids' entries to a single value.

    Entries containing '|' have every '|' and the digits following it removed,
    e.g. '465|958' becomes '465'. Missing values are left untouched.

    Parameter df: DataFrame with a string column 'genre_ids' (modified in place)

    Returns: the same DataFrame
    """
    # na=False keeps missing genres out of the mask instead of raising.
    multi_valued = df['genre_ids'].str.contains(r'\|', na = False)
    df.loc[multi_valued, 'genre_ids'] = df.loc[multi_valued, 'genre_ids'].str.replace(
        r'\|\d*', '', regex = True)
    return df

In [None]:
def genrefix(df):
    """
    Replace missing 'genre_ids' by the string '-1' (in place).

    The column is reassigned because a chained inplace fillna does not write
    back to the DataFrame under pandas Copy-on-Write.
    """
    df['genre_ids'] = df['genre_ids'].fillna('-1')

In [None]:
def artistcvrt(df):
    """
    Remove '|' separators from 'artist_name' entries.

    In entries containing '|', every '|' together with the digits directly
    following it is removed. Missing values are left untouched.
    NOTE(review): for names this joins the artists ('A|B' -> 'AB') rather than
    keeping the first one - confirm that this is intended.

    Parameter df: DataFrame with a string column 'artist_name' (modified in place)

    Returns: the same DataFrame
    """
    # na=False keeps missing artists out of the mask instead of raising.
    has_separator = df['artist_name'].str.contains(r'\|', na = False)
    df.loc[has_separator, 'artist_name'] = df.loc[has_separator, 'artist_name'].str.replace(
        r'\|\d*', '', regex = True)
    return df

In [None]:
def filllyricist(df):
    """Replace missing 'lyricist' values by 'no_lyricist' (in place)."""
    # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
    df['lyricist'] = df['lyricist'].fillna('no_lyricist')

In [None]:
def fillgenre(df):
    """Replace missing 'genre_ids' values by 'no_genreid' (in place)."""
    # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
    df['genre_ids'] = df['genre_ids'].fillna('no_genreid')

In [None]:
def fillcomposer(df):
    """Replace missing 'composer' values by 'no_composer' (in place)."""
    # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
    df['composer'] = df['composer'].fillna('no_composer')

In [None]:
def fillartistname(df):
    """Replace missing 'artist_name' values by 'no_artist' (in place)."""
    # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
    df['artist_name'] = df['artist_name'].fillna('no_artist')

In [None]:
def fillsonglength(df):
    """Replace missing 'song_length' values by the column mean (in place)."""
    # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
    df['song_length'] = df['song_length'].fillna(df['song_length'].mean())

In [None]:
def fillsongyear(df):
    """
    Replace missing 'song_year' values by the truncated column mean (in place).

    Raises ValueError when every year is missing, because the mean is then
    NaN and cannot be converted to an integer.
    """
    mean_year = int(df['song_year'].mean())
    # fillna replaces the old replace(np.NaN, ...): the np.NaN alias no longer
    # exists in NumPy 2, and chained inplace calls are a no-op under Copy-on-Write.
    df['song_year'] = df['song_year'].fillna(mean_year)

In [None]:
def fillsources(df):
    """
    Replace missing values in the three source columns by the string '-1'
    (in place).
    """
    for column in ('source_system_tab', 'source_screen_name', 'source_type'):
        # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
        df[column] = df[column].fillna('-1')

In [None]:
def fillbd(df):
    """Replace missing 'bd' (age) values by -1 (in place)."""
    # Reassign the column: a chained inplace fillna is a no-op under Copy-on-Write.
    df['bd'] = df['bd'].fillna(-1)

In [None]:
def fillnas(df):
    """
    Run the missing-value fillers for lyricist, genre, composer, artist,
    sources and age on df, in that order.

    genrefix runs after fillgenre, so it only sees genres that are already
    filled. 'song_length' and 'song_year' are not handled here.
    """
    fillers = (filllyricist, fillgenre, fillcomposer, fillartistname,
               genrefix, fillsources, fillbd)
    for fill_column in fillers:
        fill_column(df)

In [None]:
def songcategories(df):
    """
    Store 'song_length' as uint32 and 'song_id' as a pandas category.

    The DataFrame is modified in place and returned.
    """
    target_dtypes = (('song_length', np.uint32), ('song_id', 'category'))
    for column, dtype in target_dtypes:
        df[column] = df[column].astype(dtype)
    return df

In [None]:
def languagecategories(df):
    """Store the 'language' column as a pandas category; returns df."""
    as_category = df['language'].astype('category')
    df['language'] = as_category
    return df

In [None]:
def datawrangler(df):
    """
    Apply the data wrangling steps to df and return the result.

    Order: clean artist names, fill missing values, clean genre ids, then
    convert the song and language columns to their storage dtypes.
    """
    df = artistcvrt(df)
    fillnas(df)  # works in place, returns nothing
    for step in (genrecvrt, songcategories, languagecategories):
        df = step(df)
    return df

### Function Call for Loading Data and Data Wrangling

In [None]:
#df_songs, df_songs_extra, df_members, df_train, df_test = loadfiles12()
#print('Loaded all files into DataFrames')
#del df_test

#df_members = members_convrt(df_members)
#print('Convert registration and expiration date to time Series.')

#df_members = bdconvert(df_members, df_train, df_songs)
#print("Converted 0's in age columns")

#create_eda_train_set(df_train, df_members, df_songs, df_songs_extra,1000)
#print('Saved eda_train_set as csv')

### Preparations for Modeling and Scoring

In [None]:
def splitdata(df,test_size = 0.2, seed = 89, plot = False, identifier = 'msno'):
    """
    Split data into train and validation set according to member id.

    All rows of one member end up on the same side of the split. The
    'target' column becomes y; 'target' and the identifier column are removed
    from X.

    Parameter df: Dataframe to be split
    Precondtion: df is a Pandas DataFrame with a 'target' column

    Parameter test_size: Fraction of unique members put into the test set
    Precondition: 0 ≤ test_size ≤ 1

    Parameter seed: random number for random state generator
    Precondition: type(seed) == int

    Parameter plot: if True, draw a seaborn pairplot of the training rows

    Parameter identifier: Dataset feature according to which the dataset will be split
    Precondition: identifier is a valid column of df

    Returns: X_tr, X_te, y_tr, y_te
    """
    rs = np.random.RandomState(seed)
    members_unique = df[identifier].unique()
    n_test_members = int(members_unique.shape[0]*test_size)
    test_members = rs.choice(members_unique, size = n_test_members, replace = False)

    in_test = df[identifier].isin(test_members)
    df_tr = df[~in_test]
    df_te = df[in_test]

    if plot:
        sns.pairplot(df_tr)

    y_tr, y_te = df_tr['target'], df_te['target']
    X_tr = df_tr.drop(columns = ['target', identifier])
    X_te = df_te.drop(columns = ['target', identifier])

    return X_tr, X_te, y_tr, y_te

In [None]:
def getdummyset(df):
    """
    One-hot encode df (first level of each feature dropped) and split it by
    member into train and test sets.

    'msno' is excluded from the encoding and re-attached so that splitdata
    can group by it.

    Returns: X_train, X_test, y_train, y_test
    """
    member_ids = df['msno']
    encoded = pd.get_dummies(df.drop(columns = 'msno'), drop_first = True)
    encoded['msno'] = member_ids
    return splitdata(encoded, test_size = 0.2)

### Scoring Metrics

In [None]:
def scoring(X_tr,X_te,y_tr,y_te ,model, model_name):
    """
    Fit a model, print its precision on the test set and plot the confusion
    matrix as a heatmap.

    Parameter model: estimator with fit and predict
    Parameter model_name: label used in the printed message
    """
    model.fit(X_tr,y_tr)
    # Predict once and reuse the labels for both the score and the matrix.
    predictions = model.predict(X_te)
    score = precision(y_te, predictions)
    print(model_name+' precision score:'+str(score))
    cm = confusion_matrix(y_te, predictions)
    sns.heatmap(cm,
                cmap=plt.cm.Blues,
                annot=True,
                square=True,
                xticklabels=[0,1],
                yticklabels=[0,1],
                fmt='g')

In [None]:
#Baseline
def score_baseline(df,columns, model,model_name, plot = False):
    """
    Fit a model on a member-wise split of df[columns], print its AUC score
    and plot the confusion matrix.

    The AUC is computed from continuous scores (predict_proba, or
    decision_function as a fallback). Hard 0/1 labels would reduce the ROC
    curve to a single operating point.

    Parameter columns: columns of df to use; must include 'msno' and 'target'
    Parameter model: estimator with fit and predict
    Parameter model_name: label used in the printed message
    Parameter plot: passed on to splitdata
    """
    X_tr, X_te, y_tr, y_te = splitdata(df[columns], test_size = 0.2, plot = plot)
    model.fit(X_tr,y_tr)
    predictions = model.predict(X_te)

    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(X_te)[:,1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(X_te)
    else:
        # No continuous output available: fall back to the labels.
        scores = predictions

    score = roc_auc_score(y_te, scores)
    print(model_name+' AUC score:'+str(score))
    cm = confusion_matrix(y_te, predictions)
    sns.heatmap(cm,
                cmap=plt.cm.Blues,
                annot=True,
                square=True,
                xticklabels=[0,1],
                yticklabels=[0,1],
                fmt='g')

In [None]:
def scoremodels(df, columns, models, model_name, plot = False):
    """
    Score each model in models with score_baseline; when plot is True, also
    draw a pairplot of df[columns] after every model.

    The same model_name is used for every model in the list.
    """
    for estimator in models:
        score_baseline(df,columns,estimator,model_name)
        if not plot:
            continue
        plot_features(df[columns])

In [None]:
def baselineanalysis():
    """
    Load the EDA dataset from disk and score a logistic regression and a
    5-nearest-neighbour classifier on the baseline columns, with plots.
    """
    df_eda = pd.read_csv('/home/ubuntu/Project_3/df_eda')
    df_eda.drop(columns = 'Unnamed: 0', inplace = True)
    columns2 = ['msno','city', 'bd', 'registered_via','target','song_length']
    candidates = (
        ('Logistic Regression', LogisticRegression(C = 1)),
        ('KNN', KNeighborsClassifier(n_neighbors = 5)),
    )
    for label, estimator in candidates:
        scoremodels(df_eda,columns2,[estimator], label, plot = True)

In [None]:
#Evaluation metrics
#add probabilities
def accuracy(actuals, preds):
    """Fraction of predictions that equal the actual labels."""
    matches = actuals == preds
    return np.mean(matches)

def precision(actuals,preds):
    """Share of predicted positives (1) that are actual positives."""
    predicted_positive = preds == 1
    true_positives = np.sum(predicted_positive & (actuals == 1))
    false_positives = np.sum(predicted_positive & (actuals == 0))
    return true_positives / (true_positives + false_positives)

def recall(actuals, preds):
    """Share of actual positives (1) that were predicted positive."""
    actual_positive = actuals == 1
    true_positives = np.sum(actual_positive & (preds == 1))
    false_negatives = np.sum(actual_positive & (preds == 0))
    return true_positives / (true_positives + false_negatives)

def F1(actuals, preds):
    """Harmonic mean of precision and recall for the positive class (1)."""
    true_positives = np.sum((actuals == 1) & (preds == 1))
    false_positives = np.sum((actuals == 0) & (preds == 1))
    false_negatives = np.sum((actuals == 1) & (preds == 0))
    p = true_positives / (true_positives + false_positives)
    r = true_positives / (true_positives + false_negatives)
    return 2*p*r / (p + r)

def f1_beta(actuals, preds, beta):
    """
    F-beta score of the predictions, computed by sklearn's fbeta_score.

    Parameter beta: weight of recall relative to precision
    """
    # beta is keyword-only in current scikit-learn; passing it positionally raises.
    return fbeta_score(actuals, preds, beta=beta)

In [None]:
def scoremodeldummy(df, columns, categorical):
    """
    Train LightGBM on the one-hot encoded, member-wise split of df and print
    the train and test AUC.

    Parameter columns: passed through to lightgbm
    Parameter categorical: categorical feature names passed to lightgbm

    Returns: test predictions, train predictions, y_test, X_test, the trained
    booster and the feature-importance DataFrame
    """
    X_train, X_test, y_train, y_test = getdummyset(df)
    y_lgbm, y_lgbm_train, gbm, df_importance = lightgbm(
        X_train, X_test, y_train, y_test, columns, categorical)

    train_auc = roc_auc_score(y_train, y_lgbm_train)
    test_auc = roc_auc_score(y_test, y_lgbm)
    print('Light GBM Regression Train AUC Score: '+str(train_auc))
    print('Light GBM Regression AUC Score: '+str(test_auc))
    return y_lgbm,y_lgbm_train, y_test,X_test, gbm, df_importance

In [None]:
def getpredsdummy(model, X_train, X_test, y_train):
    """
    Fit model on the training data and return boolean test predictions:
    True where the predicted probability of class 1 exceeds 0.5.
    """
    model.fit(X_train,y_train)
    positive_probability = model.predict_proba(X_test)[:,1]
    return positive_probability > 0.5

### Fit Models

In [None]:
#Fit KNN classifier to training data and find the optimal number of neighbors
def fitknn(df):
    """
    Try k = 1..49 for a k-nearest-neighbours classifier and print the k with
    the best test accuracy.

    Returns: dict mapping str(k) to the test accuracy for that k
    """
    X_train, X_test, y_train, y_test = getdummyset(df)
    d_knn = defaultdict(int)
    best_k, best_score = 0, 0
    for n_neighbors in range(1,50):
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        classifier.fit(X_train, y_train)
        current = accuracy_score(y_test, classifier.predict(X_test))
        d_knn[str(n_neighbors)] = current
        # Strict comparison: on ties the smallest k wins.
        if current > best_score:
            best_k, best_score = n_neighbors, current
    print('k value: '+str(best_k)+'\n'
          'accuracy: '+str(best_score))
    return d_knn

In [None]:
#Fit logistic regression to training data and find the optimal 'C' value
def fitlog(df):
    """
    Try C = 1, 101, ..., 1001 for a logistic regression and print the C with
    the best test accuracy.

    Returns: dict mapping str(C) to the test accuracy for that C
    """
    d_logistic = defaultdict(int)
    X_train, X_test, y_train, y_test = getdummyset(df)
    best_c, best_score = 1, 0
    for c_value in range(1,1002,100):
        classifier = LogisticRegression(C = c_value)
        classifier.fit(X_train, y_train)
        current = accuracy_score(y_test, classifier.predict(X_test))
        d_logistic[str(c_value)] = current
        # Strict comparison: on ties the smallest C wins.
        if current > best_score:
            best_c, best_score = c_value, current
    print('C value: '+str(best_c)+'\n'
          'accuracy: '+str(best_score))
    return d_logistic

In [None]:
#Fit Random Forest Classifier to training data
def fitrf(df):
    """
    Fit a default RandomForestClassifier on the one-hot encoded split of df
    and print its test accuracy.

    Returns: the fitted classifier
    """
    X_train, X_test, y_train, y_test = getdummyset(df)
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, forest.predict(X_test))
    print('Random Forest Classifier accuracy: '+str(test_accuracy))
    return forest

In [None]:
def xgboost(X_train, X_test, y_train, y_test):
    """
    Fit an XGBoost classifier with early stopping on the test set, print the
    test accuracy and return the predicted test labels.

    Training stops when the classification error on the last evaluation set
    (the test set) has not improved for 40 rounds; predictions use the best
    number of trees found.
    NOTE(review): early stopping on the test set makes the printed accuracy
    optimistic - consider a separate validation set.
    """
    gbm = xgb.XGBClassifier(n_estimators=30000,
                            max_depth=5,
                            objective='binary:logistic',
                            learning_rate=.05,
                            subsample=.8,
                            min_child_weight=2,
                            colsample_bytree=.8)

    eval_set=[(X_train,y_train),(X_test,y_test)]
    gbm.fit(
            X_train, y_train,
            eval_set=eval_set,
            eval_metric='error', #classification error (could also use AUC, e.g.)
            early_stopping_rounds=40,
            verbose=False
           )

    # Predict once and reuse the labels for the printout and the return value.
    predictions = gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit)
    print(accuracy_score(y_test, predictions))
    return predictions

In [None]:
def lightgbm(X_train, X_test, y_train, y_test, columns, categorical):
    """
    Train a LightGBM binary classifier (AUC metric) with early stopping on the
    test set.

    Parameter columns: not used by this function
    Parameter categorical: feature names LightGBM should treat as categorical

    Returns: test predictions, train predictions, the trained booster and a
    DataFrame with the gain importance of every feature
    """
    dataset_options = dict(feature_name='auto', categorical_feature=categorical)
    train_set = lgb.Dataset(X_train, y_train, **dataset_options)
    valid_set = lgb.Dataset(X_test, y_test, reference=train_set, **dataset_options)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 100,
        'max_depth': 8,
        'learning_rate': 0.03,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0,
    }

    gbm = lgb.train(params,
                    train_set,
                    num_boost_round=10000,
                    valid_sets=valid_set,early_stopping_rounds=50)

    # Both prediction sets use the best iteration found by early stopping.
    best_round = gbm.best_iteration
    test_predictions = gbm.predict(X_test, num_iteration=best_round)
    train_predictions = gbm.predict(X_train, num_iteration=best_round)

    importance = pd.DataFrame({'feature' : gbm.feature_name(),
                               'importance' : gbm.feature_importance(importance_type='gain')})
    return test_predictions, train_predictions, gbm, importance

In [None]:
def getpreds(df, model):
    """
    Fit model on a member-wise split of df and predict the test set.

    Returns: the class-1 probabilities, the boolean predictions
    (probability > 0.5025) and the true test labels.
    NOTE(review): the 0.5025 threshold differs from the 0.5 used in
    getpredsdummy - confirm that this is intended.
    """
    X_train, X_test, y_train, y_test = splitdata(df, test_size = 0.2)
    model.fit(X_train,y_train)
    probabilities = model.predict_proba(X_test)[:,1]
    labels = probabilities > 0.5025
    return probabilities, labels, y_test

### Visual Analysis Tools

In [None]:
def plot_features(df, sample_size=1000):
    """
    Draw a seaborn pairplot of a random sample of df, coloured by 'target'.

    Parameter df: DataFrame containing 'msno' (dropped before plotting) and 'target'
    Parameter sample_size: maximum number of rows to plot; DataFrames with
    fewer rows are plotted completely
    """
    # sample() raises when asked for more rows than exist, so cap the size.
    n_rows = min(sample_size, len(df))
    sample = df.drop(['msno'],axis=1).sample(n_rows, random_state=44)
    sns.pairplot(sample,hue='target', plot_kws=dict(alpha=.3, edgecolor='none'))

In [None]:
def confusionmatrix(y_lgr,y_knn,y_rf,y_xgboost, y_actual):
    """
    Plot the confusion matrices of four sets of predictions in a 2x2 grid.

    The fourth panel shows y_xgboost under the title 'LightGBM'.
    """
    titles = ['Logistic Regression', 'KNN','Random Forest', 'LightGBM']
    predictions = [y_lgr, y_knn, y_rf, y_xgboost]
    plt.figure(figsize=(12,12))
    for position, (title, predicted) in enumerate(zip(titles, predictions), start=1):
        plt.subplot(2, 2, position)
        matrix = confusion_matrix(y_actual,predicted)
        sns.heatmap(matrix,
                    cmap=plt.cm.Blues,
                    annot=True,
                    square=True,
                    xticklabels=[0,1],
                    yticklabels=[0,1],
                    fmt='g')
        plt.title(title, size=15)

In [None]:
def confusionmatrix1(y_lgr, y_lgbm, y_actual):
    """
    Plot the confusion matrices of the logistic regression and LightGBM
    predictions in the first two panels of a 2x2 grid.
    """
    titles = ['Logistic Regression','LightGBM']
    predictions = [y_lgr, y_lgbm]
    plt.figure(figsize=(12,12))
    for position, (title, predicted) in enumerate(zip(titles, predictions), start=1):
        plt.subplot(2, 2, position)
        matrix = confusion_matrix(y_actual,predicted)
        sns.heatmap(matrix,
                    cmap=plt.cm.Blues,
                    annot=True,
                    square=True,
                    xticklabels=[0,1],
                    yticklabels=[0,1],
                    fmt='g')
        plt.title(title, size=15)

In [None]:
def plot_ROC_curve(X_test, y_test, d_model):
    """
    Plot the ROC curves of several fitted models in one figure and print
    their AUC scores.

    Parameter d_model: dict mapping a model name (str) to a fitted estimator
    with predict_proba
    """
    fig, ax = plt.subplots(figsize=(10,3))
    # Iterate the parameter itself (the body used to read an undefined name).
    for name, fitted_model in d_model.items():
        scores = fitted_model.predict_proba(X_test)[:,1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        score_auc = roc_auc_score(y_test, scores)
        ax.plot(fpr, tpr, label = name)
        print(name+' AUC score: '+str(score_auc))
    # One legend for the finished figure instead of one per model.
    fig.legend()

### Create EDA Analysis Dataset

In [None]:
def create_eda_train_set(df_train, df_members, df_songs, df_songs_extra,entries):
    """
    Build a reduced training set for EDA and save it as csv file 'df_eda_new'.

    Only the training rows of the members found in the first `entries` rows
    of df_members are kept; they are joined with the member, song and extra
    song information.
    """
    df_short = df_members.iloc[:entries,:]
    selected = df_train['msno'].isin(df_short.msno.unique())
    merged = (df_train[selected]
              .merge(df_short, on = 'msno', how = 'left')
              .merge(df_songs, on = 'song_id')
              .merge(df_songs_extra, on = 'song_id'))
    merged.to_csv(path_or_buf = 'df_eda_new')
    

In [None]:
def create_eda_train_set1(df_train,df_songs,df_songs_extra,entries):
    """
    Build a reduced training set for EDA from the pickled members and save it
    as csv file 'df_eda_new'.

    The members are loaded from 'members.pkl'. Only the training rows of the
    members found in its first `entries` rows are kept; they are joined with
    the member, song and extra song information.
    """
    # NOTE: pickle files must come from a trusted source.
    with open('members.pkl', 'rb') as file:
        df_members = pkl.load(file)
    df_short = df_members.iloc[:entries,:]
    member_ids = df_short.msno.unique()
    print('unique members')
    df_train = df_train[df_train['msno'].isin(member_ids)]
    df_train = df_train.merge(df_members, on = 'msno', how = 'left')
    print('merged train members')
    del df_members
    df_train = df_train.merge(df_songs, on = 'song_id')
    print('merged train songs')
    # Drop the local reference to the songs frame (the old name was undefined).
    del df_songs
    df_train = df_train.merge(df_songs_extra, on = 'song_id')
    df_train.to_csv(path_or_buf = 'df_eda_new')
    

### Baseline Analysis

In [None]:
# Load the EDA dataset from csv.
df_eda = pd.read_csv('/home/ubuntu/Project_3/df_eda')

In [None]:
# Downcast the numeric columns to smaller dtypes to save memory.
df_eda = reduce_mem_usage(df_eda)

In [None]:
# Columns of the baseline model; 'msno' is used for splitting and 'target' is the label.
baseline = ['msno','city', 'bd', 'registered_via','target', 'song_length']

In [None]:
# Clean artist/genre strings, fill missing values and set the column dtypes.
df_eda = datawrangler(df_eda)

In [None]:
# Encode member id, song id and language as integer codes. The encoders are
# kept (le, le1, le2) so that the codes can be mapped back later.
le = preprocessing.LabelEncoder()
df_eda['msno'] = le.fit(df_eda['msno']).transform(df_eda['msno'])

le1 = preprocessing.LabelEncoder()
df_eda['song_id'] = le1.fit(df_eda['song_id']).transform(df_eda['song_id'])

le2 = preprocessing.LabelEncoder()
df_eda['language'] = le2.fit(df_eda['language']).transform(df_eda['language'])

In [None]:
# Downcast again: the label-encoded columns are numeric now.
df_eda = reduce_mem_usage(df_eda)

In [None]:
#Computing the AUC score and confusion matrix for Logistic Regression and KNN
baselineanalysis()

In [None]:
#Check for recall, precision, accuracy and F-beta (beta = 0.7) of the logistic regression baseline
y_preds_proba, preds_lr, y_test = getpreds(df_eda[baseline], LogisticRegression())

recall_base = recall(y_test, preds_lr)
precision_base = precision(y_test, preds_lr)
accuracy_base = accuracy(y_test, preds_lr)
f1beta = f1_beta(y_test, preds_lr, beta = 0.7)

for label, value in (('Recall', recall_base),
                     ('Precision', precision_base),
                     ('Accuracy', accuracy_base),
                     ('F1 Beta', f1beta)):
    print(label + ' Baseline score: '+ str(value))

In [None]:
#Computing AUC Score
# Every call below splits with the same default seed, so y_test and X_test
# describe the same rows for all models.
X_train, X_test, y_train, y_test = splitdata(df_eda[baseline])
y_knn_proba, y_knn,y_test = getpreds(df_eda[baseline],KNeighborsClassifier(n_neighbors=5))
y_lm_proba, y_lm,y_test = getpreds(df_eda[baseline], LogisticRegression())
y_rf_proba, y_rf,y_test = getpreds(df_eda[baseline], RandomForestClassifier())
y_lgbm, y_lgbm_train, gbm, df_importance= lightgbm(X_train, X_test, y_train, y_test, baseline, categorical = [])
# AUC needs the predicted probabilities; the thresholded labels (y_knn, y_lm,
# y_rf) are kept for the confusion matrices only.
print('KNN AUC: '+str(roc_auc_score(y_test, y_knn_proba)))
print('Logistic AUC: '+str(roc_auc_score(y_test, y_lm_proba)))
print('Random Forest AUC: '+str(roc_auc_score(y_test, y_rf_proba)))
print('Light GBM AUC: '+str(roc_auc_score(y_test, y_lgbm)))
print('Light GBM AUC Train Set: '+str(roc_auc_score(y_train, y_lgbm_train)))

In [None]:
#Confusion matrices for the baseline models
# LightGBM returns probabilities; threshold them at 0.5 to get labels.
preds_lgbm = y_lgbm > 0.5
confusionmatrix(y_lm, y_knn,y_rf,preds_lgbm,y_test)

### Feature Engineering Column Creation

In [None]:
def activetimeframe(df_eda):
    """
    Compute the time a user has been active in the music streaming service.

    'Registration_to_today' is the number of days between a user's
    registration and the latest registration date in the data (used as
    "today"). 'Active Timeframe' is the registered timeframe, capped at
    'Registration_to_today' so that memberships expiring in the future do not
    count days that have not happened yet.

    NOTE(review): the original statement was a chained assignment that always
    copied 'Registered Timeframe (days)' unchanged; the cap implemented here
    is the presumed intent - confirm.

    Parameter df_eda: DataFrame with 'Registration' and 'Registered Timeframe (days)'

    Returns: the same DataFrame with the two new columns
    """
    registration = pd.to_datetime(df_eda['Registration'])
    latest_registration = registration.max()
    df_eda['Registration_to_today'] = (latest_registration - registration).dt.days
    df_eda['Active Timeframe'] = np.minimum(df_eda['Registered Timeframe (days)'],
                                            df_eda['Registration_to_today'])
    return df_eda

In [None]:
def timestamp():
    """
    Add an estimated timestamp to each play and pickle the result.

    The data is loaded from 'msno_active_timeframe_eda.pkl'. The plays of a
    user are spread evenly over the user's active timeframe: the k-th play
    (k = 0, 1, ...) of a user gets Registration + k * days_between_songs.

    Writes the columns 'msno', 'song_id' and 'Timestamp' to
    'msno_timestamp.pkl'. Returns nothing.
    """
    # NOTE: pickle files must come from a trusted source.
    with open("msno_active_timeframe_eda.pkl", "rb") as file:
        df_members = pkl.load(file)

    plays_per_user = df_members.groupby(['msno'])['msno'].transform('count')
    df_members['days_between_songs'] = (df_members['Active Timeframe'] / plays_per_user).round(0)
    print('Days between songs have been added')

    # Number of earlier plays of the same user, in row order (replaces the
    # quadratic row-by-row count).
    earlier_plays = df_members.groupby('msno').cumcount()
    offset = pd.to_timedelta(df_members['days_between_songs'] * earlier_plays, unit = 'D')
    df_members['Timestamp'] = pd.to_datetime(df_members['Registration']) + offset

    with open('msno_timestamp.pkl',"wb") as file:
        pkl.dump(df_members[['msno', 'song_id', 'Timestamp']],file)

In [None]:
def songlength_mean_msno(df):
    """
    Add the mean song length of each user as column 'mean_song_length'.

    The result is a new DataFrame; because of the merge, the original
    'song_length' column is called 'song_length_x' in it.
    """
    per_user_mean = df.groupby(['msno'])['song_length'].mean().reset_index()
    merged = df.merge(per_user_mean, on = 'msno', how = 'left')
    return merged.rename(columns={"song_length_y": "mean_song_length"})

In [None]:
def artistcount(df):
    """
    Add column 'artist_count': the number of rows sharing the row's
    (msno, artist_name) pair. Modifies df in place and returns it.
    """
    plays_by_user_and_artist = df.groupby(['msno','artist_name'])['artist_name']
    df['count'] = plays_by_user_and_artist.transform('count')
    df.rename(columns={"count": "artist_count"}, inplace = True)
    return df

In [None]:
def songpopularity (df):
    """
    Counts the total number of times a song has been played.

    Adds column 'Total_count_songs' with the number of rows that share the
    row's 'song_id'. Modifies df in place and returns it.
    """
    # Map with the counts Series directly (Series.iteritems no longer exists in pandas 2).
    df['Total_count_songs'] = df['song_id'].map(df['song_id'].value_counts())
    return df

In [None]:
def artistpopularity (df):
    """
    Counts the total number of times an artist has been played.

    Adds column 'Total_count_artist' with the number of rows that share the
    row's 'artist_name'. Modifies df in place and returns it.
    """
    # Map with the counts Series directly (Series.iteritems no longer exists in pandas 2).
    df['Total_count_artist'] = df['artist_name'].map(df['artist_name'].value_counts())
    return df

In [None]:
def composerpopularity(df):
    """
    Counts the total number of times a composer has been played.

    Adds column 'Total_count_composer' with the number of rows that share the
    row's 'composer'. Modifies df in place and returns it.
    """
    # Map with the counts Series directly (Series.iteritems no longer exists in pandas 2).
    df['Total_count_composer'] = df['composer'].map(df['composer'].value_counts())
    return df

In [None]:
def lyricistpopularity(df):
    """
    Counts the total number of times a lyricist has been played.

    Adds column 'Total_count_lyricist' with the number of rows that share the
    row's 'lyricist'. Modifies df in place and returns it.
    """
    # Map with the counts Series directly (Series.iteritems no longer exists in pandas 2).
    df['Total_count_lyricist'] = df['lyricist'].map(df['lyricist'].value_counts())
    return df

In [None]:
def year_isrc(isrc, pivot = 17):
    """
    Extract the four-digit year from an ISRC code.

    Characters 5-6 of the code hold the two-digit year. Values greater than
    pivot are read as 19xx, all others as 20xx.

    Parameter isrc: ISRC code; non-string values (e.g. NaN) give NaN
    Parameter pivot: last two-digit year that still belongs to the 2000s

    Returns: the year as int, or NaN if it cannot be read
    """
    if not isinstance(isrc, str):
        return np.nan
    two_digits = isrc[5:7]
    # Codes that are too short or malformed give NaN instead of a crash.
    if len(two_digits) != 2 or not two_digits.isdigit():
        return np.nan
    short_year = int(two_digits)
    century = 1900 if short_year > pivot else 2000
    return century + short_year

In [None]:
def convertisrc(df):
    """
    Replace the 'isrc' column by 'song_year'.

    The year is read from each ISRC code, missing years are filled by
    fillsongyear and the 'isrc' column is dropped. Modifies df in place and
    returns it.
    """
    years = df['isrc'].apply(year_isrc)
    df['song_year'] = years
    fillsongyear(df)
    df.drop(columns = 'isrc', inplace = True)
    return df

In [None]:
def language_english(df):
    """Checks whether a song is in English or not

    NOTE: not implemented - the function has no body and returns None.
    """

In [None]:
def is_featured(artist):
    """Return 1 if the text form of artist contains 'feat', else 0."""
    return 1 if 'feat' in str(artist) else 0

In [None]:
def features(df):
    """
    Add int8 column 'is_featured': 1 where the text form of 'artist_name'
    contains 'feat', else 0. Modifies df in place and returns it.
    """
    flags = df['artist_name'].apply(lambda artist: 1 if 'feat' in str(artist) else 0)
    df['is_featured'] = flags.astype(np.int8)
    return df

In [None]:
def shortsongapply(df):
    """
    Flag songs that are shorter than the user's mean song length.

    Adds column 'short_song': 1 where 'song_length_x' is below
    'mean_song_length', else 0. Modifies df in place and returns it.
    """
    is_shorter = df['song_length_x'] < df['mean_song_length']
    df['short_song'] = np.where(is_shorter, 1, 0)
    return df

In [None]:
def sourceprobabilities(df):
    """
    Computing the probability of a user using this specific source

    For each of 'source_system_tab', 'source_screen_name' and 'source_type'
    three columns are added:
      <source>_msno_count          rows of the user with this source value
      msno_<source>_probability    that count divided by the user's number
                                   of rows with a non-missing value of the
                                   same source
      total_<source>_probability   share of this source value among all
                                   non-missing values of the column

    The intermediate per-user total is not kept as a column.
    Modifies df in place and returns it.
    """
    for source in ['source_system_tab', 'source_screen_name', 'source_type']:
        user_source_count = df.groupby(by = ['msno',source])[source].transform('count')
        # The total is counted on the current source column, not always on 'source_system_tab'.
        user_total = df.groupby(by = ['msno'])[source].transform('count')
        df[source+'_msno_count'] = user_source_count
        df['msno_'+source+'_probability'] = user_source_count / user_total

        # Overall probability of the source value across all users.
        overall_share = df[source].value_counts(normalize = True)
        df['total_'+source+'_probability'] = df[source].map(overall_share)
    return df

In [None]:
def featureaddition(df):
    """Run every feature-engineering step on ``df`` and return the result.

    The steps are applied one after another, each receiving the frame
    returned by the previous one.
    """
    steps = (
        activetimeframe,
        songlength_mean_msno,
        artistcount,
        songpopularity,
        artistpopularity,
        composerpopularity,
        lyricistpopularity,
        convertisrc,
        features,
        shortsongapply,
        sourceprobabilities,
    )
    for step in steps:
        df = step(df)
    return df
    

### Feature Engineering Scoring

In [None]:
# Run the full feature-engineering pipeline on the EDA frame.
# NOTE(review): the pipeline steps visible in this file add columns to the
# frame they receive and return it, so df_features is probably the same
# object as df_eda -- confirm.
df_features = featureaddition(df_eda)

In [None]:
# Show the column names available after feature engineering.
df_features.columns

In [None]:
#Adding Registered Timeframe for user
# NOTE(review): this cell differs from the scoring cells that follow it --
# it passes df_eda instead of df_features, selects 'song_length' where the
# others select 'song_length_x', and the added column is 'Active Timeframe'
# rather than a registered-timeframe column. Confirm this is intended.
columns = ['msno','city', 'bd', 'song_length','registered_via', 'Active Timeframe', 'target']
scoremodels(df_eda, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Adding average song length per user ('mean_song_length') to the
# column subset passed to scoremodels (defined elsewhere in this file).
columns = ['msno','city', 'bd','song_length_x', 'registered_via','mean_song_length','target']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Adding number of times user has already listened to artist
# ('artist_count') on top of the previous column subset.
columns = ['msno','city', 'bd', 'song_length_x','registered_via','artist_count','target', 'mean_song_length']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Previous column subset plus 'Registered Timeframe (days)'.
columns = ['msno','city', 'bd', 'song_length_x','registered_via','artist_count','target', 'mean_song_length', 'Registered Timeframe (days)']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Swap 'Registered Timeframe (days)' out again and add 'Total_count_songs'.
columns = ['msno','city', 'bd', 'song_length_x','registered_via','artist_count','target', 'mean_song_length', 'Total_count_songs']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Previous column subset plus 'Total_count_artist'.
columns = ['msno','city', 'bd', 'song_length_x','registered_via','artist_count','target', 'mean_song_length', 'Total_count_songs', 'Total_count_artist']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Previous column subset plus 'song_year'.
columns = ['msno','city', 'bd', 'song_length_x','registered_via','artist_count','target', 'mean_song_length', 'Total_count_songs', 'Total_count_artist','song_year']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression')

In [None]:
# Same column subset as the previous cell, this time calling scoremodels
# with plot = True.
columns = ['msno','city', 'bd', 'song_length_x','registered_via','artist_count','target', 'mean_song_length', 'Total_count_songs', 'Total_count_artist','song_year']
scoremodels(df_features, columns, [LogisticRegression()], 'Logistic Regression', plot = True)

In [None]:
# Pass the engineered frame through reduce_mem_usage (defined near the top
# of this file), then pickle it to 'feature_engineered.pkl' without the
# 'Unnamed: 0' column. drop() returns a copy, so df_features itself keeps
# that column.
df_features = reduce_mem_usage(df_features)
with open('feature_engineered.pkl',"wb")as file:
        pkl.dump(df_features.drop(columns = 'Unnamed: 0'),file) 

In [None]:
# Remove the name binding; presumably done to free memory before the frame
# is reloaded from the pickle -- confirm.
del df_features

In [None]:
#Takes too long
#def dummyartist(df, column):
    #artists = df[column].unique()
    #length = len(df)
    #for artist in artists:
        #df[artist] = np.zeros(length)
        #df[df.artist_name == artist].replace(to_replace = 0, value = 1, inplace = True)
    #return df

### Scoring Models

In [None]:
# Reload the feature-engineered frame from its pickle. The file is opened in
# a with-block so the handle is closed as soon as loading finishes.
with open('feature_engineered.pkl', 'rb') as file:
    df_features = pkl.load(file)

In [None]:
# NOTE(review): the frame was already passed through reduce_mem_usage before
# it was pickled, so this second pass is presumably redundant apart from the
# printed memory report -- confirm.
df_features = reduce_mem_usage(df_features)

In [None]:
# Show the column names of the reloaded frame.
df_features.columns

In [None]:
# Column subset intended for the (currently commented-out) scoremodeldummy
# call below: identifiers, target, member and song attributes, and the
# engineered count / probability features.
columns = ['msno', 'song_id', 'target','city', 'gender','registered_via','bd',
       'Registered Timeframe (days)', 'song_length_x', 'genre_ids', 'language','artist_name',
       'Registration_to_today', 'Active Timeframe', 'mean_song_length',
       'artist_count', 'Total_count_songs', 'Total_count_artist',
       'Total_count_composer', 'Total_count_lyricist', 'song_year',
       'is_featured', 'short_song', 'source_system_tab_msno_count',
       'source_system_tab_msno_count_total',
       'msno_source_system_tab_probability', 'total_source_system_tab_probability', 'source_screen_name_msno_count',
       'source_screen_name_msno_count_total',
       'msno_source_screen_name_probability',
       'total_source_screen_name_probability', 'source_type_msno_count',
       'source_type_msno_count_total', 'msno_source_type_probability',
       'total_source_type_probability']
#y_lgbm, y_lgbm_train, X_test, y_test, gbm = scoremodeldummy(df_features[columns], columns, categorical = ['song_id', 'language'])

In [None]:
#plot_ROC_curve(X_test, y_test, [gbm])

In [None]:
#confusionmatrix1(preds2,y_lgbm,y_test)

In [None]:
#df_songs, df_songs_extra, df_members, df_train, df_test = loadfiles12()
#print('Loaded all files into DataFrames')

#df_members, df_member_dt = members_convrt(df_members)
#print('Convert registration and expiration date to time Series.')
#df_members = bdconvert(df_members, df_train, df_songs)
#print("Converted 0's in age columns")

### Modeling Train Set

In [None]:
# Load the pickled training frame. The file is opened in a with-block so the
# handle is closed as soon as loading finishes.
with open('train_complete', 'rb') as file:
    df_train = pkl.load(file)

In [None]:
# Show the column names of the loaded training frame.
df_train.columns

In [None]:
# Run datawrangler (defined elsewhere in this file) on the training frame,
# then the feature-engineering pipeline; the print marks the point between
# the two stages.
df_train = datawrangler(df_train)
print('Finished datawrangling')
df_train= featureaddition(df_train)

In [None]:
# Checkpoint the engineered training frame. Writing inside a with-block
# guarantees the file is flushed and closed once the dump completes.
with open('train_engineered', 'wb') as file:
    pkl.dump(df_train, file)

In [None]:
# Reload the engineered training frame from its checkpoint; the with-block
# closes the file handle after loading.
with open('train_engineered', 'rb') as file:
    df_train = pkl.load(file)

In [None]:
# Normalise dtypes and missing values of the categorical columns.
df_train['song_id'] = df_train['song_id'].astype('object')
df_train['gender'] = df_train['gender'].fillna('-1')
df_train['language'] = df_train['language'].astype('object')
# fillna returns a new Series, so the result has to be assigned back for the
# missing languages to actually receive the '-1' placeholder.
# NOTE(review): '-1' is a string; if the existing language values are
# numeric the column now holds mixed types -- confirm the encoding step that
# follows copes with that.
df_train['language'] = df_train['language'].fillna('-1')
df_train['name'] = df_train['name'].astype('str')

In [None]:
# labelencoding is defined elsewhere in this file; presumably it replaces
# the string/object columns with integer codes -- confirm.
df_train = labelencoding(df_train)

In [None]:
# Pass the encoded training frame through reduce_mem_usage (defined near the
# top of this file), which prints a memory report as it runs.
df_train = reduce_mem_usage(df_train)

In [None]:
# Checkpoint the label-encoded training frame. Writing inside a with-block
# guarantees the file is flushed and closed once the dump completes.
with open('df_train_label', 'wb') as file:
    pkl.dump(df_train, file)

In [None]:
# Reload the label-encoded training frame from its checkpoint; the
# with-block closes the file handle after loading.
with open('df_train_label', 'rb') as file:
    df_train = pkl.load(file)

In [None]:
# Final model run: choose the columns, split the data, train LightGBM.
# `columns` is the full set handed to splitdata (it still contains
# 'target'); `categorical` names the subset passed on as categorical
# features.
# Note: `lightgbm` here is a function defined elsewhere in this file (the
# LightGBM package itself is imported as `lgb`).
columns = ['msno', 'song_id', 'target', 'city', 'bd', 'gender', 'registered_via',
       'Registration', 'Expiration Date',
       'song_length_x', 'genre_ids', 'artist_name', 'composer', 'lyricist',
       'language', 'name', 'Registration_to_today', 'Active Timeframe',
       'mean_song_length', 'artist_count', 'Total_count_songs',
       'Total_count_artist', 'Total_count_composer', 'Total_count_lyricist',
       'song_year', 'is_featured', 'short_song',
       'source_system_tab_msno_count', 'source_system_tab_msno_count_total',
       'msno_source_system_tab_probability',
       'total_source_system_tab_probability', 'source_screen_name_msno_count',
       'source_screen_name_msno_count_total',
       'msno_source_screen_name_probability',
       'total_source_screen_name_probability', 'source_type_msno_count',
       'source_type_msno_count_total', 'msno_source_type_probability',
       'total_source_type_probability']#'language', 'name','composer', 'gender'
categorical = ['song_id','language','Registration', 'Expiration Date',
               'genre_ids','artist_name', 'composer', 'lyricist', 'name']
X_train, X_test, y_train, y_test = splitdata(df_train[columns])
# Returns test-set predictions, train-set predictions, the fitted model and
# a feature-importance table, in that order.
y_lgbm, y_lgbm_train, gbm, df_importance  = lightgbm(X_train, X_test, y_train, y_test, columns, categorical)

In [None]:
# Report ROC AUC on the training split and on the held-out split.
print('Light GBM Regression Train AUC Score: {}'.format(roc_auc_score(y_train, y_lgbm_train)))
print('Light GBM Regression AUC Score: {}'.format(roc_auc_score(y_test, y_lgbm)))

In [None]:
# Export the importance table returned by lightgbm().
# NOTE(review): absolute, machine-specific path; the ROC export further down
# uses a relative path -- consider doing the same here.
df_importance.to_csv('/home/ubuntu/Project_3/Feature.csv')

In [None]:
#Confusion Matrix
# Turn the predicted scores into class labels with a 0.58 cut-off, then draw
# the confusion matrix as an annotated heatmap.
preds = y_lgbm > 0.58
plt.figure(figsize=(10,6))
cm = confusion_matrix(y_test,preds)
# fmt='g' prints the cell counts without scientific notation.
sns.heatmap(cm,cmap=plt.cm.Blues,annot=True,square=True,
                xticklabels=[0,1],
                yticklabels=[0,1],
                fmt='g')
plt.title('Light GBM Confusion Matrix', size=15)

In [None]:
#ROC Curve
plt.figure(figsize=(10,6))
# Score the held-out set once and reuse the result for both the curve and
# the AUC instead of running the model twice on the same data.
y_score_lgbm = gbm.predict(X_test, num_iteration=gbm.best_iteration)
fpr, tpr, _ = roc_curve(y_test, y_score_lgbm)
score_auc = roc_auc_score(y_test, y_score_lgbm)
plt.plot(fpr, tpr, label = 'Light GBM ROC Curve')
# Without a legend the label set on the line above is never displayed.
plt.legend()
print(score_auc)

In [None]:
# Start a table of ROC points from the false-positive rates returned by
# roc_curve.
df_roc_auc = pd.DataFrame(data = fpr, columns = ['False Positive Rate'])

In [None]:
# Add the matching true-positive rates as a second column.
df_roc_auc['True Positive Rate'] = tpr

In [None]:
# Write the ROC points to a CSV file in the current working directory.
df_roc_auc.to_csv('ROC_AUC_Scores.csv')

In [None]:
#Recall, Precision, Accuracy, F1
# accuracy, precision, recall and F1 are helpers defined elsewhere in this
# file.
# NOTE(review): `actuals` is assigned but not used in this cell.
actuals = y_test
# Same 0.58 cut-off as the confusion-matrix cell.
preds = y_lgbm > 0.58
accuracy_lgbm = accuracy(y_test, preds)
precision_lgbm = precision(y_test, preds)
recall_lgbm = recall(y_test, preds)
f1 = F1(y_test, preds)

print('Recall Light GBM score: '+ str(recall_lgbm))
print('Precision Light GBM score: '+ str(precision_lgbm))
print('Accuracy Light GBM score: '+ str(accuracy_lgbm))
print('F1 Light GBM score: '+ str(f1))