In [1]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.sparse as sparse
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import log_loss
import time
import scipy

In [2]:
df_val = pd.read_csv('all_dumm_val.csv.gz')

In [3]:
df_test = pd.read_csv('all_dumm_test.csv.gz')

In [4]:
df = pd.read_csv('all_dumm_tt.csv.gz',  usecols=[0])

In [5]:
df = pd.concat([df, df_val[['click']]]).reset_index(drop=True)

In [6]:
df = pd.concat([df, df_test[['click']]]).reset_index(drop=True)

In [7]:
# 83.1% values are 0 and 16.9% values are 1 
print(df.click.value_counts(normalize = True))

0    0.830194
1    0.169806
Name: click, dtype: float64


In [8]:
import gc
gc.collect()

0

In [9]:
y_val = df_val['click']
x_val = df_val.drop(columns=df.columns[0], axis=1)

In [10]:
# Define class for logistic regression with stochastic gradient descent

class LogisticRegression:
    
    def __init__(LR, start, eta=0.1,panelty=None, lamb=1):
        LR.eta = eta
        LR.beta = start
        LR.penalty=panelty
        LR.lamb=lamb
    
    def fit(LR, X, y, eta_i):
        n,k=X.shape
        z = np.dot(X, LR.beta)
        p = 1 / (1 + np.exp(-z))
        gradient = np.dot(X.T, (p - y)) / y.size
        
        if LR.panelty=='l1':
            gradient[1:k,] += LR.lamb*np.sign(LR.beta)[1:k,]/y.size
        elif LR.panelty=='l2':
            gradient[1:k,] += LR.lamb*LR.beta[1:k,]/y.size
            
        LR.beta -= eta_i * gradient
        loss = (-y * np.log(p) - (1 - y) * np.log(1 - p)).mean()
    
    def predict_loss(LR, X, y):   
        z = np.dot(X, LR.beta)
        p = 1 / (1 + np.exp(-z))
        return (-y * np.log(p) - (1 - y) * np.log(1 - p)).mean()

    
def log_loss_NB(naive_bayes_model, X, y):
    p = naive_bayes_model.predict_proba(X)[:,1]
    return (-y * np.log(p) - (1 - y) * np.log(1 - p)).mean()

__How you split the dataset into an estimation and evaluation sample.__ 

80/20

Train and test sample, 80/20 using train_test_split

In [12]:
###### LOGISTIC REGRESSION

chunksize = 10000
df_chunk = pd.read_csv('all_dumm_tt.csv.gz', chunksize=chunksize, delimiter=',')

t=1
i=1
avg_loss_logistic = []
avg_loss_logistic_eval = []
elapsed_time_logistic = []
avg_loss_previous = 100
avg_loss_previous_eval = 100
count_increasing_loss = 0
early_stopping = True    # Breaks the loop if the test performance of the model does not improve (to prevent overfitting)

print('Estimating logistic regression model')

for chunk in df_chunk:

    start_time_chunk = time.time()
    eta_i = 1/np.sqrt(t)
    
    # Select y variable from chunk, saved as series
    y_chunk = chunk['click']
    
    # Remove irrelevant columns
    chunk = chunk.drop(columns=df.columns[0], axis=1)
    
    # Divide the data in a training and testing set
    #chunk_train, chunk_test, y_train, y_test = train_test_split(chunk, y_chunk, test_size=0, random_state=44)
    chunk_train = chunk
    y_train = y_chunk

    if i == 1: 
        logistic_regression = LogisticRegression(eta=eta_i, start = np.zeros(chunk.shape[1])) # Initialize regression object
    
    # Update the logistic regression betas using the training data in this chunk
    logistic_regression.fit(chunk_train,y_train, eta_i)
    
    # Evaluate the model performance using the testing data in this chunk
    avg_loss_logistic.append(logistic_regression.predict_loss(x_val, y_val))
    avg_loss_logistic_eval.append(logistic_regression.predict_loss(chunk_train, y_train))
    
    # Show performance after every 10 iterations
    if i % 10 == 0:
        print('Chunk nr {}, avg test loss for the last 10 chunks = log-loss test {}, log-loss training {}'
              .format(i,round(np.array(avg_loss_logistic).mean(),4),round(np.array(avg_loss_logistic_eval).mean(),4)))
        
        # Break the loop if model did not improve with the last 10 chunks two times in a row
        if (early_stopping == True and np.array(avg_loss_logistic).mean() > avg_loss_previous):
            count_increasing_loss = count_increasing_loss + 1
            if count_increasing_loss > 1:
                print('Training stopped because log loss is no longer decreasing')
                break
        else:
            count_increasing_loss = 0
            
        avg_loss_previous = np.array(avg_loss_logistic).mean()
        avg_loss_logistic = []
    
    elapsed_time_logistic.append(time.time()-start_time_chunk)
    
    i = i+1
    t = t+1
        
print('\nAlgorithm needed {} chunks to converge, and took {} seconds.\nThis is an average of {} seconds per chunk'.
     format(i, round(np.array(elapsed_time_logistic).sum(),4), round(np.array(elapsed_time_logistic).mean(),4)))

Estimating logistic regression model
Chunk nr 10, avg test loss for the last 10 chunks = log-loss test 0.4601, log-loss training 0.4628
Chunk nr 20, avg test loss for the last 10 chunks = log-loss test 0.4498, log-loss training 0.456
Chunk nr 30, avg test loss for the last 10 chunks = log-loss test 0.445, log-loss training 0.442
Chunk nr 40, avg test loss for the last 10 chunks = log-loss test 0.4428, log-loss training 0.4358
Chunk nr 50, avg test loss for the last 10 chunks = log-loss test 0.4418, log-loss training 0.4366
Chunk nr 60, avg test loss for the last 10 chunks = log-loss test 0.4413, log-loss training 0.4333
Chunk nr 70, avg test loss for the last 10 chunks = log-loss test 0.4411, log-loss training 0.4302
Chunk nr 80, avg test loss for the last 10 chunks = log-loss test 0.4409, log-loss training 0.427
Chunk nr 90, avg test loss for the last 10 chunks = log-loss test 0.4408, log-loss training 0.4247
Chunk nr 100, avg test loss for the last 10 chunks = log-loss test 0.4406, l

In [14]:
###### NAIVE BAYES

df_chunk = pd.read_csv('all_dumm_tt.csv.gz', chunksize=chunksize, delimiter=',')


i=1
avg_loss_nb = []
avg_loss_nb_eval = []
elapsed_time_nb = []
avg_loss_previous = 1
avg_loss_previous_eval = 1
count_increasing_loss = 0
early_stopping = True    # Breaks the loop if the test performance of the model does not improve (to prevent overfitting)

print('Estimating Naive Bayes model')

for chunk in df_chunk:

    start_time_chunk = time.time()
    
        # Select y variable from chunk, saved as series
    y_chunk = chunk['click']
    
    # Remove irrelevant columns
    chunk = chunk.drop(columns=df.columns[0], axis=1)
    
    # Divide the data in a training and testing set
    #chunk_train, chunk_test, y_train, y_test = train_test_split(chunk, y_chunk, test_size=0, random_state=44)
    chunk_train = chunk
    y_train = y_chunk

    if i == 1:         
        naive_bayes = MultinomialNB() # Initialize naive bayes object

    # Update the naive bayes using the training data in this chunk
    naive_bayes.partial_fit(chunk_train,y_train,[0,1])
    
    # Evaluate the model performance using the testing data in this chunk
    avg_loss_nb.append(log_loss(y_val,naive_bayes.predict_proba(x_val)))
    avg_loss_nb_eval.append(log_loss(y_train,naive_bayes.predict_proba(chunk_train)))
           
    # Show performance after every 5 iterations
    if i % 5 == 0:
        print('Chunk nr {}, avg test loss for the 5 chunks = log-loss test {}, log-loss training {}'
              .format(i,round(np.array(avg_loss_nb).mean(),4), round(np.array(avg_loss_nb_eval).mean(),4)))
        
        # Break the loop if model did not improve with the last 10 chunks two times in a row
        if (early_stopping == True and np.array(avg_loss_nb).mean() > avg_loss_previous):
            count_increasing_loss = count_increasing_loss + 1
            if count_increasing_loss > 1:
                print('Training stopped because log loss is no longer decreasing')
                break
        else:
            count_increasing_loss = 0
            
        avg_loss_previous = np.array(avg_loss_nb).mean()
        avg_loss_nb = []
    
    elapsed_time_nb.append(time.time()-start_time_chunk)
    
    # Temporary: limit training time
    if i == 100:
        break
        
    i = i+1
        
print('\nAlgorithm needed {} chunks to converge, and took {} seconds.\nThis is an average of {} seconds per chunk'.
     format(i, round(np.array(elapsed_time_nb).sum(),4), round(np.array(elapsed_time_nb).mean(),4)))

Estimating Naive Bayes model
Chunk nr 5, avg test loss for the 5 chunks = log-loss test 0.6638, log-loss training 0.4547
Chunk nr 10, avg test loss for the 5 chunks = log-loss test 0.6581, log-loss training 0.4517
Chunk nr 15, avg test loss for the 5 chunks = log-loss test 0.6496, log-loss training 0.4525
Chunk nr 20, avg test loss for the 5 chunks = log-loss test 0.6524, log-loss training 0.453
Chunk nr 25, avg test loss for the 5 chunks = log-loss test 0.6641, log-loss training 0.4468
Training stopped because log loss is no longer decreasing

Algorithm needed 25 chunks to converge, and took 69.1073 seconds.
This is an average of 2.8795 seconds per chunk


__How the calculation times of the two models compare to each other.__

For Logistic Regression, it took 275s. For Naive Bayes, it took 69s.

__How many observations each model needs for its parameters to converge.__

1 chunk is 10,000.
Logistic regression needed 200 chunks to converge so 2,000,000 obs needed.<br>
Naive bayes needed 25 chunks to converge so 250,000 obs needed.

__What the prediction accuracy is of the two models based on Mean Log Loss for both the 
estimation  and  evaluation  sample.  Also  compare  with  a  model  that  simply  predicts  the 
average CTR of the estimation sample as click probability for each observation.__

In [15]:
# Log loss if we always predict the average CTR
print('The loss for the test set of the average prediction CTR is: ', log_loss(df_val['click'], np.array([np.mean(df_val['click'])]*len(df_val['click']))))

The loss for the test set of the average prediction CTR is:  0.4555148712958835


In [16]:
y_test = df_test['click']
x_test = df_test.drop(columns=df_test.columns[0], axis=1)

In [18]:
print('The loss for the test set of Logistic Regression is: ',logistic_regression.predict_loss(x_test, y_test))

The loss for the test set of Logistic Regression is:  0.44027694985590893


In [19]:
print('The loss for the test set of Naive Bayes is: ',log_loss(y_test,naive_bayes.predict_proba(x_test)))

The loss for the test set of Naive Bayes is:  0.6733661971845487
