In [11]:
from __future__ import division
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

def get_click_data(impressions_count, n_features=10, feature_prob=0.2,
                   intercept=-3, seed=None):
    """Simulate click data from a logistic model with boolean features.

    Each impression gets a vector of 0/1 features; column 0 is a constant
    feature (always 1) whose coefficient acts as the model intercept. A click
    is then drawn as a Bernoulli trial with probability
    ``sigmoid(features . coefficients)``.

    Parameters
    ----------
    impressions_count : int
        Number of impressions (rows) to simulate.
    n_features : int, optional
        Number of feature columns, including the constant column 0.
        Must be at least 1. Defaults to 10.
    feature_prob : float, optional
        Probability that any non-constant feature equals 1. Defaults to 0.2.
    intercept : float, optional
        Coefficient of the constant feature. The default of -3 keeps the
        baseline click probability low (sigmoid(-3) is roughly 0.047).
    seed : int or None, optional
        If given, all draws come from a private ``np.random.RandomState(seed)``
        and the global NumPy random state is left untouched. If None
        (default), the global ``np.random`` state is used, as before.

    Returns
    -------
    clicks : ndarray of shape (impressions_count,)
        0/1 click indicator per impression.
    features : ndarray of shape (impressions_count, n_features)
        0/1 feature matrix; ``features[:, 0]`` is all ones.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError(
            "n_features must be at least 1 (column 0 is the constant feature)")

    # Fall back to the module-level generator when no seed is supplied so that
    # callers who rely on np.random.seed(...) keep getting the same stream.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Small random coefficients for every feature ...
    coefficients = 0.1 * rng.randn(n_features)
    # ... except the constant feature, whose coefficient is the intercept.
    coefficients[0] = intercept

    # Draw features as (n_features, impressions_count) so that row 0 can be
    # overwritten with the constant feature, then transpose to one row per
    # impression.
    features = rng.binomial(1, feature_prob,
                            size=[n_features, impressions_count])
    features[0] = 1
    features = features.T

    # Logistic link: linear score -> click probability -> Bernoulli draw.
    score = features.dot(coefficients)
    p = 1 / (1 + np.exp(-score))
    clicks = rng.binomial(1, p)

    return clicks, features

# Problem description
In this notebook we estimate the click-through rate (CTR) of a new article from its contents. The contents are described using boolean features. The function `get_click_data()` generates sample data for you. The goals are to:
* Split the data into train and test sets
* Measure the average click-through rate
* Build a model and evaluate it on the test data

### Required solution components
* Justification for the chosen model
* Justification for the choice of loss function
* Empirical evaluation metrics

In [12]:
# Fix the global NumPy seed so the simulated data set (and every number
# derived from it in later cells) is reproducible under
# Restart Kernel -> Run All.
SEED = 42
N_IMPRESSIONS = 10000  # number of simulated impressions (rows)

np.random.seed(SEED)
clicks, features = get_click_data(N_IMPRESSIONS)

In [13]:
# Put the feature matrix and the click labels into one table: feature columns
# keep their integer positions as labels and the labels go in a 'clicks'
# column. The trailing expression renders per-column summary statistics.
data = pd.DataFrame(features).assign(clicks=clicks)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,clicks
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.0,0.2011,0.2011,0.2025,0.1977,0.2035,0.2023,0.1984,0.1913,0.204,0.0515
std,0.0,0.400843,0.400843,0.401883,0.398285,0.402621,0.401735,0.398815,0.393344,0.402989,0.221026
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
