In [149]:
import pdb
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
import statsmodels.api as sm

# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# Directory holding the pickled click logs.
data_path = 'data/pickles/pop_eta1_third/'
# File-name templates; the %d placeholder is filled with the run index.
train_file, test_file = 'train_clicks_see%d_1.pkl', 'test_clicks_see%d_1.pkl'

In [88]:
# Number of train/test pickle pairs to process; the driver loop formats the
# indices 0 .. num_files-1 into the train_file / test_file templates.
num_files = 30

In [89]:
def probit(Y, X):
    """Regress ``Y`` on ``X`` and return the fitted slope coefficients.

    NOTE(review): despite the name, no probit link is used here -- the fit is
    an ordinary linear regression (scikit-learn ``LinearRegression`` with its
    default settings). Only ``coef_`` is returned; the intercept is discarded.

    Parameters
    ----------
    Y : array-like
        Regression target.
    X : array-like
        Feature matrix, one row per observation.

    Returns
    -------
    numpy.ndarray
        The ``coef_`` attribute of the fitted model.
    """
    return LinearRegression().fit(X, Y).coef_

def inverse_mills(val):
    """Return ``-pdf(val) / cdf(val)`` for the standard normal distribution.

    The ratio is evaluated in log space (``logpdf - logcdf``) so that strongly
    negative inputs, where both ``pdf`` and ``cdf`` underflow to zero, yield a
    finite value instead of ``0 / 0 = nan``.

    Parameters
    ----------
    val : float or array-like
        Point(s) at which to evaluate the ratio.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``-norm.pdf(val) / norm.cdf(val)``, element-wise.
    """
    # Alternative form kept from the original notebook for reference:
    #     norm.pdf(val) / (1 - norm.cdf(val))
    return -np.exp(norm.logpdf(val) - norm.logcdf(val))

def OLS(Y, X):
    """Fit an ordinary least-squares regression of ``Y`` on ``X``.

    Uses scikit-learn's ``LinearRegression`` with default settings and returns
    only the slope coefficients (the intercept is not returned).

    Parameters
    ----------
    Y : array-like
        Regression target.
    X : array-like
        Design matrix, one row per observation. Any input accepted by
        ``LinearRegression.fit`` works; a ``.shape`` attribute is not required.

    Returns
    -------
    numpy.ndarray
        The ``coef_`` attribute of the fitted model, one entry per column of ``X``.
    """
    model = LinearRegression()
    model.fit(X, Y)
    return model.coef_

In [90]:
def dump_psvm_file(data, props, filename, is_train=True):
    """Write ``data`` as a qid-annotated sparse text file, one row per line.

    Each output line has the form::

        <relv> qid:<qid> [cost:<prop>] <idx>:<val> <idx>:<val> ...

    where the ``cost:`` field is emitted only for rows whose relevance label
    is 1. The file is written to ``psvm_data_path + filename``;
    ``psvm_data_path`` is a module-level name that must already be defined
    (its definition is not part of this function).

    Parameters
    ----------
    data : pandas.DataFrame
        Column 0 is read as the query id and column 1 as the relevance label
        (both cast to int). Feature columns start at position 3 when
        ``is_train`` is true and at position 2 otherwise.
    props : sequence
        Per-row values written after ``cost:``, indexed by row position.
        Only read when ``is_train`` is true; otherwise 1.0 is written.
        (Presumably propensity weights -- confirm with the caller.)
    filename : str
        File name appended to ``psvm_data_path``.
    is_train : bool, optional
        Selects the feature offset and whether ``props`` is consulted.
    """
    feat_off = 3 if is_train else 2
    rows = data.values
    with open(psvm_data_path + filename, 'w') as f:
        for i in range(rows.shape[0]):
            row = rows[i, :]
            qid = str(int(row[0]))
            relv = str(int(row[1]))

            # Sparse encoding with 1-based feature indices; only values above
            # 1e-5 are written.
            # NOTE(review): this threshold also drops negative feature values --
            # confirm that all features are non-negative.
            feat = ' '.join(
                '%d:%0.6f' % (idx + 1, val)
                for idx, val in enumerate(row[feat_off:])
                if val > 1e-5
            )

            prop = props[i] if is_train else 1.0
            # Always separate the qid token from what follows. Previously rows
            # with relv != 1 had the first feature glued onto the qid
            # (e.g. "0 qid:51:0.500000").
            line = relv + ' qid:' + qid + ' '
            if relv == '1':
                line += 'cost:' + str(prop) + ' '
            f.write(line + feat + '\n')
            

In [92]:
def heckman(train_file, test_file):
    """Fit a two-stage, Heckman-style model on one training file and score
    the matching test file.

    Stage 1 regresses the selection indicator ``S`` on every training column
    except ``qid``, ``C`` and ``S`` via ``probit`` (which in this file is a
    linear regression). The resulting coefficients are min-max rescaled to
    [0, 1] and fed through ``inverse_mills`` to build a correction term.

    Stage 2 regresses ``C`` on the columns whose name starts with ``'X'`` plus
    that correction term, then discards the correction term's coefficient.

    NOTE(review): stage 2 is fitted on all training rows, not only those with
    ``S == 1`` -- confirm this is intended.

    Parameters
    ----------
    train_file : str
        Pickle file name, relative to the module-level ``data_path``. Must
        contain the columns ``qid``, ``C``, ``S`` and the ``X*`` features.
    test_file : str
        Pickle file name, relative to ``data_path``, with the same columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``qid`` and ``C`` from the test file plus ``Relv``, the
        predicted score for each test row.
    """
    # pd.read_pickle can execute arbitrary code; only load trusted files.
    train = pd.read_pickle(data_path + train_file)
    train['S'] = train['S'].astype(int)

    test = pd.read_pickle(data_path + test_file)
    test = test[test.columns.drop('S')]

    # --- Stage 1: selection model ---------------------------------------
    selection_X = train[train.columns.drop(['qid', 'C', 'S'])]
    gamma = probit(train['S'], selection_X)
    # Min-max rescale the coefficients to [0, 1].
    delta = gamma.max() - gamma.min()
    gamma = (gamma - gamma.min()) / delta

    # Work on plain ndarrays so the result is always an ndarray, regardless
    # of how the installed pandas dispatches np.matmul on DataFrames.
    lambda_ = np.asarray(inverse_mills(np.matmul(selection_X.values, gamma)))

    # --- Stage 2: outcome model -----------------------------------------
    xcols = [col for col in train.columns if col[0] == 'X']
    design = np.append(train[xcols].values, lambda_.reshape(-1, 1), 1)
    # Drop the last coefficient: it belongs to the correction term, which is
    # not available at prediction time.
    params = OLS(train['C'], design)[:-1]

    # Explicit copy: adding a column to a bare slice of `test` is a chained
    # assignment whose effect pandas does not guarantee.
    scored = test[['qid', 'C']].copy()
    scored['Relv'] = np.matmul(test[xcols].values, params)
    return scored

def eval(eval_data):
    """Average rank of clicked rows, macro-averaged over queries.

    Within each ``qid`` the rows are ranked by descending ``Relv`` (ties get
    their average rank, which is then truncated to an integer). For every
    query that has at least one clicked row (``C == 1``) the mean rank of its
    clicked rows is taken, and those per-query means are averaged.

    Queries without any clicked row are skipped; previously a single such
    query turned the whole result into NaN.

    NOTE: this function shadows the built-in ``eval``; the name is kept
    because callers use it.

    Parameters
    ----------
    eval_data : pandas.DataFrame
        Must contain the columns ``qid``, ``C`` and ``Relv``. Not modified.

    Returns
    -------
    float
        The macro-averaged clicked rank, or NaN when no row is clicked
        (including empty input).
    """
    clicked = eval_data['C'] == 1
    if not clicked.any():
        return float('nan')

    # Per-query rank of every row, best score first.
    ranks = (
        eval_data.groupby('qid')['Relv']
        .rank(ascending=False)
        .astype(int)
    )

    # Attach ranks positionally (.values) so no index alignment is involved.
    clicked_ranks = eval_data.loc[clicked, ['qid']].assign(
        ranks=ranks[clicked].values
    )
    per_query = clicked_ranks.groupby('qid')['ranks'].mean()
    return per_query.mean()

In [150]:
# Fit and evaluate every train/test pair, collecting one score per run.
scores = []
for i in range(num_files):
    fname_train, fname_test = train_file % i, test_file % i
    eval_data = heckman(fname_train, fname_test)
    scores.append(eval(eval_data))
    print('done %d' % i)

# 'seen' is the 1-based run number. Column order matters because the CSV
# below is written without a header row.
results = pd.DataFrame(
    {'seen': list(range(1, num_files + 1)), 'avg_rank': scores},
    columns=['seen', 'avg_rank'],
)
results

results.to_csv('results/15_pass_heckman_pop_eta1_third.csv', index=None, header=None)

done 0
done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9
done 10
done 11
done 12
done 13
done 14
done 15
done 16
done 17
done 18
done 19
done 20
done 21
done 22
done 23
done 24
done 25
done 26
done 27
done 28
done 29
