In [70]:
import ast
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
import statsmodels.api as sm

# Silence pandas' chained-assignment (SettingWithCopy) warnings for the whole notebook.
pd.options.mode.chained_assignment = None
# Prefix prepended to every file name that the loader functions below open.
data_path = 'data/'
# printf-style templates for the pickled train/test frames; the driver loop
# fills %d with the 1-based run number.
train_file = 'pickles/train_clicks_see%d_2.pkl'
test_file = 'pickles/test_clicks_see%d_2.pkl'
# Per-run gamma weight file (same %d substitution); used when is_svm is False.
gamma_filename = 'weight_eta2/%d.txt'
# Single gamma weight file shared by every run when is_svm is True.
svm_gamma_filename = 'weight_svm.dat'

In [71]:
# Number of train/test file pairs the driver loop processes (run numbers 1..num_files).
num_files = 30
# True: read gamma from svm_gamma_filename via load_gamma_svm;
# False: read the per-run gamma_filename via load_gamma.
is_svm = True

In [72]:
def probit(Y, X):
    """Regress ``Y`` on ``X`` and return the fitted coefficient vector.

    NOTE(review): despite its name, this fits scikit-learn's
    ``LinearRegression`` (ordinary least squares), not a probit model.

    Parameters
    ----------
    Y : array-like
        Regression targets.
    X : array-like
        Design matrix, one row per observation.

    Returns
    -------
    The ``coef_`` attribute of the fitted estimator.
    """
    model = LinearRegression().fit(X, Y)
    return model.coef_

def inverse_mills(val):
    """Return ``-pdf(val) / cdf(val)`` for the standard normal distribution.

    The ratio is evaluated in log space. For strongly negative ``val`` both
    ``norm.pdf`` and ``norm.cdf`` underflow to zero, so the direct quotient
    becomes 0/0 = NaN; the difference of the log-densities stays finite.

    Parameters
    ----------
    val : float or array-like
        Point(s) at which to evaluate the ratio.

    Returns
    -------
    float or numpy.ndarray
        The negated pdf/cdf ratio, same shape as ``val``.
    """
    # exp(log pdf - log cdf) == pdf / cdf, without the intermediate underflow.
    return -np.exp(norm.logpdf(val) - norm.logcdf(val))

def OLS(Y, X):
    """Fit ordinary least squares of ``Y`` on ``X`` and return the coefficients.

    Uses scikit-learn's ``LinearRegression`` with its default settings and
    returns only ``coef_``; the fitted intercept is not returned.

    Parameters
    ----------
    Y : array-like
        Regression targets.
    X : array-like
        Design matrix, one row per observation.

    Returns
    -------
    The ``coef_`` attribute of the fitted estimator.
    """
    return LinearRegression().fit(X, Y).coef_

In [73]:
def load_gamma(gamma_file):
    """Read a gamma weight vector stored as a Python-literal sequence.

    The file at ``data_path + gamma_file`` is read whole, surrounding
    whitespace is stripped, and the first and last characters are discarded
    before the remainder is parsed with ``ast.literal_eval``.
    (Presumably the file wraps the sequence in one extra pair of delimiters;
    confirm against the files that produce it.)

    Entries equal to the string ``'undefined'`` are replaced by ``0.0``.

    Parameters
    ----------
    gamma_file : str
        File name relative to ``data_path``.

    Returns
    -------
    numpy.ndarray
        The parsed weights.
    """
    with open(data_path + gamma_file, 'r') as handle:
        text = handle.read().strip()

    # Drop the outermost character on each side, then parse the literal.
    entries = ast.literal_eval(text[1:-1])
    cleaned = [0.0 if entry == 'undefined' else entry for entry in entries]
    return np.array(cleaned)

def load_gamma_svm(svm_gamma_file, n_features=700):
    """Read a sparse ``index:value`` weight file into a dense vector.

    The file at ``data_path + svm_gamma_file`` is expected to contain
    whitespace-separated ``index:value`` tokens. Indices are treated as
    1-based: token ``k:v`` sets position ``k - 1``. Positions that do not
    appear in the file stay at zero, so a file with no tokens yields an
    all-zero vector.

    Parameters
    ----------
    svm_gamma_file : str
        File name relative to ``data_path``.
    n_features : int, optional
        Length of the returned vector (default 700).

    Returns
    -------
    numpy.ndarray
        Dense weight vector of length ``n_features``.

    Raises
    ------
    ValueError
        If a token's index or value cannot be parsed as a number.
    IndexError
        If a token has no ``:`` separator or its index exceeds ``n_features``.
    """
    gamma = np.zeros(n_features)
    with open(data_path + svm_gamma_file, 'r') as f:
        # split() with no argument tolerates newlines and repeated blanks.
        tokens = f.read().split()

    for token in tokens:
        parts = token.split(':')
        # NOTE(review): an index of 0 would wrap around to the last slot.
        gamma[int(parts[0]) - 1] = float(parts[1])

    return gamma

In [74]:
def heckman(train_file, test_file, gamma_file, is_svm=False):
    """Fit the outcome regression with a selection-correction term and score test rows.

    Steps:

    1. Load the train and test frames from ``data_path``.
    2. Load a precomputed gamma vector (nothing is fitted for the selection
       stage here) and compute ``lambda_ = inverse_mills(Z @ gamma)``, where
       ``Z`` is every train column except ``qid``, ``C`` and ``S``.
    3. Regress ``C`` on the train columns whose name starts with ``'X'`` plus
       ``lambda_``, and discard the coefficient on ``lambda_``.
    4. Score each test row as the dot product of its feature columns (all
       test columns except ``qid``, ``C`` and ``S``) with the remaining
       coefficients.

    Parameters
    ----------
    train_file, test_file : str
        Pickled DataFrame file names relative to ``data_path``. Both frames
        must contain ``qid``, ``C`` and ``S`` columns.
    gamma_file : str
        Weight file name relative to ``data_path``.
    is_svm : bool, optional
        If True, read ``gamma_file`` with ``load_gamma_svm``; otherwise with
        ``load_gamma``.

    Returns
    -------
    pandas.DataFrame
        The test frame's ``qid`` and ``C`` columns plus a ``Relv`` column
        holding the predicted score.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    train = pd.read_pickle(data_path + train_file)
    test = pd.read_pickle(data_path + test_file).drop(columns='S')

    gamma = load_gamma_svm(gamma_file) if is_svm else load_gamma(gamma_file)

    # Selection index. Explicit float arrays avoid depending on how
    # np.matmul dispatches on DataFrame operands.
    selection_inputs = train.drop(columns=['qid', 'C', 'S']).to_numpy(dtype=float)
    lambda_ = np.asarray(inverse_mills(np.matmul(selection_inputs, gamma)))

    # Outcome regression: X-prefixed features with lambda_ as the last column.
    xcols = [col for col in train.columns if col[0] == 'X']
    design = np.hstack([train[xcols].to_numpy(dtype=float), lambda_.reshape(-1, 1)])

    # The final coefficient belongs to lambda_ and is not used for scoring.
    params = OLS(train['C'], design)[:-1]

    test_features = test.drop(columns=['qid', 'C']).to_numpy(dtype=float)
    relevance = np.matmul(test_features, params)

    # assign() returns a new frame, so nothing is written into a slice of `test`.
    return test[['qid', 'C']].assign(Relv=relevance)

def eval(eval_data):
    """Average, over queries, of the mean rank of the clicked rows.

    Rows are grouped by ``qid``. Within each group rows are ranked by
    ``Relv`` in descending order (rank 1 = highest score); pandas' default
    tie handling yields fractional ranks, which the integer cast truncates.
    The group's score is the mean rank of its rows with ``C == 1``.

    Queries with no clicked row have no defined score and are skipped, so
    one such query no longer turns the whole result into NaN.

    NOTE(review): the name shadows the built-in ``eval``; it is kept because
    other cells call it by this name.

    Parameters
    ----------
    eval_data : pandas.DataFrame
        Must contain ``qid``, ``C`` and ``Relv`` columns. It is not modified.

    Returns
    -------
    float
        Mean of the per-query scores, or NaN when no query has a clicked row
        (including empty input).
    """
    rank_total = 0.0
    n_queries = 0
    for _, group in eval_data.groupby('qid'):
        ranks = group['Relv'].rank(ascending=False).astype(int)
        clicked_ranks = ranks[group['C'] == 1]
        if clicked_ranks.empty:
            # No click in this query: nothing to average.
            continue
        rank_total += clicked_ranks.mean()
        n_queries += 1

    if n_queries == 0:
        return float('nan')
    return rank_total / n_queries

In [75]:
# Score every train/test pair and collect the per-run average rank.
run_numbers = list(range(1, num_files + 1))

scores = []
for run in run_numbers:
    # SVM weights live in one shared file; otherwise each run has its own.
    if is_svm:
        gamma_file = svm_gamma_filename
    else:
        gamma_file = gamma_filename % run
    eval_data = heckman(train_file % run, test_file % run, gamma_file, is_svm)
    scores.append(eval(eval_data))
    print('done %d' % run)

results = pd.DataFrame({'seen': run_numbers, 'avg_rank': scores})
results

done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9
done 10
done 11
done 12
done 13
done 14
done 15
done 16
done 17
done 18
done 19
done 20
done 21
done 22
done 23
done 24
done 25
done 26
done 27
done 28
done 29
done 30


Unnamed: 0,seen,avg_rank
0,1,2.59792
1,2,2.863636
2,3,3.107807
3,4,3.425349
4,5,3.11583
5,6,3.281843
6,7,3.329661
7,8,3.477506
8,9,3.435877
9,10,3.975628


In [76]:
# Write the results table as bare rows: no index column and no header line.
results.to_csv('results/svm_10pass_eta2_gamma.csv', index=False, header=False)