## Lexical Features (table-retrieval LTR baseline)

In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
# load the tables JSON, the feature table and the tab-separated qrels file
tables = pd.read_json(r'data/tables/re_tables-0875.json')
features = pd.read_csv(r'data/features/features.txt')
qrels = pd.read_csv(r'data/queries/qrels.txt', header=None, sep='\t')

# keep only the text after the first space of every query line
# (a line without a space is kept whole, because find() returns -1)
query_lines = pd.read_csv(r'data/queries/queries.txt', header=None)[0]
queries = pd.DataFrame([line[line.find(' ') + 1:] for line in query_lines])

In [3]:
# list of stop words from nltk english corpus
# (referenced by get_idf further down as the CountVectorizer stop-word list)
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download) — confirm for fresh environments
sw = nltk.corpus.stopwords.words('english')

In [4]:
# preview the first rows of the freshly loaded feature table
features.head()

Unnamed: 0,query_id,query,table_id,row,col,nul,in_link,out_link,pgcount,tImp,...,cmax,csum,cavg,csim,remax,resum,reavg,resim,query_l,rel
0,1,world interest rates Table,table-0875-680,8,2,0,31,21,51438,1.0,...,0.666667,5.291894,0.048108,0.354686,0.241209,3.716354,0.033785,0.28113,4,0
1,1,world interest rates Table,table-1020-619,4,3,0,18,0,324,1.0,...,1.0,11.116121,0.101056,0.718895,1.0,8.075247,0.073411,0.71025,4,0
2,1,world interest rates Table,table-0288-531,3,5,0,23,22,26419,0.5,...,0.0,0.0,0.0,0.0,0.067373,0.365818,0.003326,0.03368,4,0
3,1,world interest rates Table,table-0288-530,4,5,1,23,22,26419,0.5,...,0.0,0.0,0.0,0.0,0.067373,0.365818,0.003326,0.03368,4,0
4,1,world interest rates Table,table-1000-57,2,2,0,38,1,2268,1.0,...,1.0,10.147388,0.092249,0.372667,0.226134,4.564622,0.041497,0.279899,4,0


In [5]:
# print the first row of the feature table as a Series (one line per column)
print(features.iloc[0])

query_id                                  1
query            world interest rates Table
table_id                     table-0875-680
row                                       8
col                                       2
nul                                       0
in_link                                  31
out_link                                 21
pgcount                               51438
tImp                                      1
tPF                             0.000259799
leftColhits                               0
SecColhits                                0
bodyhits                                  0
PMI                                       0
qInPgTitle                         0.333333
qInTableTitle                      0.222222
yRank                                   100
csr_score                       7.46742e-10
idf1                                29.6279
idf2                                24.1356
idf3                                27.1006
idf4                            

### Preprocessing

In [6]:
# save table ids for later use
# (taken before the 'table_id' column is replaced by one-hot columns below)
table_ids = features['table_id']

In [9]:
# replace the categorical 'table_id' column by one indicator column per
# distinct value (pandas get_dummies); all other columns are kept as they are
categorical_columns = ['table_id']
features = pd.get_dummies(features, columns=categorical_columns)

In [10]:
# columns removed before training: the raw query text and the
# max/sum/avg/sim column families (plain, e-, c- and re- prefixed)
dropped_columns = [
    'query',
    'max', 'sum', 'avg', 'sim',
    'emax', 'esum', 'eavg', 'esim',
    'cmax', 'csum', 'cavg', 'csim',
    'remax', 'resum', 'reavg', 'resim',
]
features.drop(columns=dropped_columns, inplace=True)

In [11]:
# pull the target column out of the frame: pop() returns 'rel' and removes it
# from `features` in the same step
labels = np.array(features.pop('rel'))

# remember the remaining column names (in order) for later use
feature_columns = features.columns.tolist()

# from here on `features` is a plain numpy matrix, no longer a DataFrame
features = features.to_numpy()

### Training and Testing Sets

In [12]:
from sklearn.model_selection import train_test_split

# hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(features, labels, test_size=0.25, random_state=42)

In [14]:
# report the dimensions of every split, one line per array
for split_name, split_array in (
    ('Training Features', train_features),
    ('Training Labels', train_labels),
    ('Testing Features', test_features),
    ('Testing Labels', test_labels),
):
    print(f'{split_name} Shape: {split_array.shape}')

Training Features Shape: (2340, 2956)
Training Labels Shape: (2340,)
Testing Features Shape: (780, 2956)
Testing Labels Shape: (780,)


### Model Training (random forests)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random forest regressor with 1000 trees and a fixed seed
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
)

# fit on the training split only
rf.fit(train_features, train_labels)

#### Save model to file

In [None]:
from joblib import dump
# persist the fitted forest to 'random_forest.joblib' in the working directory
dump(rf, 'random_forest.joblib')

#### Load model from file

In [15]:
from joblib import load
# restore the forest from 'random_forest.joblib'; this rebinds `rf`
# NOTE(review): joblib files are pickle-based — only load artifacts produced
# by a trusted run of this notebook
rf = load('random_forest.joblib')

### Results

In [16]:
from sklearn import metrics

# score the held-out test split
predictions = rf.predict(test_features)
test_mse = metrics.mean_squared_error(test_labels, predictions)
test_mae = metrics.mean_absolute_error(test_labels, predictions)

print(f'mean square error  : {test_mse}')
print(f'mean absolute error: {test_mae}')

mean square error  : 0.18987227435897439
mean absolute error: 0.22587692307692311


In [17]:
# predict every row of the full feature matrix; this covers the rows the
# train/test split assigned to training as well as the held-out ones
predictions_all = rf.predict(features)
all_mse = metrics.mean_squared_error(labels, predictions_all)
all_mae = metrics.mean_absolute_error(labels, predictions_all)

print(f'mean square error  : {all_mse}')
print(f'mean absolute error: {all_mae}')

mean square error  : 0.06934630448717949
mean absolute error: 0.1244096153846154


In [49]:
# generate results in trec_eval format
# re-read the untouched feature file to recover the query and table ids
raw = pd.read_csv(r'data/features/features.txt')
row_count = len(raw)

# one list per output column, all of length row_count;
# 'q0', 'rank' and 'name' are constant, 'score' is the prediction for the same row
res = {
    'query-id': raw['query_id'].tolist(),
    'q0': ['Q0'] * row_count,
    'document-id': raw['table_id'].tolist(),
    'rank': [0] * row_count,
    'score': [predictions_all[position] for position in raw.index],
    'name': ['STANDARD'] * row_count,
}

In [55]:
# turn the column -> values mapping into a DataFrame (one column per key)
df_res = pd.DataFrame(res)

In [56]:
# preview the first rows of the result table
df_res.head()

Unnamed: 0,query-id,q0,document-id,rank,score,name
0,1,Q0,table-0875-680,0,0.02,STANDARD
1,1,Q0,table-1020-619,0,0.002,STANDARD
2,1,Q0,table-0288-531,0,0.004,STANDARD
3,1,Q0,table-0288-530,0,0.002,STANDARD
4,1,Q0,table-1000-57,0,0.002,STANDARD


In [57]:
# write the run to 'results.txt' as space-separated columns, without header
# row or index column (for running trec_eval comparison)
df_res.to_csv(
    'results.txt',
    sep=' ',
    header=False,
    index=False,
)

### Query features

In [None]:
# QLEN
def get_qlen(query):
    """Return the number of whitespace-separated terms in `query`.

    Splitting on any run of whitespace (instead of on single spaces) keeps
    empty strings out of the count: an empty query has length 0, and leading,
    trailing or repeated spaces do not inflate the result.

    Parameters
    ----------
    query : str
        Query text.

    Returns
    -------
    int
        Number of terms in the query.
    """
    return len(query.split())

# IDF
def get_idf(query, field):
    """Return the summed IDF weight of the query terms, with IDF fitted on `field`.

    Parameters
    ----------
    query : str
        Query text.
    field : iterable of str
        Documents (one string each) used to build the vocabulary and to
        estimate the IDF weights.

    Returns
    -------
    float
        Sum of the smoothed IDF weights of the query tokens. The query is
        tokenized with the same analyzer as the documents; tokens that are
        filtered out (stop words from `sw`) or that do not occur in `field`
        contribute nothing to the sum.
    """
    # imported here because the notebook's import cell does not provide these names
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    # the documents are passed to fit_transform(); the constructor's first
    # parameter is the `input` mode, not the corpus
    cv = CountVectorizer(stop_words=sw)
    # term counts per document
    word_count_vector = cv.fit_transform(field)
    # smoothed IDF weights, one per vocabulary entry
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    # analyze the query exactly like the documents so that terms line up with
    # the vocabulary keys
    analyze = cv.build_analyzer()
    # vocabulary_ maps term -> column index, which is also the index into idf_
    vocabulary = cv.vocabulary_
    idf = tfidf_transformer.idf_
    score = sum(idf[vocabulary[term]] for term in analyze(query) if term in vocabulary)
    return float(score)

### Lexical features

In [None]:
def get_lexical_features(entries):
    """Project every entry onto the fixed, ordered list of lexical features.

    Each entry is indexed by position; the positions below are read in this
    exact order and collected into one list per entry.

    Parameters
    ----------
    entries : iterable of indexable
        Rows supporting integer indexing up to position 41.

    Returns
    -------
    list of list
        One 23-element feature list per entry, in input order.
    """
    selected_positions = (
        41,                      # QLEN
        19, 20, 21, 22, 23, 24,  # IDF scores
        3,                       # number of rows
        4,                       # number of columns
        5,                       # number of empty cells
        14,                      # PMI
        6,                       # number of in-links
        7,                       # number of out-links
        8,                       # number of page views
        9,                       # table importance
        10,                      # table page fraction
        11,                      # hits left column
        12,                      # hits second to left column
        13,                      # hits body
        15,                      # ratio of query tokens found in page title
        16,                      # ratio of query tokens found in table title
        17,                      # y-rank
        28,                      # mlm similarity
    )
    return [[entry[position] for position in selected_positions] for entry in entries]

In [None]:
import bigjson.bigjson as bj

# open the tables dump in binary mode and read it through bigjson's FileReader
# NOTE(review): the meaning of the two positional flags passed to read() is
# defined by the bigjson package — confirm against its documentation
# NOTE(review): `i` is used in a later cell after `f` has been closed; verify
# that the returned object does not need the file to stay open
with open('data/tables.json', 'rb') as f:
    reader = bj.FileReader(f, 'utf-8')
    i = reader.read(True, False)

In [None]:
# show the top-level keys of the object read from 'data/tables.json'
print(i.keys())

In [None]:
# read the table-mentions dump through bigjson's FileReader (`bj` is imported
# in an earlier cell)
# NOTE(review): read() is called with (True, True) here but (True, False) for
# the tables dump — confirm the second flag's meaning in the bigjson package
with open('data/tableMentions.json', 'rb') as f:
    reader = bj.FileReader(f, 'utf-8')
    j = reader.read(True, True)

In [None]:
# print all key/value pairs of the object read from 'data/tableMentions.json'
# NOTE(review): this prints the whole mapping; output may be very large
print(j.items())