In [9]:
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Fetch the NLTK stop-word corpus used below; NLTK reports the package as
# already up-to-date when it has been downloaded before.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Coco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Table corpus, feature matrix and relevance judgements.
tables = pd.read_json(r'data/tables/re_tables-0875.json')
features = pd.read_csv(r'data/features/features.txt')
qrels = pd.read_csv(r'data/queries/qrels.txt', sep='\t', header=None)

# Column 0 of queries.txt holds one query line per row. Keep only the text
# after the first space; a line without a space is kept unchanged.
# NOTE(review): presumably the dropped prefix is the query id -- confirm.
queries = pd.DataFrame([
    line[line.find(' ') + 1:]
    for line in pd.read_csv(r'data/queries/queries.txt', header=None)[0]
])

In [4]:
# list of stop words from the NLTK English corpus; get_idf hands this list to
# CountVectorizer through its stop_words argument
sw = nltk.corpus.stopwords.words('english')

In [30]:
# Show the first row of the feature table to see which columns it provides
# and in what order.
print(features.iloc[0])

query_id                                  1
query            world interest rates Table
table_id                     table-0875-680
row                                       8
col                                       2
nul                                       0
in_link                                  31
out_link                                 21
pgcount                               51438
tImp                                      1
tPF                             0.000259799
leftColhits                               0
SecColhits                                0
bodyhits                                  0
PMI                                       0
qInPgTitle                         0.333333
qInTableTitle                      0.222222
yRank                                   100
csr_score                       7.46742e-10
idf1                                29.6279
idf2                                24.1356
idf3                                27.1006
idf4                            

### Query features

In [6]:
# QLEN
def get_qlen(query):
    """Return the number of terms in ``query``.

    Terms are separated by runs of whitespace, so repeated, leading or
    trailing spaces do not produce empty "terms".

    Parameters
    ----------
    query : str
        The query text.

    Returns
    -------
    int
        Number of whitespace-separated terms; 0 for an empty or blank query.
    """
    # split() without an argument collapses whitespace runs, unlike split(' ')
    # which yields an empty string for every extra space.
    return len(query.split())

# IDF
def get_idf(query, field):
    """Sum the IDF scores of the query terms, with IDF estimated on ``field``.

    Parameters
    ----------
    query : str
        The query text.
    field : iterable of str
        The documents the IDF statistics are fitted on (handed to
        ``CountVectorizer.fit_transform``).

    Returns
    -------
    float
        Sum of the smoothed IDF values of every query term that occurs in the
        fitted vocabulary. Query terms that are stop words or do not occur in
        ``field`` contribute nothing; the result is 0 when no term matches.
    """
    # instantiate count vectorizer; the corpus belongs in fit_transform, not in
    # the constructor (whose first parameter is `input`)
    cv = CountVectorizer(stop_words=sw)
    # this step generates word counts for the words in the docs
    word_count_vector = cv.fit_transform(field)
    # instantiate tfidf transformer (with use_idf true in order to compute idf scores)
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    # compute the idf scores
    tfidf_transformer.fit(word_count_vector)
    # tokenize the query exactly like the documents (lower-casing, token
    # pattern, stop-word removal) so its terms can match the vocabulary
    analyze = cv.build_analyzer()
    # term -> column index; a dict lookup replaces rebuilding and scanning the
    # feature-name list for every query term
    vocabulary = cv.vocabulary_
    idf = tfidf_transformer.idf_
    # compute the sum of idf scores for all query terms known to the vocabulary
    score = sum(idf[vocabulary[term]] for term in analyze(query) if term in vocabulary)
    # return idf score
    return score

### Lexical features

In [32]:
def get_lexical_features(entries):
    """Pick the lexical feature values out of every entry.

    Each entry is read by integer position; the selected values are returned
    in the fixed order given by ``positions`` below.

    Parameters
    ----------
    entries : iterable
        Entries that support integer indexing.
        NOTE(review): the positions presumably follow the column order of the
        features table -- confirm against the loaded data.

    Returns
    -------
    list of list
        One list of 23 values per entry, in the order the entries were given.
    """
    # positions inside an entry, listed in output order
    positions = (
        41,                      # QLEN
        19, 20, 21, 22, 23, 24,  # IDF scores
        3,                       # number of rows
        4,                       # number of columns
        5,                       # number of empty cells
        14,                      # PMI
        6,                       # number of in-links
        7,                       # number of out-links
        8,                       # number of page views
        9,                       # table importance
        10,                      # table page fraction
        11,                      # hits left column
        12,                      # hits second to left column
        13,                      # hits body
        15,                      # ratio of query tokens found in page title
        16,                      # ratio of query tokens found in table title
        17,                      # y-rank
        28,                      # mlm similarity
    )
    return [[entry[pos] for pos in positions] for entry in entries]