## Lexical Features (table-retrieval LTR baseline)

In [None]:
import nltk
import numpy as np
import pandas as pd

In [None]:
# load the raw inputs: one table shard, the feature file and the qrels file
tables = pd.read_json(r'data/tables/re_tables-0875.json')
features = pd.read_csv(r'data/features/features.txt')
qrels = pd.read_csv(r'data/queries/qrels.txt', sep='\t', header=None)

# every line of queries.txt starts with a token followed by a space (presumably the query id);
# keep only the text that follows this first space
raw_queries = pd.read_csv(r'data/queries/queries.txt', header=None)
queries = pd.DataFrame([text[text.find(' ') + 1:] for text in raw_queries[0]])

In [None]:
# list of stop words from nltk english corpus
# (passed to CountVectorizer as stop_words inside get_idf further down)
# NOTE(review): presumably requires the nltk 'stopwords' corpus to be available locally -- confirm
sw = nltk.corpus.stopwords.words('english')

In [None]:
# preview the first rows of the feature table
features.head()

In [None]:
# show every column of one example row (position 1, i.e. the second row)
print(features.iloc[1])

### Preprocessing

In [None]:
# save table ids for later use
# (the 'table_id' column itself is replaced by one-hot columns in the next preprocessing step)
table_ids = features['table_id']

In [None]:
# one-hot encode the data using pandas get_dummies
# only 'table_id' is expanded: it is replaced by one indicator column per distinct table id
features = pd.get_dummies(features, columns = ['table_id'])

In [None]:
# columns that are removed before training and therefore never reach the model
unused_columns = [
    'query',
    'max', 'sum', 'avg', 'sim',
    'emax', 'esum', 'eavg', 'esim',
    'cmax', 'csum', 'cavg', 'csim',
    'remax', 'resum', 'reavg', 'resim',
]
features.drop(columns = unused_columns, inplace = True)

In [None]:
# the 'rel' column is the prediction target: pop() hands it back and removes it from the frame in one step
labels = np.array(features.pop('rel'))

# remember the column order so the columns of the numpy matrix can be named later
feature_columns = features.columns.tolist()

# from here on `features` is a plain numpy array
features = np.array(features)

### Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split = train_test_split(features, labels, test_size = 0.20, random_state = 42)
train_features, test_features, train_labels, test_labels = split

In [None]:
# sanity check: within each split, features and labels must have the same number of rows
print(f'Training Features Shape: {train_features.shape}')
print(f'Training Labels Shape: {train_labels.shape}')
print(f'Testing Features Shape: {test_features.shape}')
print(f'Testing Labels Shape: {test_labels.shape}')

### Model Training (random forests)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# forest configuration: 1000 trees of depth 3, fitted with 10 parallel jobs and a fixed seed
forest_params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'n_jobs': 10,
    'random_state': 42,
}
rf = RandomForestRegressor(**forest_params)

# fit on the training split only
rf.fit(train_features, train_labels)

#### Save model to file

In [None]:
from joblib import dump

# file naming convention: random_forest_<n_estimators>_<max_depth>.joblib
#   random_forest_1000_none.joblib: n_estimators = 1000, max_depth = None
#   random_forest_1000_3.joblib: n_estimators = 1000, max_depth = 3
model_path = 'random_forest_1000_3.joblib'
dump(rf, model_path)

#### Load model from file

In [None]:
from joblib import load

# restore the forest written by the "Save model to file" cell; the file name has to match the one
# passed to dump() there (random_forest_<n_estimators>_<max_depth>.joblib)
rf = load('random_forest_1000_3.joblib')

#### Grid search

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object exposing ``predict(features)``
    test_features : inputs handed unchanged to ``model.predict``
    test_labels : array-like with the true target values

    Returns
    -------
    float
        ``100 - MAPE`` in percent. The percentage error is averaged over the
        samples with a non-zero label only; NaN when every label is zero.
    """
    test_labels = np.asarray(test_labels, dtype=float)
    predictions = np.asarray(model.predict(test_features), dtype=float)
    errors = np.abs(predictions - test_labels)

    # a relative error is undefined for a zero label; dividing by it would turn the
    # whole mean into inf/NaN, so such samples are left out of the percentage
    nonzero = test_labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / test_labels[nonzero])
    else:
        mape = float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    # the mean absolute error still covers every sample, zero labels included
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# create the parameter grid
param_grid = {
    'bootstrap': [True],
    'max_depth': [3, 5, None],
    'n_estimators': [100, 500, 1000, 1500, 2000]
}

# create a base model for the search
# NOTE: deliberately not bound to `rf`. GridSearchCV fits clones of its estimator, so this instance
# itself stays unfitted; rebinding `rf` to it would break the `rf.predict(...)` calls in the Results cells.
# random_state matches the seed of the forest trained above so that search results are reproducible.
base_rf = RandomForestRegressor(random_state = 42)

# instantiate the grid search model (5-fold cross-validation, all available cores)
grid_search = GridSearchCV(estimator = base_rf, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 2)

In [None]:
# fit the grid search to the data
# (training split only; the grid holds 1 x 3 x 5 = 15 parameter combinations, each cross-validated with cv = 5)
grid_search.fit(train_features, train_labels)

### Results

In [None]:
from sklearn import metrics

# error of the model currently bound to `rf` on the held-out test split
predictions = rf.predict(test_features)
print(f'mean square error  : {metrics.mean_squared_error(test_labels, predictions)}')
print(f'mean absolute error: {metrics.mean_absolute_error(test_labels, predictions)}')

In [None]:
# error over the complete feature matrix (training and test rows together);
# `predictions` is overwritten here and these are the scores written to the trec_eval run below
predictions = rf.predict(features)
print(f'mean square error  : {metrics.mean_squared_error(labels, predictions)}')
print(f'mean absolute error: {metrics.mean_absolute_error(labels, predictions)}')

In [None]:
# generate results in trec_eval format (columns: query-id, Q0, document-id, rank, score, run name)
raw = pd.read_csv(r'data/features/features.txt')

# one (query id, table id, predicted score) triple per row of the feature file;
# the row label is used to look up the matching entry of `predictions`
run_rows = [(rec['query_id'], rec['table_id'], predictions[pos]) for pos, rec in raw.iterrows()]

res = {
    'query-id': [query_id for query_id, _, _ in run_rows],
    'q0': ['Q0'] * len(run_rows),
    'document-id': [table_id for _, table_id, _ in run_rows],
    # the rank column is filled with a constant 0 for every row
    'rank': [0] * len(run_rows),
    'score': [score for _, _, score in run_rows],
    'name': ['STANDARD'] * len(run_rows),
}

In [None]:
# turn the column lists into a DataFrame (one row per scored query/table pair)
df_res = pd.DataFrame.from_dict(res)

In [None]:
# preview the first rows of the run before writing it to disk
df_res.head()

In [None]:
# save results to .txt file (for running trec_eval comparison)
# written space-separated, without header row and without the DataFrame index
df_res.to_csv('results_1000_3.txt', sep=' ', index=False, header=False)

### Query features

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# QLEN
def get_qlen(query):
    """Return the number of whitespace-separated terms in *query*.

    Runs of whitespace count as a single separator and leading/trailing
    whitespace is ignored, so an empty or blank query has length 0.
    """
    return len(query.split())

# IDF
def get_idf(query, field):
    """Return the summed IDF score of the terms of *query*, with IDF estimated on *field*.

    Parameters
    ----------
    query : str
        The query text.
    field : iterable of str
        The documents the IDF statistics are computed from.

    Returns
    -------
    float
        Sum of the smoothed IDF values of all query terms that are part of the
        vocabulary fitted on *field*. Stop words and terms that never occur in
        *field* contribute nothing; the result is 0 when no term matches.
    """
    # instantiate count vectorizer; the documents go to fit_transform, not to the constructor
    cv = CountVectorizer(stop_words=sw)
    # this step generates word counts for the words in the documents
    word_count_vector = cv.fit_transform(field)
    # instantiate tfidf transformer (with use_idf true in order to compute idf scores)
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    # compute the idf scores
    tfidf_transformer.fit(word_count_vector)

    # tokenise the query with the vectorizer's own analyzer so that lower-casing, the token
    # pattern and stop-word removal are applied exactly as they were to the documents
    analyze = cv.build_analyzer()
    # term -> column index mapping; a dict lookup replaces the per-term list scan
    vocabulary = cv.vocabulary_
    idf = tfidf_transformer.idf_

    # compute the sum of idf scores for all query terms known to the vocabulary
    return sum(idf[vocabulary[term]] for term in analyze(query) if term in vocabulary)

### Lexical features

In [None]:
import wikipediaapi
import pageviewapi.period
from wikitables import import_tables

wiki = wikipediaapi.Wikipedia('en')

# The accumulators are created once, outside the file loop, so that widening the range below
# collects the data of every processed file instead of keeping only the last one.
page_titles = set()
section_titles = set()
table_captions = set()
table_bodies = set()

# one list per feature column; position k of every list belongs to the k-th processed table
features = {
    'table_id': [],
    'rows': [],
    'cols': [],
    'nulls': [],
    'inlinks': [],
    'outlinks': [],
    'views': [],
    'table_imp': [],
    'table_fraction': []
}

for file_no in range(875, 876):
    # table files are named re_tables-NNNN.json with a zero-padded, four digit number
    table = f'data/tables/re_tables-{file_no:04d}.json'
    tmp = pd.read_json(table)
    # after transposing, every row describes one table and the row label is its table id
    tmpT = tmp.T

    for table_id, row in tmpT.iterrows():
        rows = row['numDataRows']
        cols = row['numCols']
        title = row['pgTitle']
        caption = row['caption']
        data = row['data']
        section_titles.update(item.lower() for item in row['title'])

        # walk over all cells: collect their lower-cased text, count the empty ones and
        # add up the number of characters stored in the table
        nulls = 0
        chars = 0
        for entry in data:
            for item in entry:
                table_bodies.add(item.lower())
                if not item:
                    nulls += 1
                chars += len(item)

        # page level statistics stay at 0 when the Wikipedia page cannot be found
        inlinks = outlinks = views = table_imp = text_len = 0
        page = wiki.page(title)
        if page.exists():
            inlinks = len(page.backlinks)
            outlinks = len(page.links)
            # page views summed over the last 365 days
            views = pageviewapi.period.sum_last('en.wikipedia', title, last=365, access='all-access', agent='all-agents')
            # NOTE(review): import_tables(title) presumably returns the tables found on the page -- confirm;
            # the +1 keeps the ratio defined when it returns nothing
            table_imp = 1 / (len(import_tables(title)) + 1)
            text_len = len(page.text)

        # share of the page text taken up by the table; +1 avoids a division by zero
        table_fraction = chars / (text_len + 1)

        page_titles.add(title.lower())
        table_captions.add(caption.lower())

        record = {
            'table_id': table_id,
            'rows': rows,
            'cols': cols,
            'nulls': nulls,
            'inlinks': inlinks,
            'outlinks': outlinks,
            'views': views,
            'table_imp': table_imp,
            'table_fraction': table_fraction
        }
        for column, value in record.items():
            features[column].append(value)

In [None]:
# dump the collected feature columns and the four text collections for inspection
print(features)
print(page_titles)
print(section_titles)
print(table_captions)
print(table_bodies)

In [None]:
import bigjson.bigjson as bj

# open tables.json in binary mode and parse it through bigjson's FileReader
with open('tables.json', 'rb') as f:
    reader = bj.FileReader(f, 'utf-8')
    # NOTE(review): the two positional flags are presumably read_all / to_eof -- confirm against FileReader.read
    i = reader.read(True, False)
    
# the parsed result is used after the file has been closed
print(i.items())

In [None]:
# same as for tables.json, but for tableMentions.json; `bj` comes from the import in the previous cell
with open('tableMentions.json', 'rb') as f:
    reader = bj.FileReader(f, 'utf-8')
    # NOTE(review): unlike the tables.json read, the second flag is True here -- presumably to_eof; confirm
    j = reader.read(True, True)
    
# the parsed result is used after the file has been closed
print(j.items())