In [3]:
cd ../..

/home/neon/Documents/cwi_assignament


# Pipeline

In [6]:
import pandas as pd
import numpy as np

from src.utils import format_table
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We first load the `corpus.parquet` and `queries.parquet` files, which contain the tables and desired queries to be answered

In [28]:
corpus = pd.read_parquet('./data/corpus.parquet')
queries = pd.read_parquet('./data/queries.parquet')

Since pandas dataframes that have been saved lose their data types, it is necessary to format each row to be a proper table.

In [29]:
corpus['table'] = corpus['table'].apply(lambda x: format_table(x)) 
corpus.iloc[2]['table']

Unnamed: 0,Year,Award,Nominee,Category,Result
0,2013,DJ Magazine Awards,Dyro,Top 100 DJs,30
1,2014,DJ Magazine Awards,Dyro,Top 100 DJs,27
2,2015,DJ Magazine Awards,Dyro,Top 100 DJs,27
3,2016,DJ Magazine Awards,Dyro,Top 100 DJs,93


Here is an example of a Query and Answer from our data:

In [32]:
idx = queries.sample()['database_id'].values[0]
print('Q:{}\nA:{}'.format(queries[queries['database_id'] == idx]['query'].values[0],
queries[queries['database_id'] == idx]['answer'].values[0]))

Q:In what role on what series did Jack Donnelly guest-star in 2016?
A:In 2016, Jack Donnelly guest starred as Samuel Walker in the series Death in Paradise.


## RAG PIPELINES


In [37]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def naive_pipeline(corpus, queries):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    corpus_sequences  = [x.to_html() for x in corpus['table']]
    queries_sequences = [x for x in queries['query']]
    context_sequences = [str(x) for x in corpus['context']]

    corpus_embeddings  = model.encode(corpus_sequences)
    queries_embeddings = model.encode(queries_sequences)
    context_embeddings = model.encode(context_sequences)

    embeddings = (corpus_embeddings+context_embeddings)/2

    cosim = cosine_similarity(queries_embeddings, embeddings)

    return cosim


    # for i, query_text in enumerate(queries_sequences):
    #     sorted_indices = np.argsort(cosim[i])
    #     sorted_tables  = corpus['table'].to_numpy()[sorted_indices]
    # sorted_context = corpus['context'].to_numpy()[sorted_indices]
    # answer_text    = queries['answer'][i]

### Naive Solution

In [38]:
coss = naive_pipeline(corpus, queries)

In [52]:
sorted_indices = np.argsort(coss, axis=1)
naive_top_five = corpus['table'].to_numpy()[sorted_indices[:, :5]]

### Lanchain

In [55]:
print()
naive_top_five[0, 0]

Unnamed: 0,State,State reptile,Scientific name,Year adopted,Conservation status,Photograph,References
0,Alabama,Alabama red-bellied turtle,Pseudemys alabamensis,1990,Endangered,A red-bellied turtle with its limbs retracted ...,-
1,Arizona,Arizona ridge-nosed rattlesnake,Crotalus willardi subspecies willardi,1986,Least Concern,An Arizona Ridge-Nosed Rattlesnake somewhat co...,-
2,California,Desert tortoise,Gopherus agassizii,1972,Vulnerable,A desert tortoise standing on dry and cracked ...,-
3,Colorado,Western painted turtle,Chrysemys picta subspecies bellii,2008,Least Concern,A western painted turtle facing away from the ...,-
4,Florida,American alligator (state reptile),Alligator mississippiensis,1987,Least Concern,A large American alligator standing half on sa...,-
5,Florida,Loggerhead sea turtle (state saltwater reptile),Caretta caretta,2008,Vulnerable,A loggerhead sea turtle swimming in an aquarium.,-
6,Florida,Gopher tortoise (state tortoise),Gopherus polyphemus,2008,Vulnerable,An abraded tortoise walking on sandy ground.,-
7,Georgia,Gopher tortoise,Gopherus polyphemus,1989,Vulnerable,An abraded tortoise walking on sandy ground.,-
8,Illinois,Painted turtle,Chrysemys picta,2005,Least Concern,A midland painted turtle standing on rocky gro...,-
9,Kansas,Ornate box turtle,Terrapene ornata,1986,Near Threatened,An ornate box turtle with a slightly dirty car...,-


In [38]:


print('Query: {}'.format(query_text))
print('Answer: {}'.format(answer_text))
sorted_tables[-2]

Query: In what films did Pooja Ramachandran play Cathy?
Answer: Pooja Ramachandran starred as Cathy in Kadhalil Sodhappuvadhu Yeppadi and its Telugu version Love Failure.


Unnamed: 0,Year,Film,Role,Language,Notes
0,2002,Yathrakarude Sradhakku,-,Malayalam,-
1,2012,Kadhalil Sodhappuvadhu Yeppadi,Cathy,Tamil,-
2,2012,Love Failure,Cathy,Telugu,-
3,2012,Nanban,Jeeva's Wife,Tamil,-
4,2012,Pizza,Smitha,Tamil,-
5,2013,Swamy Ra Ra,Bhanu,Telugu,-
6,2013,Lucky Star,Swapna,Malayalam,-
7,2013,D Company,Teena,Malayalam,-
8,2014,Adavi Kaachina Vennela,-,Telugu,-
9,2015,Nannbenda,-,Tamil,-


In [37]:
sorted_context[-2]

{'table_page_title': 'Pooja Ramachandran',
 'table_section_title': 'Filmography'}