In [1]:
!pip install metapy
import metapy

Collecting metapy
[?25l  Downloading https://files.pythonhosted.org/packages/81/a4/92dae084446597d6bbf355e7eaff3e83dcb51e33d434f43ecdea4c0c4b0a/metapy-0.2.13-cp36-cp36m-manylinux1_x86_64.whl (14.3MB)
[K     |████████████████████████████████| 14.3MB 315kB/s 
[?25hInstalling collected packages: metapy
Successfully installed metapy-0.2.13


In [2]:
# Fetch the Lemur stop-word list that the index configuration refers to via
# its `stop-words` setting. `-nc` (no-clobber) leaves an already-downloaded
# copy untouched, so re-running this cell is cheap.
!wget -nc https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt
# Disabled: download and unpack of a different (COVID IR) dataset archive.
# NOTE(review): looks like a leftover from another notebook — confirm it can be deleted.
#!wget -N http://www-personal.umich.edu/~shiyansi/covid_ir.tar.gz
#!tar xf covid_ir.tar.gz

--2020-12-16 17:42:22--  https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2747 (2.7K) [text/plain]
Saving to: ‘lemur-stopwords.txt’


2020-12-16 17:42:23 (44.8 MB/s) - ‘lemur-stopwords.txt’ saved [2747/2747]



In [39]:
# Corpus description: stored next to the corpus data inside the dataset folder.
with open('critics/tutorial.toml', 'w') as f:
    f.writelines((
        'type = "line-corpus"\n',
        'store-full-text = true\n',
    ))

# Main MeTA configuration: where the dataset lives, where the index goes,
# which stop-word list to use and how documents are tokenized.
config = """prefix = "." # tells MeTA where to search for datasets

dataset = "critics" # a subfolder under the prefix directory
corpus = "tutorial.toml" # a configuration file for the corpus specifying its format & additional args

index = "critics-idx" # subfolder of the current working directory to place index files

query-judgements = "critics/doesntmatter.txt" # file containing the relevance judgments for this dataset

stop-words = "lemur-stopwords.txt"

[[analyzers]]
method = "ngram-word"
ngram = 1
filter = "default-unigram-chain"
"""
with open('critics-config.toml', 'w') as f:
    # end='' keeps the file content exactly equal to `config`.
    print(config, end='', file=f)

### Build the inverted index with metapy

In [40]:
# Create the inverted index described by 'critics-config.toml' (written in an
# earlier cell); every ranker below scores queries against this object.
# NOTE(review): presumably an existing on-disk index is reused rather than
# rebuilt, so stale index files may need deleting after the corpus changes — confirm.
inv_idx = metapy.index.make_inverted_index('critics-config.toml') 

In [6]:
# You can define your own retrieval function 
import math 
class MyPivoted(metapy.index.RankingFunction):
    """Pivoted length normalization ranking function.

    The per-term score is

        qtw * (1 + ln(1 + ln(tf))) / (1 - b + b * dl / avg_dl) * ln((N + 1) / df)

    where ``b`` sets how strongly the document length penalises the score.
    """

    def __init__(self, b=0.1):
        self.b = b
        # The base class __init__() *must* be invoked.
        super(MyPivoted, self).__init__()

    def score_one(self, sd):
        """Return the score contribution of one query term for one document.

        Fields of ``sd`` read here:

        sd.query_term_weight: query term count (or weight in case of feedback)
        sd.doc_term_count: number of times the term appears in the current document
        sd.doc_size: length of the current document
        sd.avg_dl: average document length of the collection
        sd.num_docs: total number of documents in the index
        sd.doc_count: number of documents that the term appears in
        """
        slope = self.b

        # Sub-linear (double-log) transformation of the term frequency.
        damped_tf = 1 + math.log(1 + math.log(sd.doc_term_count))
        # Pivoted document-length normalizer.
        length_norm = 1 - slope + slope*(sd.doc_size/sd.avg_dl)
        # Inverse document frequency.
        idf = math.log((sd.num_docs+1)/sd.doc_count)

        # Same left-to-right evaluation order as the textbook formula above.
        return sd.query_term_weight*damped_tf/length_norm*idf

        

In [7]:
# You can define your own retrieval function 
import math 
class MyBM25Reimplementation(metapy.index.RankingFunction):
    """Hand-written Okapi BM25 ranking function.

    Parameters
    ----------
    b : document-length normalization strength
    k1 : saturation constant for the in-document term frequency
    k3 : saturation constant for the query term weight
    """

    def __init__(self, b=0.5, k1=1.2, k3=500):
        self.b = b
        self.k1 = k1
        self.k3 = k3
        # The base class __init__() *must* be invoked.
        super(MyBM25Reimplementation, self).__init__()

    def score_one(self, sd):
        """Return the BM25 score contribution of one query term for one document.

        The score is the product ``idf * tf_part * query_part``.

        Fields of ``sd`` read here:

        sd.num_docs: total number of documents in the index
        sd.doc_count: number of documents that the term appears in
        sd.doc_term_count: number of times the term appears in the current document
        sd.doc_size: total number of terms in the current document
        sd.avg_dl: average document length of the collection
        sd.query_term_weight: query term count (or weight in case of feedback)

        Other fields listed by the original notes as available on ``sd`` but
        not used here: total_terms, query_length, corpus_term_count,
        doc_unique_terms.
        """
        b, k1, k3 = self.b, self.k1, self.k3

        # Inverse document frequency. This form turns negative once the term
        # occurs in more than half of the documents (doc_count > num_docs/2).
        idf = math.log((sd.num_docs-sd.doc_count+0.5)/(sd.doc_count + 0.5))

        # Saturating, length-normalized term frequency.
        tf_scaled = (k1+1)*sd.doc_term_count
        tf_norm = k1*(1-b+b*sd.doc_size/sd.avg_dl)+sd.doc_term_count
        tf_part = tf_scaled/tf_norm

        # Saturating query term weight.
        query_part = (k3+1)*sd.query_term_weight/(k3+sd.query_term_weight)

        return idf*tf_part*query_part

In [117]:
# Genre / gameplay keywords that should weigh more in a query. Stored in lower
# case so that lookups can ignore the capitalisation used in the query.
_IMPORTANT_TERMS = frozenset({
    "racing", "races", "shooter", "indie", "independent", "rpg", "strategy",
    "adventure", "tactical", "platform", "platformer", "metroidvania", "fps",
    "sport", "sports", "open", "world", "open-world", "turn-based",
})


def refine_query(text):
    """Boost genre keywords in a query by repeating them.

    The query is split on whitespace. Every word that is a known genre or
    gameplay keyword (compared case-insensitively, so "RPG", "Rpg" and "rpg"
    all count) is appended once more at the end of the query, in its original
    spelling and order, which raises its weight for count-based rankers.

    Args:
        text: raw query string; leading/trailing whitespace and newlines are
            ignored.

    Returns:
        The query words followed by the repeated keywords, joined by single
        spaces. An empty or whitespace-only input yields "".
    """
    words = text.split()
    boosted = [word for word in words if word.lower() in _IMPORTANT_TERMS]
    return ' '.join(words + boosted)

In [8]:
# Built-in Okapi BM25 ranker shipped with metapy.
ranker = metapy.index.OkapiBM25(k1 = 1.2, b = 0.5, k3 = 500)
# Hand-written BM25 defined in this notebook.
# NOTE(review): b is 0.75 here but 0.5 for the built-in ranker above, so the
# two rankers are not directly comparable — confirm this is intentional.
my_ranker_bm25 = MyBM25Reimplementation(k1 = 1.2, b = 0.75, k3 = 500)
# Hand-written pivoted length normalization ranker defined in this notebook.
my_ranker_pivoted = MyPivoted(b=0.1)

In [118]:
import pandas as pd


def _unique_stripped(texts):
    """Return the stripped, non-empty texts without duplicates, in first-seen order."""
    seen = set()
    unique = []
    for text in texts:
        if text is None:  # document without stored content
            continue
        cleaned = text.strip()
        # Compare the *stripped* text so that copies differing only in
        # surrounding whitespace are recognised as duplicates.
        if cleaned and cleaned not in seen:
            seen.add(cleaned)
            unique.append(cleaned)
    return unique


def _find_game_position(critics, review):
    """Return the position of the first entry in `critics` containing `review`, or None."""
    for position, reviews in enumerate(critics):
        # Empty CSV cells are loaded by pandas as NaN (a float); skip them
        # instead of failing on the substring test.
        if isinstance(reviews, str) and review in reviews:
            return position
    return None


# Read scraped data
df_data = pd.read_csv('SCRAPE_EXAMPLE.csv')
# Specify number of results to show
num_results = 1
# (query number, document id) pairs collected per ranker
custom_ranking_retrieval_results = []
my_custom_ranking_retrieval_results = []
my_pivoted_results = []
with open('critics/critics-queries.txt') as query_file:
    for query_num, line in enumerate(query_file):
        query = metapy.index.Document()
        # Repeat genre keywords in the query so that they weigh more
        preprocessed_query = refine_query(line)
        query.content(preprocessed_query)

        # Score the query with the built-in and the two hand-written rankers
        results = ranker.score(inv_idx, query, num_results)
        my_results = my_ranker_bm25.score(inv_idx, query, num_results)
        pivoted_results = my_ranker_pivoted.score(inv_idx, query, num_results)

        custom_ranking_retrieval_results += [(query_num + 1, x[0]) for x in results]
        my_custom_ranking_retrieval_results += [(query_num + 1, x[0]) for x in my_results]
        my_pivoted_results += [(query_num + 1, x[0]) for x in pivoted_results]

        print("Query:", line)

        # Only the built-in ranker's hits are displayed
        content_list = [inv_idx.metadata(d_id).get('content') for d_id, _ in results]
        unique_list = _unique_stripped(content_list)

        for searchfor in unique_list:
            # Locate the game whose critic reviews contain the retrieved review
            final_index = _find_game_position(df_data["Critics"], searchfor)
            if final_index is None:
                # Do not fall back to the first row: that would present an
                # unrelated game as the match.
                print("No matching game found for review:", searchfor)
            else:
                # Game attributes
                game_title = df_data["Game_names"].iloc[final_index]
                release_date = df_data["Release_dates"].iloc[final_index]
                meta_score = df_data["Meta_scores"].iloc[final_index]
                user_score = df_data["User_scores"].iloc[final_index]
                summary = df_data["Summaries"].iloc[final_index]
                # Print result
                print("Game:", game_title)
                print("Release date:", release_date)
                print("Metascore:", meta_score)
                print("User's score:", user_score)
                # Rows are assumed to be stored in Metacritic ranking order — TODO confirm
                print("Metacritic ranking:", final_index + 1)
                # str() guards against a missing (NaN) summary
                print("Summary:", str(summary).strip())
                print("Most similar review:", searchfor)
            print("\n")
            print("-------------------------------------------------------------")
            print("\n")
      

Query: Zelda

Game: The Legend of Zelda: Breath of the Wild
Release date: March 3, 2017
Metascore: 97
User's score: 8.6
Metacritic ranking: 1
Summary: Forget everything you know about The Legend of Zelda games. Step into a world of discovery, exploration and adventure in The Legend of Zelda: Breath of the Wild, a boundary-breaking new game in the acclaimed series. Travel across fields, through forests and to mountain peaks as you discover what has become of the ruined kingdom of Hyrule in this open-air adventure. Explore the wilds of Hyrule any way you like - Climb up towers and mountain peaks in search of new destinations, then set your own path to get there and plunge into the wilderness. Along the way, you'll battle towering enemies, hunt wild beasts and gather ingredients for the food and elixirs you'll need to sustain you on your journey. More than 100 Shrines of Trials to discover and explore - Shrines dot the landscape, waiting to be discovered in any order you want. Search for 