# Packages importieren

In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import pickle

from nltk import word_tokenize

import numpy as np
import json

import math

In [3]:
with open('./data/the_index.pickle', 'rb') as handle:
    the_index = pickle.load(handle)

In [4]:
with open('./data/documents.pickle', 'rb') as handle:
    documents = pickle.load(handle)

In [5]:
with open('./data/docid_counter.pickle', 'rb') as handle:
    docid_counter = pickle.load(handle)

In [6]:
with open('./data/games_reviewed_en.pickle', 'rb') as handle:
    games_reviewed = pickle.load(handle)

# Vector Space Model - Funktionen

In [7]:
def tf(term, docid):
    '''
    Calculate term frequency for term in docid. Return 0 if term not in index,
    or term does not appear in document.
    '''
    if docid in the_index[term]['docs'].keys():
        return the_index[term]['docs'][docid]
    else:
        return 0 

def df(term):
    '''
    Extract frequency of term for document with id docid from index
    '''
    if term in the_index.keys():
        return the_index[term]['df']
    else:
        return 0

def idf(term):
    '''
    Compute idf_t for a term
    '''
    df_t = df(term)
    return math.log(docid_counter/df_t)

def tf_idf(term, docid):
    '''
    Compute tf-idf for term and docid
    '''
    # TODO: implement
    return tf(term,docid)*idf(term)

def norm_cosine(docid):
    '''
    Compute cosine normalization for docid
    '''
    sos = 0
    for term in the_index.keys():
        if docid in the_index[term]['docs'].keys():
            sos += tf_idf(term,docid)**2
    return math.sqrt(sos)

In [8]:
def cosine_score(query, K=3):
    '''
    Compute cosine scores for query `query` and return the top K results
    according to Figure 6.14.
    '''
    # Initialize arrays so we can use docids as indices
    scores = [0] * (len(documents.keys())+1)
    length = [0] * (len(documents.keys())+1)
    result = [0] * (len(documents.keys())+1)
    # Precompute length array values -- these are the normalization factors
    for d in documents.keys():
        length[d] = norm_cosine(d)
    # print('length array: ', length)
    # TODO: implement
    # 1. compute scores for each document, store in scores array
    for word in word_tokenize(query):
        if word in the_index.keys():
            for document_id in the_index[word]['docs']:
                score = tf_idf(word, document_id)
                print(score)
                scores[document_id] += score
    # 2. Normalize using the length array
    #result = np.array(scores) / np.array(length)
    # print(f'scores array {scores}')
    for i in range(1,len(result)):
        result[i] = scores[i]/length[i]
    # 3. Return top-K: sort by descending score and return first K elements of result.
    print(f'result array: {result}')
    top_docs = np.array(result).argsort()[-K:][::-1]
    result = np.array(result)
    result[::-1].sort()
    top_scores = result[:K]
    return list(zip(top_docs, top_scores))


In [9]:
def print_result(results, df):
    """[summary]

    Args:
        results ([type]): [description]

    Returns:
        [type]: [description]
    """
    
    # if not results:
    #     print("No documents found")
    #     return_result = {
    #         "filename": json.loads(0),
    #         "file_url": json.loads(0),
    #         "docid": json.loads(0),
    #         "score": json.loads(0),
    #         "document": json.loads(0)
    #     }
    
    # If we got some results, print them
    # Multiple documents might be accurate, for the beginning we only return the one with the highest score.
    # else:

    result_dict = {}
    for i in range(1,1+len(results)):
        # print(results)
        docid, score = results[i-1]
        if score > 0:
            # print(docid)
            # print(score)
            # db = MongoClient().file_db

            # found_filename = db.documents.find_one({'content':documents[docid]})['filename']
            # file_url = db.files.find_one({'filename':found_filename})['file_url']
            # print(f'There is a document in the documents database with the filename: {found_filename} and corresponding url: {file_url}')

            return_result = {
                "game": df[df["docid"] == docid]["title"].values[0],
                "docid": json.dumps(int(docid)),
                "score": json.dumps(float(score))
            }

            result_dict[i] = return_result
        else:
            # print("No documents found")
            # return_result = {
            #     "filename": json.loads(0),
            #     "file_url": json.loads(0),
            #     "docid": json.loads(0),
            #     "score": json.loads(0),
            #     "document": json.loads(0)
            # }
            print("No documents found")
            return_result = {
                "game": '',
                "docid": '',
                "score": ''
            }
            result_dict[i] = return_result
            break


    return result_dict

In [10]:
tmp = cosine_score("really good story", K=3)

0.38303652976916314
1.7023845767518362
22.301237955449054
0.2979173009315713
0.38303652976916314
0.0851192288375918
135.97796806805292
1.361907661401469
0.7235134451195303
1.6172653479142443
0.6809538307007345
0.1702384576751836
0.1702384576751836
0.38303652976916314
0.0425596144187959
0.9363115172135099
299.70480473716077
0.8511922883759181
1.2767884325638772
0.38303652976916314
0.7660730595383263
128.18955862941326
0.6809538307007345
0.5107153730255508
0.42559614418795905
0.1702384576751836
479.817092957505
0.2979173009315713
9.746151701904262
0.5107153730255508
178.11198634266086
294.34229332039246
0.0425596144187959
4.9369152725803245
0.0425596144187959
0.893751902794714
1.4044672758202648
2.298219178614979
0.0425596144187959
0.1276788432563877
0.7660730595383263
9.66103247306667
0.3404769153503672
0.21279807209397952
0.1276788432563877
0.6383942162819386
0.7831023929066672
4.0025233415229655
33.760414271976316
0.8701137698962969
0.17402275397925937
0.8701137698962969
293.663397340

In [11]:
topN = print_result(tmp, games_reviewed)

In [12]:
print(topN)

{1: {'game': 'Grand Theft Auto V', 'docid': '18', 'score': '0.06484940153681608'}, 2: {'game': 'MONSTER HUNTER: WORLD', 'docid': '23', 'score': '0.047761440501625084'}, 3: {'game': 'ASTRONEER', 'docid': '3', 'score': '0.04043530366381354'}}
