In [232]:
import io
import pandas as pd
import requests
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import get_dataset_dir
from collections import defaultdict
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import pdb


# Used twice in rnk_surprise: as `min_support` for the similarity computation
# and as the number of top-ranked films kept for the user.
min_films = 5
# Passed as both `k` and `min_k` to KNNWithMeans.
knn = 4

In [233]:
def get_info(file_name=None):
    """Load film titles and release dates from a MovieLens-100k item file.

    Parameters
    ----------
    file_name : str, optional
        Path to a pipe-separated item file whose first three fields are the
        raw item id, the title and the release date. When omitted, the
        ``u.item`` file inside the Surprise dataset directory is used.

    Returns
    -------
    dict
        Maps the raw item id (str) to a ``(title, release_date)`` tuple.
    """
    if file_name is None:
        file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'

    info = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            fields = line.rstrip('\n').split('|')
            if len(fields) < 3:
                # Blank or truncated line: nothing usable, and indexing it
                # would raise IndexError.
                continue
            info[fields[0]] = (fields[1], fields[2])
    return info

In [234]:
def _split_title(title):
    """Split a title such as ``'Big Lebowski, The (1998)'`` into name and year.

    Returns a ``(name, year)`` tuple where ``year`` is a 4-digit string, or
    ``None`` when the title contains no 4-digit number at all.
    """
    # Prefer a parenthesised year so that digits inside the name itself
    # (e.g. "2001: A Space Odyssey (1968)") are not mistaken for the year.
    bracketed = re.findall(r"\((\d{4})\)", title)
    if bracketed:
        year = bracketed[-1]
    else:
        loose = re.findall(r"\d{4}", title)
        if not loose:
            return None
        year = loose[0]

    # Start from the full title so the name is always defined, even when
    # none of the patterns below occur.
    name = title
    at = name.find(" (")
    if at != -1:
        name = name[:at]

    # Drop a trailing article that the dataset moves behind a comma.
    # TODO: cut at ", " instead?
    for article in (", The", ", Das"):
        at = name.find(article)
        if at != -1:
            name = name[:at]

    return name, year


def rnk_surprise(usr):
    """Print the top predicted films for one user and return them for lookup.

    Fits a user-based KNNWithMeans model with cosine similarity on the full
    ml-100k dataset, predicts ratings for every film the user has not rated,
    and keeps the ``min_films`` highest predictions.

    Parameters
    ----------
    usr : str or int
        Raw user id; it is converted to ``str`` before being compared with
        the raw ids of the dataset.

    Returns
    -------
    list
        One ``[film_name, year]`` pair (both strings) per recommended film.
        Films whose title carries no year are reported and left out.
    """
    # Raw ids are compared by equality below, so an int id would never match.
    usr = str(usr)

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': True, 'min_support': min_films}
    algo = KNNWithMeans(k=knn, min_k=knn, sim_options=sim_options)
    algo.fit(trainset)

    # Keep only the (user, item) pairs of the requested user.
    testset = [entry for entry in trainset.build_anti_testset() if entry[0] == usr]
    predictions = algo.test(testset)

    user_rnk = [(iid, round(est, 3)) for _, iid, _, est, _ in predictions]
    user_rnk.sort(key=lambda x: x[1], reverse=True)
    top_films = user_rnk[:min_films]

    info = get_info()

    query = []
    row_format = "{:^10} {:<70} {:^1}"

    print("User " + str(usr))
    for film_rid, rnk in top_films:
        print(row_format.format(film_rid, str(info[film_rid]), rnk))

        parsed = _split_title(info[film_rid][0])
        if parsed is None:
            # Without a year the later lookup cannot tell films apart.
            print("  (skipped: no release year found in title)")
            continue

        query.append(list(parsed))
    return query

In [235]:
# Raw user ids are matched by string equality, so stray whitespace around the
# typed id would silently produce no recommendations.
usr = input("Enter user id: ").strip()
query = rnk_surprise(usr)

Enter user id: 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
User 1
   302     ('L.A. Confidential (1997)', '01-Jan-1997')                            4.953
   902     ('Big Lebowski, The (1998)', '26-Dec-1997')                            4.87
   1367    ('Faust (1994)', '01-Jan-1994')                                        4.761
   516     ('Local Hero (1983)', '01-Jan-1983')                                   4.76
   1449    ('Pather Panchali (1955)', '22-Mar-1996')                              4.736


In [236]:
API_ENDPOINT = "https://www.wikidata.org/w/api.php"
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
# The return format is the same for every query, so set it once.
sparql.setReturnFormat(JSON)

# Page size requested from wbsearchentities; also the step of the `continue`
# offset, so that consecutive requests neither overlap nor skip results.
SEARCH_PAGE_SIZE = 7
# Seconds to wait for the search API before giving up on a request.
REQUEST_TIMEOUT = 30


for film_name, film_year in query:
    # Page through the search results until an entity whose description
    # mentions the release year turns up, or the results run out.
    movie_code = False
    offset = 0
    while not movie_code:
        params = {
            'action': 'wbsearchentities',
            'format': 'json',
            'language': 'en',
            'limit': SEARCH_PAGE_SIZE,
            'continue': offset,
            'search': film_name,
        }
        res = requests.get(API_ENDPOINT, params=params, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        # A payload without a 'search' key is treated as "no more results".
        rslt = res.json().get('search', [])

        if not rslt:
            print("")
            print(film_name + ", " + film_year + " has a different publication year or a different name on Wikidata!")
            break

        for candidate in rslt:
            # Entities without a description omit the key entirely.
            if film_year in candidate.get('description', ''):
                movie_code = candidate['id']
                break
        offset += SEARCH_PAGE_SIZE

    if not movie_code:
        continue

    # Select every film (P31 = Q11424) that shares a screenwriter (P58) with
    # the matched entity.
    spaqrql_query = """
        SELECT DISTINCT ?film ?filmLabel
        WHERE 
        {
          {
            SELECT ?screenwriter_req
            WHERE
            {
              wd:""" + movie_code + """ wdt:P58 ?screenwriter_req.
            }
          }

          ?film wdt:P31 wd:Q11424.
          ?film wdt:P58 ?screenwriter.

          FILTER(?screenwriter = ?screenwriter_req)
          SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
        }
        ORDER BY (?filmLabel)
        """

    sparql.setQuery(spaqrql_query)
    results = sparql.query().convert()
    print("")

    bindings = results['results']['bindings']
    if bindings:
        # pd.io.json.json_normalize is deprecated; the top-level function is
        # the supported spelling.
        results_df = pd.json_normalize(bindings)
        print("All films from screenwriters of " + film_name + ", " + film_year + ":")
        print(results_df[['film.value', 'filmLabel.value']])
    else:
        print(film_name + ", " + film_year + " doesn't have a screenwriter on Wikidata!")


All films from screenwriters of L.A. Confidential, 1997:
                                  film.value  \
0    http://www.wikidata.org/entity/Q3599091   
1    http://www.wikidata.org/entity/Q3600228   
2     http://www.wikidata.org/entity/Q114076   
3    http://www.wikidata.org/entity/Q1364051   
4     http://www.wikidata.org/entity/Q739498   
5     http://www.wikidata.org/entity/Q598338   
6    http://www.wikidata.org/entity/Q1464230   
7     http://www.wikidata.org/entity/Q506661   
8    http://www.wikidata.org/entity/Q1166150   
9   http://www.wikidata.org/entity/Q50276507   
10    http://www.wikidata.org/entity/Q632328   
11   http://www.wikidata.org/entity/Q1540481   
12    http://www.wikidata.org/entity/Q339876   
13  http://www.wikidata.org/entity/Q17182571   
14    http://www.wikidata.org/entity/Q973722   
15    http://www.wikidata.org/entity/Q258009   
16    http://www.wikidata.org/entity/Q221586   
17    http://www.wikidata.org/entity/Q279057   
18  http://www.wikidata.org/en