In [580]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import requests
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

In [581]:
bx_users = pd.read_csv("data/BX-Users.csv", on_bad_lines='skip', sep=';')
bx_books = pd.read_csv("data/BX-Books.csv", on_bad_lines='skip')
bx_books_ratings = pd.read_csv("data/BX-Book-Ratings.csv", on_bad_lines='skip', sep=';')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [582]:
bx_books_ratings = bx_books_ratings[bx_books_ratings['Book-Rating'] != 0]

In [583]:
books_show = bx_books
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

def show_rating(urls):
    for idx, chunk in enumerate(chunks(urls, 10)):
        fig = plt.figure(figsize=(20, 4))
        for n, i in enumerate(chunk):
            try:
                r = requests.get(i)
                im = Image.open(BytesIO(r.content))

            except:
                print('Something went wrong with {url}'.format(url=i))
                continue

            a = fig.add_subplot(1, urls.size, n + 1)
            plt.imshow(im)
            plt.axis('off')
        plt.show()

### 1. Реализовать персональный топ  - принимает на вход возраст и локацию, на выходе персональный топ   - 1 балл

Персональный топ - это топ товаров по похожим возрасту/интересам/локации. Как сделать? Разбить на сегменты по выбраным признакам. Топ делать по книгам с хорошим средним рейтингом.


In [584]:
#Выводит топ лучших из dataframe по оценке
def getTopItems(dataframe, id_name="ISBN", rating_column_name="Book-Rating", top_n=10, min_count=2):
    vc = dataframe[id_name].value_counts()
    ll = dataframe[dataframe[id_name].isin(vc[vc>min_count].index)]
    ll = ll[[id_name, rating_column_name]].groupby(id_name).mean()
    ll = ll.sort_values(rating_column_name, ascending=False)[:top_n]
    ll["id"] = dataframe[id_name]
    return ll

In [585]:
def rec_user(location, age, dataframe, rating_dataframe, age_column_name='Age', location_column_name='Location', user_id_column_name = 'User-ID', top_n=10, min_count=2):
    by_location = dataframe[dataframe[location_column_name].isin([location]) | dataframe[age_column_name].isin([age])][user_id_column_name]
    rating_location = rating_dataframe[rating_dataframe[user_id_column_name].isin(by_location)]
    return getTopItems(rating_location, top_n=top_n, min_count=min_count)

In [619]:
rec = rec_user("castellar del valles, barcelona, spain", 20, bx_users, bx_books_ratings, top_n=10)

KeyError: 'Location'

In [620]:
rec

Unnamed: 0_level_0,Book-Rating,id
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
451524934,9.8,
679781587,9.75,
875421180,9.333333,
345337662,9.333333,
345339711,9.333333,
140378278,9.0,
316776963,8.75,
671027344,8.714286,
440206154,8.666667,
440224764,8.666667,


In [587]:
show_rating(rec)

Something went wrong with Book-Rating
Something went wrong with id


<Figure size 1440x288 with 0 Axes>

### 2. На основе метода кластеризации похожих пользователей построить рекомендации (Слайд 27) - 3 балла

Нужно топ-10 рекомендаций с самой высокой оценкой. Считаем среднюю оценку для каждой книги по кластеру и выводим топ-10 книг.


In [588]:
le = LabelEncoder()
# bx_books_ratings['ISBN'] = le.fit_transform(bx_books_ratings['ISBN'])
# bx_books['ISBN'] = le.fit_transform(bx_books['ISBN'])

In [589]:
bx_users["Location"] = bx_users['Location'].str.split(",")

In [590]:
bx_users = bx_users.drop(bx_users[bx_users['Location'].apply(lambda x: len(x) <= 2)].index)

In [591]:
location = bx_users['Location']

In [592]:
city = [location.iloc[i][0] for i in range(location.shape[0])]
state = [location.iloc[i][1] for i in range(location.shape[0])]
country = [location.iloc[i][2] for i in range(location.shape[0])]

In [593]:
bx_users["city"] = pd.Series(city)
bx_users["state"] = pd.Series(state)
bx_users["country"] = pd.Series(city)

In [594]:
bx_users = bx_users.drop("Location", axis=1)

In [595]:
city_encoder = LabelEncoder()
state_encoder = LabelEncoder()
country_encoder = LabelEncoder()
bx_users["city"] = city_encoder.fit_transform(bx_users["city"])
bx_users["state"] = state_encoder.fit_transform(bx_users["state"])
bx_users["country"] = country_encoder.fit_transform(bx_users["country"])

In [596]:
bx_users["Age"] = bx_users["Age"].fillna(bx_users["Age"].median())

In [597]:
n_clusters = 10
users_kmeans = bx_users.drop("User-ID", axis=1)
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(users_kmeans)

KMeans(n_clusters=10)

In [598]:
np.bincount(kmeans.labels_)

array([31102, 27768, 26734, 29228, 25888, 26240, 24292, 24515, 31427,
       31660])

In [599]:
claster_numbers = kmeans.labels_
bx_users["claster"] = pd.Series(claster_numbers)

In [600]:
def user_recomendation(user):
    cluster = kmeans.predict([user])
    cluster_ids = bx_users[bx_users["claster"] == cluster[0]]
    cluster_rating = bx_books_ratings[bx_books_ratings["User-ID"].isin(cluster_ids["User-ID"])]
    return getTopItems(cluster_rating)

In [601]:
def rec_user_cluster(city, state, country, age):
    user = [city_encoder.transform([city])[0], state_encoder.transform([state])[0], country_encoder.transform([country])[0], age]
    return user_recomendation(user)

In [602]:
cluster_books = rec_user_cluster("castellar del valles", " barcelona", "spain", 20)

In [603]:
show_rating(cluster_books["id"])

Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan
Something went wrong with nan


<Figure size 1440x288 with 0 Axes>

### 3. Совстречаемость - 3 балла

В совстречаемости также учитывать оценки. Вес пары книг встретившихся у пользователя - полусумма их оценок.

In [None]:
class Recomendations():
    def __init__(self, Int):
        # Создаём словарь для того, чтобы по product_id возвращать url товара
        self.product_id_to_url = {}
        for i in Int[["product_id", "picture_url"]].drop_duplicates().values:
            self.product_id_to_url[i[0]] = i[1]
        self.interactions = Int

    def coocurrency_count(self):
        Int = self.interactions[["vid", "product_id"]].drop_duplicates()
        # Для каждого пользователя собираем товары, с которыми он взамиодействовал
        user_products = Int.groupby(["vid"])["product_id"].apply(list).reset_index()
        # Считаем число товаров, с которыми взаимодействовал каждый пользователь и записываем в колонку prod_num
        product_num = [len(i) for i in user_products["product_id"]]
        user_products["prod_num"] = product_num
        # Фильтруем пользователей, взаимодействовавших с одним товаром
        user_products = user_products[user_products["prod_num"] > 1]

        # Заводим словарь под товары
        cooc = {}
        #print(user_products)
        for i in tqdm.tqdm_notebook(user_products.values):
            for j in range(len(i[1])):
                for k in range(len(i[1])):
                    if j != k:
                        jRate = interactions[(interactions['product_id'] == i[1][j]) & (interactions['vid'] == i[0]) ]['Book-Rating']
                        kRate = interactions[(interactions['product_id'] == i[1][k]) & (interactions['vid'] == i[0]) ]['Book-Rating']
                        try:
                            cooc[str(i[1][j]) + "_" + str(i[1][k])] += (jRate+kRate)/2
                        except:
                            cooc[str(i[1][j]) + "_" + str(i[1][k])] = (jRate+kRate)/2
        cooc_list = []
        for i, j in cooc.items():
            # Если товары встетились вместе больше одного раза, то добавляем в список
            if j > 1:
                cooc_list.append(i.split("_") + [j])
        self.cooc_rec = pd.DataFrame(cooc_list, columns=["item1", "item2", "measure"])

    def get_rec(self, i, show=False):
        recs = self.cooc_rec[self.cooc_rec["item1"] == str(i)]
            .sort_values("measure", ascending=False)
            .head(10)
        print(u"Для товара")
        rec_imaging([i], self.product_id_to_url)
        print(u"Такие рекомендации")
        rec_imaging(recs["item2"].values.astype(int), self.product_id_to_url,
                    recs["measure"].values.astype(int))

In [None]:
cooc_rec = Recomendations(interactions)
cooc_rec.coocurrency_count()

### 4. Коллаборативная фильтрация - 3 балла

Коллаборативную фильтрацию реализовывать как на слайде 51 презентации, посоветовав каждому пользователю топ-10 книг с самой высокой оценкой. Сделать рекомендации User-based и Item-based и сравнить.

Если совсем сложно - можно сделать как в семинарской части, поставив оценку "0", если рейтинг < 5 и "1" - в противном случае. Тогда максимум за это - 1 балл. Реализовать U2I и I2I рекомендации.

In [604]:
books_meets = bx_books_ratings.groupby("ISBN")["User-ID"].count().reset_index().rename(columns={"User-ID": "user_num"})
user_meets = bx_books_ratings.groupby("User-ID")["ISBN"].count().reset_index().rename(columns={"ISBN": "books_num"})

In [605]:
interactions = bx_books_ratings.merge(books_meets, on=["ISBN"]).merge(user_meets, on=["User-ID"])

In [606]:
interactions = interactions[(interactions["user_num"] > 5) &
                            (interactions["books_num"] > 5) &
                            (interactions["books_num"] < 200)]

In [607]:
interactions = interactions.merge(bx_books[["ISBN", "Image-URL-M", "Book-Title"]].rename(
    columns={"Image-URL-M": "picture_url"}), on=["ISBN"])

In [608]:
interactions

Unnamed: 0,User-ID,ISBN,Book-Rating,user_num,books_num,picture_url,Book-Title
0,86583,3404139178,9,8,18,http://images.amazon.com/images/P/3404139178.0...,Das Lacheln der Fortuna: Historischer Roman
1,132500,3404139178,10,8,43,http://images.amazon.com/images/P/3404139178.0...,Das Lacheln der Fortuna: Historischer Roman
2,66483,3404139178,10,8,83,http://images.amazon.com/images/P/3404139178.0...,Das Lacheln der Fortuna: Historischer Roman
3,276866,3404139178,9,8,11,http://images.amazon.com/images/P/3404139178.0...,Das Lacheln der Fortuna: Historischer Roman
4,106534,3404139178,6,8,6,http://images.amazon.com/images/P/3404139178.0...,Das Lacheln der Fortuna: Historischer Roman
...,...,...,...,...,...,...,...
12631,183213,3499137909,10,7,8,http://images.amazon.com/images/P/3499137909.0...,Der Plan von der Abschaffung des Dunkels.
12632,140226,3499137909,5,7,10,http://images.amazon.com/images/P/3499137909.0...,Der Plan von der Abschaffung des Dunkels.
12633,206652,1860465811,9,7,10,http://images.amazon.com/images/P/1860465811.0...,Wind-Up Bird Chronicle
12634,27100,3442760003,8,7,6,http://images.amazon.com/images/P/3442760003.0...,Liebesleben


In [609]:
interactions["product_id"] = le.fit_transform(interactions["ISBN"])
userLe = LabelEncoder()
userLe.fit(interactions["User-ID"])
interactions["vid"] = userLe.transform(interactions["User-ID"])

In [610]:
csr_rates = coo_matrix((interactions["Book-Rating"], (interactions["vid"], interactions["product_id"])),
                       shape=(len(set(interactions["vid"])), len(set(interactions["product_id"]))))

In [611]:
csr_array = csr_rates.toarray()

In [612]:
id_book = interactions.drop_duplicates(["Book-Title"])

In [613]:
class Colloborative():
    def __init__(self):
        self.csr_rates = csr_rates

    def user_based(self, idx):
        user_rates = self.csr_rates.getrow(idx).toarray()[0]
        watched_items = np.where(user_rates != 0)[0]
        # считаем косинус между пользователем idx и всеми пользователями
        metrics = cosine_similarity([user_rates], self.csr_rates).reshape(-1, 1)
        # домножаем оценки пользователя на коэффициент похожести
        rates = csr_matrix.multiply(self.csr_rates.copy(), metrics)
        # чтобы не рекомендовать уже просмотренные - зануляем веса просмотренных
        total_rate = (1 - user_rates.astype(bool)) * np.array(np.sum(rates, axis=0))[0]
        # печатаем рекомендации
        self.get_rec(watched_items, np.arange(self.csr_rates.shape[1])[np.argsort(total_rate)[::-1][:10]],
                     np.sort(total_rate)[::-1][:10])

    def item_based(self, idx):
        item_rates = self.csr_rates.getcol(idx).toarray().reshape(1, -1)
        # считаем косинус между товаром idx и всеми пользователями
        metrics = cosine_similarity(item_rates, self.csr_rates.T).reshape(-1, 1)
        # домножаем оценки пользователя на коэффициент похожести
        rates = csr_matrix.multiply(self.csr_rates.T.copy(), metrics)
        total_rate = np.array(np.sum(rates, axis=1)).ravel()
        # печатаем рекомендации
        self.get_rec([idx], np.arange(self.csr_rates.shape[1])[np.argsort(total_rate)[::-1][:10]],
                     np.sort(total_rate)[::-1][:10])

    def get_rec(self, watched, recs, measure):
        print(u"Для таких товаров")
        # print([i for i in watched])
        print([i for i in watched])
        in_urls = id_book[id_book["product_id"].isin(watched)]["Book-Title"]
        print([in_urls.iloc[i] for i in range(in_urls.shape[0])])
        print(u"Такие рекомендации")
        print([i for i in recs])
        out_urls = id_book[id_book["product_id"].isin(recs)]["Book-Title"]
        print([out_urls.iloc[i] for i in range(out_urls.shape[0])])
        show_rating(out_urls)

In [614]:
col = Colloborative()

In [615]:
id_book[id_book["Book-Title"] == "Harry Potter and the Sorcerer's Stone (Book 1)"]

Unnamed: 0,User-ID,ISBN,Book-Rating,user_num,books_num,picture_url,Book-Title,product_id,vid
6665,143163,043936213X,10,44,89,http://images.amazon.com/images/P/043936213X.0...,Harry Potter and the Sorcerer's Stone (Book 1),328,3239


In [616]:
col.item_based(328)

Для таких товаров
[328]
["Harry Potter and the Sorcerer's Stone (Book 1)"]
Такие рекомендации
[328, 327, 636, 844, 36, 692, 855, 683, 135, 434]
['Harry Potter and the Order of the Phoenix (Book 5)', 'All I Really Need to Know', "The No. 1 Ladies' Detective Agency (Today Show Book Club #8)", 'Morality for Beautiful Girls (No.1 Ladies Detective Agency)', "The Lost Boy: A Foster Child's Search for the Love of a Family", 'A Child Called \\It\\": One Child\'s Courage to Survive"', 'Anne of Green Gables (Anne of Green Gables Novels (Paperback))', 'The World According to Garp', "Harry Potter and the Sorcerer's Stone (Book 1)", 'The Perfect Storm : A True Story of Men Against the Sea']
Something went wrong with Harry Potter and the Order of the Phoenix (Book 5)
Something went wrong with All I Really Need to Know
Something went wrong with The No. 1 Ladies' Detective Agency (Today Show Book Club #8)
Something went wrong with Morality for Beautiful Girls (No.1 Ladies Detective Agency)
Something w

<Figure size 1440x288 with 0 Axes>

In [617]:
col.user_based(3239)

Для таких товаров
[327, 328, 636, 833, 844, 852]
['Harry Potter and the Order of the Phoenix (Book 5)', 'Chicken Soup for the Teenage Soul (Chicken Soup for the Soul)', 'All I Really Need to Know', 'A Child Called \\It\\": One Child\'s Courage to Survive"', "Life's Little Instruction Book; Volume II", "Harry Potter and the Sorcerer's Stone (Book 1)"]
Такие рекомендации
[483, 855, 336, 692, 831, 859, 840, 94, 951, 365]
["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))", 'The Firm', 'Chicken Soup for the Soul (Chicken Soup for the Soul)', "The Color of Water: A Black Man's Tribute to His White Mother", 'The Bridges of Madison County', "The No. 1 Ladies' Detective Agency (Today Show Book Club #8)", 'Chicken Soup for the Teenage Soul II (Chicken Soup for the Soul Series)', "The Lost Boy: A Foster Child's Search for the Love of a Family", 'Chocolat', "Life's Little Instruction Book (Life's Little Instruction Books (Paperback))"]
Something went wrong with Harry Potter and th

<Figure size 1440x288 with 0 Axes>

In [618]:
#ВСЕ С КАЙФОМ