# EDA

In [2]:
import datetime
from itertools import chain

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from itertools import chain
import scipy.sparse as sp
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from lightfm.data import Dataset
from lightfm import LightFM
from tqdm.auto import tqdm

In [3]:
# Read the five input CSV files, in this order, and bind each to its own name.
genome_scores, genome_tags, ratings, movies, kinopoisk_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/diplom/genome-scores.csv',
        '../input/diplom/genome-tags.csv',
        '../input/diplom/ratings.csv',
        '../input/diplom/movies.csv',
        '../input/diplom/100pct.csv',
    )
)

In [4]:
def is_float(element) -> bool:
    """Return True when ``element`` can NOT be converted with ``float()``.

    NOTE: despite its name, this helper reports the *failure* of the
    conversion. The Kinopoisk rating clean-up in this notebook relies on
    that inverted meaning (it blanks out values for which this returns
    True), so the return values are intentionally left as they are.

    Parameters
    ----------
    element : any
        Value to probe, typically a string read from a CSV column.

    Returns
    -------
    bool
        False if ``float(element)`` succeeds, True otherwise.
    """
    try:
        float(element)
    except (TypeError, ValueError):
        # ValueError: non-numeric text such as '–'.
        # TypeError: objects float() cannot accept at all (None, lists, ...).
        return True
    return False

In [5]:
# Display the `movies` table loaded from movies.csv.
movies

In [6]:
# Display the `ratings` table loaded from ratings.csv.
ratings

In [7]:
# Keep only the movies whose movieId occurs at least once in `ratings`.
active_movies = movies.loc[movies['movieId'].isin(ratings['movieId'].unique())]

In [8]:
# Display the movies that have at least one rating.
active_movies

In [9]:
# Right join on the shared `title` column: every active movie is kept, and the
# Kinopoisk columns stay empty when no Kinopoisk row has exactly the same title.
merged_films = pd.merge(kinopoisk_data, active_movies, how='right', on='title')

In [10]:
# movieIds of merged rows where both `original_title` and `film_id` are missing.
null_movie_id = merged_films.loc[
    merged_films['original_title'].isna() & merged_films['film_id'].isna(),
    'movieId',
]

In [11]:
# Number of users who rated at least one movie that is missing from the Kinopoisk database
ratings.loc[ratings['movieId'].isin(null_movie_id), 'userId'].nunique(dropna=False)

In [12]:
# Distinct ids of the users who rated any movie listed in `null_movie_id`.
bad_users = ratings.loc[ratings['movieId'].isin(null_movie_id), 'userId'].unique()

In [13]:
# Drop every rating made by a user in `bad_users`.
# The explicit copy makes `ratings_unique` an independent frame, so assigning
# columns on it later does not trigger pandas' SettingWithCopyWarning.
ratings_unique = ratings[~ratings.userId.isin(bad_users)].copy()

In [14]:
# Number of ratings left after removing the users in `bad_users`.
len(ratings_unique)

In [15]:
# Convert the `timestamp` column from epoch seconds to datetimes.
# `assign` returns a new frame instead of writing into a filtered slice of
# `ratings`, which avoids SettingWithCopyWarning.
ratings_unique = ratings_unique.assign(
    timestamp=pd.to_datetime(ratings_unique['timestamp'], unit='s')
)

In [16]:
# Histogram of rating timestamps, split into 100 bins.
ratings_unique['timestamp'].hist(bins=100)

In [17]:
# One row per user with the number of movieId entries, stored in a `count` column.
# NOTE(review): this is computed on the unfiltered `ratings`, not on
# `ratings_unique` — confirm that is intended.
number_views = ratings.groupby('userId')['movieId'].count().reset_index(name='count')

In [18]:
# Smallest number of ratings made by any single user.
number_views['count'].min()

## Рассмотрим данные Kinopoisk

In [19]:
# Normalise the raw Kinopoisk columns in place, one column at a time.
# NOTE(review): these statements overwrite their own input columns, so running
# the cell a second time on the same kernel operates on already-cleaned values.

# year: stringify, keep the first four characters (None for strings of length <= 1),
# then parse the result as a datetime.
kinopoisk_data.year = pd.to_datetime(kinopoisk_data.year.astype(str).apply(lambda x: x[:4] if len(x)>1 else None))
# kinopoisk_rating: replace with None the '–' placeholder and every value for which
# is_float() returns True (as defined in this notebook, that is when float() fails).
kinopoisk_data.kinopoisk_rating	= kinopoisk_data.kinopoisk_rating.apply(lambda x: None if x=='–' or is_float(x) else x)
# duration: stringify, drop the last three characters, map empty strings to None,
# cast to float. Presumably this strips a unit suffix — TODO confirm against the CSV.
kinopoisk_data.duration = kinopoisk_data.duration.astype(str).apply(lambda x: x[:-3]).apply(lambda x: x if len(x)>0 else None).astype(float)
# genres / country: split the ', '-separated strings into lists.
kinopoisk_data.genres = kinopoisk_data.genres.str.split(', ')
# kinopoisk_data.film_id = kinopoisk_data.film_id.astype(int)
kinopoisk_data.country = kinopoisk_data.country.str.split(', ')
# scenario / producer: turn the '—' placeholder into None, then split on ', '.
kinopoisk_data.scenario = kinopoisk_data.scenario.apply(lambda x: None if x=='—' else x)
kinopoisk_data.scenario = kinopoisk_data.scenario.str.split(', ')
kinopoisk_data.producer = kinopoisk_data.producer.apply(lambda x: None if x=='—' else x)
kinopoisk_data.producer = kinopoisk_data.producer.str.split(', ')
In [20]:
# Keep only the listed columns, in this order.
kinopoisk_data = kinopoisk_data.loc[:, [
    'title',
    'kinopoisk_rating',
    'imdb_rating',
    'year',
    'country',
    'genres',
    'producer',
    'scenario',
    'age',
    'duration',
]]

In [21]:
# Median IMDb rating over the films that have BOTH a Kinopoisk and an IMDb rating.
# `.notna()` is required here: in pandas `series != None` is True for every
# element, so it never filtered out the rows with a missing rating.
kinopoisk_data.loc[
    kinopoisk_data.kinopoisk_rating.notna() & kinopoisk_data.imdb_rating.notna(),
    'imdb_rating',
].astype(float).median()

In [22]:
# Median Kinopoisk rating over the films that have BOTH a Kinopoisk and an IMDb rating.
# `.notna()` is required here: in pandas `series != None` is True for every
# element, so it never filtered out the rows with a missing rating.
kinopoisk_data.loc[
    kinopoisk_data.kinopoisk_rating.notna() & kinopoisk_data.imdb_rating.notna(),
    'kinopoisk_rating',
].astype(float).median()

In [23]:
# Display the Kinopoisk table after the clean-up and column selection.
kinopoisk_data

In [24]:
# Flatten the per-film country lists and count how often each country occurs.
# The result has one row per country: `country` (name) and `values` (count),
# ordered from most to least frequent.
list_country = kinopoisk_data.country.dropna().tolist()
value_list_country = (
    pd.Series(list(chain.from_iterable(list_country)))
    .value_counts()
    .rename_axis('country')
    .reset_index(name='values')
)

In [25]:
# Ten most frequent countries, followed by one 'Другие' row that sums up
# the counts of all remaining countries.
top10_list_country = value_list_country.iloc[:10].copy()
other_list_county = pd.DataFrame({
    'country': ['Другие'],
    'values': [value_list_country['values'].iloc[10:].sum()],
})
top10_list_country = pd.concat(
    [top10_list_country, other_list_county],
    ignore_index=True,
)

In [26]:
# Pie chart of the country counts held in `top10_list_country`.
# Eleven colours are listed, one per row of that frame
# (ten countries plus the 'Другие' bucket).
colors = ['#06145f', '#ff7f50', '#ffe700', '#FF0000', '#3c2121', '#008DB8', '#00AAAA',
          '#bf9b30', '#53ea0f', '#acbdb2', '#00FA80']
fig, axes = plt.subplots(figsize=(10,10))
# Slice sizes come from the `values` column; the chart is drawn on `axes`.
top10_list_country.plot(kind = 'pie',subplots=True, y='values', colors=colors, ax = axes,figsize=(10,10))
axes.set_title('Топ 10 стран')
# The legend lists the country names in row order.
axes.legend(labels=top10_list_country['country'].tolist())
plt.show()

In [27]:
# Flatten the per-film genre lists and count how often each genre occurs.
# The result has one row per genre: `genres` (name) and `values` (count),
# ordered from most to least frequent.
list_genres = kinopoisk_data.genres.dropna().tolist()
value_list_genres = (
    pd.Series(list(chain.from_iterable(list_genres)))
    .value_counts()
    .rename_axis('genres')
    .reset_index(name='values')
)

In [28]:
# Fifteen most frequent genres (the variable is still called `top10_...`),
# followed by one 'Другие' row that sums up the counts of all remaining genres.
top10_list_genres = value_list_genres.iloc[:15].copy()
other_list_genres = pd.DataFrame({
    'genres': ['Другие'],
    'values': [value_list_genres['values'].iloc[15:].sum()],
})
top10_list_genres = pd.concat(
    [top10_list_genres, other_list_genres],
    ignore_index=True,
)

In [29]:
# Pie chart of the genre counts held in `top10_list_genres`.
# Sixteen colours are listed: fifteen genres plus the 'Другие' bucket.
colors = ['#06145f', '#ff7f50', '#FF0000', '#E5E3E0', '#3c2121', '#008DB8', '#00AAAA',
          '#bf9b30', '#53ea0f', '#acbdb2', '#00FA80', '#E400FF', '#64FF00', '#000000', '#00E0FF', '#FF8F00']
fig, axes = plt.subplots(figsize=(10,10))
# Slice sizes come from the `values` column; the chart is drawn on `axes`.
top10_list_genres.plot(kind = 'pie',subplots=True, y='values', colors=colors, ax = axes,figsize=(10,10))
# The frame holds the 15 most frequent genres, so the title says 15 rather than 10.
axes.set_title('Топ 15 жанров')
# The legend lists the genre names in row order.
axes.legend(labels=top10_list_genres['genres'].tolist())
plt.show()

## Метрика

In [30]:
def compute_metrics(train, test, recs, top_N):
    """Compute MAP@top_N of ranked recommendations against held-out interactions.

    Parameters
    ----------
    train : any
        Not used by the computation; kept so existing calls keep working.
    test : pandas.DataFrame
        Held-out interactions with `userId` and `movieId` columns.
    recs : pandas.DataFrame
        Recommendations with `userId`, `movieId` and `rank` columns, where
        `rank` is the 1-based position of the item in the user's list.
    top_N : int
        Cut-off of the metric. Recommendations ranked below `top_N` are
        ignored, so the value always matches its `MAP@{top_N}` label.

    Returns
    -------
    pandas.Series
        A single entry keyed `MAP@{top_N}`.
    """
    pair_keys = ['userId', 'movieId']

    # Enforce the cut-off: items recommended beyond position top_N are not hits.
    top_recs = recs[recs['rank'] <= top_N]

    # Left join keeps every test interaction; `rank` is NaN for the ones
    # that were not recommended.
    test_recs = test.set_index(pair_keys).join(top_recs.set_index(pair_keys))
    # Hits first (ascending rank), misses (NaN rank) last within each user.
    test_recs = test_recs.sort_values(by=['userId', 'rank'])

    by_user = test_recs.groupby(level='userId')
    # Number of test interactions of the user (hits and misses alike).
    users_item_count = by_user['rank'].transform(np.size)
    # k-th hit of the user divided by its rank = precision at that hit;
    # misses give NaN and are skipped by sum() below.
    precision_at_hit = (by_user.cumcount() + 1) / test_recs['rank']

    users_count = test_recs.index.get_level_values('userId').nunique()
    mean_ap = (precision_at_hit / users_item_count).sum() / users_count

    return pd.Series({f'MAP@{top_N}': mean_ap})

## Implicit model

In [31]:
def get_coo_matrix(df,
                   user_col='userId',
                   item_col='movieId',
                   weight_col=None,
                   users_mapping={},
                   items_mapping={}):
    """Build a sparse user-item interaction matrix in COO format.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions; must contain `user_col` and `item_col`.
    user_col, item_col : str
        Names of the columns holding external user / item ids.
    weight_col : str or None
        Column with interaction weights. When None every interaction
        gets weight 1.
    users_mapping, items_mapping : dict
        External id -> integer row / column index. The dicts are only
        read, never modified.

    Returns
    -------
    scipy.sparse.coo_matrix
        float32 matrix whose shape is fixed by the mappings (largest mapped
        index + 1 on each axis), so matrices built from different subsets
        of the data with the same mappings always have the same shape.

    Raises
    ------
    ValueError
        If `df` contains a user or item id that is absent from the mappings.
    """
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].to_numpy(dtype=np.float32)

    row_idx = df[user_col].map(users_mapping.get)
    col_idx = df[item_col].map(items_mapping.get)
    # Unknown ids map to missing values; fail loudly instead of handing
    # invalid indices to scipy.
    if row_idx.isna().any() or col_idx.isna().any():
        raise ValueError(
            'df contains user or item ids that are missing from the mappings'
        )

    # Take the shape from the mappings rather than from the ids that happen
    # to be present in df.
    n_users = max(users_mapping.values(), default=-1) + 1
    n_items = max(items_mapping.values(), default=-1) + 1

    interaction_matrix = sp.coo_matrix(
        (
            weights,
            (
                row_idx.to_numpy(dtype=np.int64),
                col_idx.to_numpy(dtype=np.int64),
            ),
        ),
        shape=(n_users, n_items),
    )
    return interaction_matrix

In [32]:
def generate_implicit_recs_mapper(
    model,
    train_matrix,
    top_N,
    user_mapping,
    item_inv_mapping,
    filter_already_liked_items
):
    """Create a function that turns an external user id into recommended item ids.

    The returned callable translates the user id through `user_mapping`,
    calls `model.recommend` with `train_matrix`, `N=top_N` and the given
    `filter_already_liked_items` flag, reads the result as `(item, score)`
    pairs and translates every item back through `item_inv_mapping`.
    """
    def _recs_mapper(user):
        scored_items = model.recommend(
            user_mapping[user],
            train_matrix,
            N=top_N,
            filter_already_liked_items=filter_already_liked_items,
        )
        external_ids = []
        for inner_item, _score in scored_items:
            external_ids.append(item_inv_mapping[inner_item])
        return external_ids

    return _recs_mapper

In [33]:
# Consecutive integer index <-> external id, in order of first appearance
# in `ratings_unique`; the `*_inv_mapping` dicts go from index back to id.
users_inv_mapping = {
    idx: user_id for idx, user_id in enumerate(ratings_unique['userId'].unique())
}
users_mapping = {user_id: idx for idx, user_id in users_inv_mapping.items()}

items_inv_mapping = {
    idx: movie_id for idx, movie_id in enumerate(ratings_unique['movieId'].unique())
}
items_mapping = {movie_id: idx for idx, movie_id in items_inv_mapping.items()}

In [34]:
# The hold-out split and the interaction matrix do not depend on the model,
# so they are built once instead of once per model.
top_N = 10
# NOTE(review): tail(10) takes each user's last 10 rows in the current row order.
# No visible cell sorts `ratings_unique` by timestamp — confirm that the file
# order is the intended notion of "last".
test = ratings_unique.groupby('userId').tail(10)
train = ratings_unique.drop(test.index)

train_mat = get_coo_matrix(
    train,
    users_mapping=users_mapping,
    items_mapping=items_mapping,
).tocsr()

# One MAP@top_N result per model, in the order of the list below.
metrics = []
for model_class in [CosineRecommender, BM25Recommender, TFIDFRecommender]:
    model = model_class(K=top_N)
    # The models are fitted on the transposed (item x user) matrix.
    model.fit(train_mat.T, show_progress=False)

    mapper = generate_implicit_recs_mapper(
        model,
        train_mat,
        top_N,
        users_mapping,
        items_inv_mapping,
        filter_already_liked_items=True
    )

    # One row per user with a list of recommended movieIds, then one row per
    # (user, movie) pair with its 1-based position as `rank`.
    recs = pd.DataFrame({'userId': train['userId'].unique()})
    recs['movieId'] = recs['userId'].map(mapper)
    recs_explode = recs.explode('movieId')
    recs_explode['rank'] = recs_explode.groupby('userId').cumcount() + 1
    metrics.append(compute_metrics(train, test, recs_explode, top_N))

In [35]:
# Report the metric of each model; `metrics` holds them in this same order.
for position, model_name in enumerate(
    ('CosineRecommender', 'BM25Recommender', 'TFIDFRecommender')
):
    print(f"MAP10 {model_name} = {metrics[position]}")

## Последние популярные для всех пользователей

In [36]:
# Popularity baseline: the 10 most-rated movies of the final 14 days.
# It is built from `train` only, so the held-out `test` interactions it is
# later scored against cannot leak into the baseline.
max_timestamp = train.timestamp.max()
most_popular = (
    train.loc[max_timestamp - train.timestamp < datetime.timedelta(days=14), 'movieId']
    .value_counts()
    .index[:10]
    .to_list()
)

In [37]:
# Give every user in `recs` the same `most_popular` list, then expand it to
# one row per (user, movie) pair with its 1-based position as `rank`.
recs_most_popular = (
    recs
    .assign(movieId=[most_popular] * len(recs))
    .explode('movieId')
)
recs_most_popular['rank'] = recs_most_popular.groupby('userId').cumcount() + 1

In [38]:
# Score the popularity baseline with the same metric and split as the models above.
compute_metrics(train, test, recs_most_popular, top_N)