# Data Loading

In [27]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import zipfile as zf
import requests

# Archive of the KION dataset, pinned to a specific commit so re-runs fetch the same data.
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

# Stream the archive to disk in 1 MiB chunks so it is never held in memory as a whole.
# The context manager releases the connection even if the download is interrupted.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on an HTTP error instead of silently saving an error page as kion.zip.
    req.raise_for_status()
    with open("kion.zip", "wb") as fd:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)

# Unpack into the current working directory; the archive is closed even if extraction fails.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

In [2]:
!ls

HW5.ipynb                __MACOSX                 kion.zip
HW6.ipynb                data_original            popular_kNN_models.ipynb


In [5]:
# Read the three raw tables (interactions, users, items) extracted from the archive.
interactions, users, items = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_original/interactions.csv',
        'data_original/users.csv',
        'data_original/items.csv',
    )
)

In [10]:
# Convert the watch-date column to pandas datetimes so date arithmetic works on it.
interactions['last_watch_dt'] = interactions['last_watch_dt'].pipe(pd.to_datetime)

In [21]:
def load_data(data_dir: str = 'data_original') -> tuple:
    """
    Load the interactions, users and items tables from CSV files.

    The interactions table is normalised on the way in: ``last_watch_dt`` is
    renamed to ``datetime`` and parsed to pandas datetimes, and ``total_dur``
    is renamed to ``weight``. Users and items are returned as read.

    Parameters
    ----------
    data_dir : str, default 'data_original'
        Directory containing ``interactions.csv``, ``users.csv`` and
        ``items.csv``.

    Returns
    -------
    tuple
        ``(interactions, users, items)`` as three DataFrames.
    """
    import os  # local import keeps this cell runnable on its own

    interactions = pd.read_csv(os.path.join(data_dir, 'interactions.csv'))
    users = pd.read_csv(os.path.join(data_dir, 'users.csv'))
    items = pd.read_csv(os.path.join(data_dir, 'items.csv'))

    # Non-mutating rename: returns a new frame instead of editing in place.
    interactions = interactions.rename(
        columns={
            'last_watch_dt': 'datetime',
            'total_dur': 'weight',
        }
    )
    interactions['datetime'] = pd.to_datetime(interactions['datetime'])
    return interactions, users, items


# Recommend popular items

In [18]:
def recommend_popular(
    df: pd.DataFrame,
    k: int = 10,
    days: int = 7,
    date_col: str = "last_watch_dt",
    item_col: str = "item_id",
) -> list:
    """
    Return the ``k`` most frequently occurring items over the last ``days`` days.

    The window is anchored on the latest timestamp present in ``df[date_col]``
    (truncated to midnight), not on today's date. Only rows strictly later than
    ``latest - days`` are counted.

    Parameters
    ----------
    df : pd.DataFrame
        Interaction log with a datetime column and an item-id column.
    k : int, default 10
        Number of items to return.
    days : int, default 7
        Length of the look-back window, in days.
    date_col : str, default "last_watch_dt"
        Name of the datetime column (use "datetime" for frames whose
        columns were renamed by ``load_data``).
    item_col : str, default "item_id"
        Name of the item-id column.

    Returns
    -------
    list
        Item ids ordered from most to least frequent; at most ``k`` entries,
        empty if ``df`` has no rows.
    """
    # Without rows there is no latest date to anchor the window on.
    if df.empty:
        return []

    # pd.Timedelta(days=...) states the unit explicitly; the window start is exclusive.
    min_date = df[date_col].max().normalize() - pd.Timedelta(days=days)
    recent_items = df.loc[df[date_col] > min_date, item_col]
    return list(recent_items.value_counts().head(k).index.values)

In [20]:
# Most popular items with the function defaults (k=10 items, 7-day window).
recommend_popular(df=interactions)

[9728, 15297, 10440, 14488, 13865, 12192, 341, 4151, 3734, 512]

# KNN

In [22]:
# Reload the tables through load_data() so that interactions carries the
# renamed 'datetime' / 'weight' columns.
(interactions, users, items) = load_data()

In [23]:
# Preview the first five rows of the interaction log.
interactions.head(n=5)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [24]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [25]:
# Preview the first five rows of the items table.
items.head(n=5)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [26]:
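# NOTE: kept for reference only. load_data() already renames these columns and parses the datetime column.
#interactions.rename(columns={'last_watch_dt': Columns.Datetime,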
#                            'total_dur': Columns.Weight}, 
#                    inplace=True) 

#interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [1]:
# train test split 
# test = last 1 week 
# NOTE(review): in the recorded run this import raises ImportError, so nothing
# below it was executed. The class may have been renamed in the installed
# RecTools release (possibly to `TimeRangeSplitter`, with a different
# constructor). TODO confirm against the installed version's documentation.
from rectools.model_selection import TimeRangeSplit

# Fold configuration: how many test folds, and the length of each fold as n_units * unit.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1  # number of border dates passed to pd.date_range
freq = f"{n_units}{unit}"

# Latest interaction date, truncated to midnight.
last_date = interactions['datetime'].max().normalize()
# Step back (n_folds * n_units + 1) units from the last date to get the first border.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
# Border dates of the folds; tz is taken from last_date so the borders match the data's timezone.
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

ImportError: cannot import name 'TimeRangeSplit' from 'rectools.model_selection' (/Users/yuriybalandin/opt/anaconda3/envs/recsys/lib/python3.9/site-packages/rectools/model_selection/__init__.py)

In [None]:
# Select the train and test rows of the interaction log by label.
# NOTE(review): train_ids / test_ids are not defined in the visible cells;
# presumably they come from iterating the fold splitter. TODO confirm.
train, test = (interactions.loc[fold_ids] for fold_ids in (train_ids, test_ids))