In [40]:
# Third-party libraries used by the preprocessing steps below.
import numpy as np
import pandas as pd
import pyarrow.feather as feather
# Colab-only helper: mounts Google Drive under /content/drive, which is where
# the raw CSVs are read from and the processed files are written to.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load data

In [41]:
# All raw inputs live in one Drive folder; keep the prefix in a single place.
DATA_DIR = '/content/drive/MyDrive/Проекты/recsys/data'

users = pd.read_csv(f'{DATA_DIR}/users.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
# `parse_dates=True` only asks pandas to try parsing the *index*, so the date
# column was still loaded as plain strings. Name the column explicitly so it
# arrives as datetime64 straight from the reader.
interactions = pd.read_csv(
    f'{DATA_DIR}/interactions.csv', parse_dates=['last_watch_dt'])

# Preprocessing


## Users

In [42]:
# Keep missing ages as an explicit 'age_unknown' level of the categorical.
users['age'] = users['age'].fillna('age_unknown').astype('category')

In [43]:
# Keep missing incomes as an explicit 'income_unknown' level of the categorical.
users['income'] = users['income'].fillna('income_unknown').astype('category')

In [44]:
# Missing values get their own 'sex_unknown' level; the 'М' / 'Ж' labels are
# recoded to 'M' / 'F' before the column is stored as a categorical.
sex_labels = {'М': 'M', 'Ж': 'F'}
users['sex'] = (
    users['sex']
    .fillna('sex_unknown')
    .replace(sex_labels)
    .astype('category')
)

In [45]:
# A bare astype('bool') converts NaN to True. Fill missing flags with 0 first
# (the same treatment items['for_kids'] receives) so they end up as False.
users['kids_flg'] = users['kids_flg'].fillna(0).astype('bool')

# Sanity check: number of rows whose user_id occurs more than once
# (keep=False counts every member of a duplicate group).
users.duplicated(subset=['user_id'], keep=False).sum()

0

In [46]:
# `null_counts` is deprecated (and removed in pandas 2.0); `show_counts` is the
# supported spelling and prints the same per-column non-null counts.
users.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   user_id   840197 non-null  int64   
 1   age       840197 non-null  category
 2   income    840197 non-null  category
 3   sex       840197 non-null  category
 4   kids_flg  840197 non-null  bool    
dtypes: bool(1), category(3), int64(1)
memory usage: 9.6 MB


  """Entry point for launching an IPython kernel.


## Items

In [47]:
# Store content_type with the pandas 'category' dtype.
items['content_type'] = items['content_type'].astype('category')

In [48]:
# Count rows whose title is shared with at least one other row
# (keep=False marks every member of a duplicate group, not just the repeats).
items['title'].duplicated(keep=False).sum()

1266

In [49]:
# Normalise titles to lower case.
items['title'] = items['title'].str.lower()

In [50]:
# Rows without an original title get the literal string 'None' as a placeholder.
missing_title_orig = items['title_orig'].isna()
items['title_orig'] = items['title_orig'].mask(missing_title_orig, 'None')

In [51]:
# Bucket release years into decades. Missing years are treated as 2020, so
# they land in the open-ended '2020_inf' bucket.
release_year = items['release_year'].fillna(2020.)

decade_starts = list(range(1920, 2020, 10))
# Left-closed bins: (-inf, 1920), [1920, 1930), ..., [2010, 2020), [2020, inf)
bin_edges = [float('-inf'), *decade_starts, 2020, float('inf')]
bin_labels = (
    ['inf_1920']
    + [f'{start}-{start+10}' for start in decade_starts]
    + ['2020_inf']
)

# Cast the binned result back to plain labels so the final categorical is
# unordered and only contains the buckets that actually occur.
items['release_year_cat'] = pd.cut(
    release_year, bins=bin_edges, labels=bin_labels, right=False
).astype(object)

items = items.drop(columns=['release_year'])
items['release_year_cat'] = items['release_year_cat'].astype('category')

In [52]:
# Store genres with the pandas 'category' dtype; no other normalisation is
# applied to this column in this cell.
items['genres'] = items['genres'].astype('category')

In [53]:
# Missing countries default to 'Россия'. Each ', '-separated list is then
# lower-cased, de-duplicated and sorted so equal sets share a single spelling.
items['countries'] = (
    items['countries']
    .fillna('Россия')
    .str.lower()
    .str.split(', ')
    .map(lambda names: ', '.join(sorted(set(names))))
    .astype('category')
)


In [54]:
# Missing flags are filled with 0 before the cast, so they become False.
items['for_kids'] = items['for_kids'].fillna(0).astype('bool')

In [55]:
# Missing age ratings are filled with 0; the column is then stored as a categorical.
items['age_rating'] = items['age_rating'].fillna(0).astype('category')

In [56]:
# Missing studios become 'Unknown'. Each ', '-separated list is then
# lower-cased, de-duplicated and sorted so equal sets share a single spelling.
items['studios'] = (
    items['studios']
    .fillna('Unknown')
    .str.lower()
    .str.split(', ')
    .map(lambda names: ', '.join(sorted(set(names))))
    .astype('category')
)

In [57]:
# Missing directors become 'Unknown'; values are lower-cased and stored as a categorical.
items['directors'] = (
    items['directors']
    .fillna('Unknown')
    .str.lower()
    .astype('category')
)

In [58]:
# Missing actors become 'Unknown'; the column is stored as a categorical.
items['actors'] = items['actors'].fillna('Unknown').astype('category')

In [59]:
# Missing keywords become 'Unknown'; the column is stored as a categorical.
items['keywords'] = items['keywords'].fillna('Unknown').astype('category')

In [60]:
# Replace missing descriptions with the placeholder '-'.
items['description'] = items['description'].fillna('-')

In [61]:
# `null_counts` is deprecated (and removed in pandas 2.0); `show_counts` is the
# supported spelling and prints the same per-column non-null counts.
items.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15963 entries, 0 to 15962
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   item_id           15963 non-null  int64   
 1   content_type      15963 non-null  category
 2   title             15963 non-null  object  
 3   title_orig        15963 non-null  object  
 4   genres            15963 non-null  category
 5   countries         15963 non-null  category
 6   for_kids          15963 non-null  bool    
 7   age_rating        15963 non-null  category
 8   studios           15963 non-null  category
 9   directors         15963 non-null  category
 10  actors            15963 non-null  category
 11  description       15963 non-null  object  
 12  keywords          15963 non-null  category
 13  release_year_cat  15963 non-null  category
dtypes: bool(1), category(9), int64(1), object(3)
memory usage: 2.4+ MB


  """Entry point for launching an IPython kernel.


## Interactions

In [62]:
# Cast to the nullable Int8 dtype first (it can hold missing values), then
# replace the missing percentages with 0.
interactions['watched_pct'] = (
    interactions['watched_pct']
    .astype('Int8')
    .fillna(0)
)

In [63]:
# Convert last_watch_dt to a datetime dtype; no explicit format is passed,
# so pandas infers it from the values.
interactions['last_watch_dt'] = pd.to_datetime(interactions['last_watch_dt'])

In [64]:
# `null_counts` is deprecated (and removed in pandas 2.0); `show_counts` is the
# supported spelling and prints the same per-column non-null counts.
interactions.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   user_id        5476251 non-null  int64         
 1   item_id        5476251 non-null  int64         
 2   last_watch_dt  5476251 non-null  datetime64[ns]
 3   total_dur      5476251 non-null  int64         
 4   watched_pct    5476251 non-null  Int8          
dtypes: Int8(1), datetime64[ns](1), int64(3)
memory usage: 177.6 MB


  """Entry point for launching an IPython kernel.


# Save data

In [65]:
# Persist each processed frame in Feather format.
# NOTE(review): the targets carry a `.csv` suffix although the content is
# Feather, so they must be read back with a Feather reader. The names are left
# unchanged here because other code may already depend on these paths.
processed_outputs = [
    (users,
     '/content/drive/MyDrive/Проекты/recsys/data/processed_data/users_process.csv'),
    (items,
     '/content/drive/MyDrive/Проекты/recsys/data/processed_data/items_process.csv'),
    (interactions,
     '/content/drive/MyDrive/Проекты/recsys/data/processed_data/interactions_process.csv'),
]
for frame, target_path in processed_outputs:
    feather.write_feather(frame, target_path)