# Anime Recommender 03: Feature Engineering

## Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

## User data

In [2]:
# Read the pickled user/ratings table, report its dimensions and preview it.
user_db_path = "../data/user_db_lightfm_genres.pkl"
users_db = pd.read_pickle(user_db_path)
print(users_db.shape)
users_db.head(n=5)

(912234, 4)


Unnamed: 0,userId,mediaId,rating,genres
0,710080,1535,70,"[Mystery, Psychological, Supernatural, Thriller]"
1,710080,21459,70,"[Action, Adventure, Comedy]"
2,710080,113415,70,"[Action, Drama, Supernatural]"
3,710080,11757,70,"[Action, Adventure, Fantasy, Romance]"
4,710080,5114,70,"[Action, Adventure, Drama, Fantasy]"


In [3]:
# The pickle exposes camelCase keys (`userId`, `mediaId`), but every later cell
# addresses `user_id` / `media_id`. Normalise the names here so the notebook
# runs top to bottom on a fresh kernel. rename() ignores labels that are not
# present, so re-running this cell is harmless.
users_db = users_db.rename(columns={'userId': 'user_id', 'mediaId': 'media_id'})

# Number of distinct users in the table
users_db['user_id'].nunique()

14260

In [5]:
def convert_to_str(genres_list):
    """Join a list of genre names into one ``", "``-separated string.

    Parameters
    ----------
    genres_list : iterable of str, or a missing value (``None`` / NaN)
        Genres of a single title.

    Returns
    -------
    str or None
        The joined genres, or ``None`` when ``genres_list`` is missing.
    """
    if genres_list is None:
        return None
    # `x is np.nan` only matches the np.nan singleton; any other NaN object
    # (float('nan'), a NaN scalar handed over by pandas, ...) would slip through
    # to str.join and raise. Test the value instead of the identity.
    if isinstance(genres_list, float) and np.isnan(genres_list):
        return None
    return ", ".join(genres_list)

In [114]:
# Flatten every row's genre list into a single comma-separated string
genre_strings = users_db['genres'].apply(convert_to_str)
users_db['genres_str'] = genre_strings

In [86]:
# Per-user "liked" thresholds are derived right here from the ratings, so this
# cell does not depend on the `user_avg_rating` / `user_75pct_rating` columns,
# which are only added by cells further down the notebook.
ratings_by_user = users_db.groupby('user_id')['rating']

# Mean and 75th-percentile rating per user, each rounded to the nearest
# multiple of 5 (same rounding as the threshold dictionaries built below).
avg_threshold = (5 * (ratings_by_user.mean() / 5).round()).astype(int)
pct75_threshold = (5 * (ratings_by_user.quantile(.75) / 5).round()).astype(int)

# A row passes the mask when its rating reaches the user's own threshold.
mask_avg = users_db['rating'] >= users_db['user_id'].map(avg_threshold)
mask_75pct = users_db['rating'] >= users_db['user_id'].map(pct75_threshold)

#### Choosing liked genres based on average rating and 75th percentile rating

In [82]:
# user_id -> that user's mean rating, rounded to the nearest multiple of 5.
# `rating` is selected *before* aggregating: averaging the whole frame also
# drags the list/string genre columns through mean(), which newer pandas
# versions reject instead of silently dropping. The grouped result is also
# computed once instead of twice.
mean_rating_by_user = users_db.groupby('user_id')['rating'].mean()
users_avg_rating = {user: int(5 * round(mean_rating / 5))
                    for user, mean_rating in mean_rating_by_user.items()}

In [83]:
# user_id -> that user's 75th-percentile rating, rounded to the nearest
# multiple of 5.
# `rating` is selected *before* taking the quantile: a quantile over the whole
# frame would also be attempted on the list/string genre columns, which pandas
# cannot compute. The grouped result is also computed once instead of twice.
pct75_rating_by_user = users_db.groupby('user_id')['rating'].quantile(.75)
users_75pct_rating = {user: int(5 * round(pct75_rating / 5))
                      for user, pct75_rating in pct75_rating_by_user.items()}

In [84]:
# Attach each user's rounded mean rating to every one of their rows
row_user_ids = users_db['user_id']
users_db['user_avg_rating'] = row_user_ids.map(users_avg_rating)

In [85]:
# Attach each user's rounded 75th-percentile rating to every one of their rows
row_user_ids = users_db['user_id']
users_db['user_75pct_rating'] = row_user_ids.map(users_75pct_rating)

In [87]:
# user_id -> alphabetical list of the distinct genres found on the rows that
# pass `mask_avg` for that user.
# Changes from the earlier version of this cell:
#   * the joined-genres Series is built once (the groupby/apply used to be
#     evaluated twice, once for the index and once for the values);
#   * missing genre strings are skipped, so str.join cannot fail on None;
#   * empty fragments (rows whose genre string is empty) are not recorded as
#     a genre.
liked_avg_joined = (users_db[mask_avg]
                    .groupby('user_id')['genres_str']
                    .apply(lambda genre_strs: ', '.join(genre_strs.dropna())))
users_liked_genres_avg = {user: sorted({genre for genre in joined.split(', ') if genre})
                          for user, joined in liked_avg_joined.items()}


In [88]:
# user_id -> alphabetical list of the distinct genres found on the rows that
# pass `mask_75pct` for that user.
# Changes from the earlier version of this cell:
#   * the joined-genres Series is built once (the groupby/apply used to be
#     evaluated twice, once for the index and once for the values);
#   * missing genre strings are skipped, so str.join cannot fail on None;
#   * empty fragments (rows whose genre string is empty) are not recorded as
#     a genre.
liked_75pct_joined = (users_db[mask_75pct]
                      .groupby('user_id')['genres_str']
                      .apply(lambda genre_strs: ', '.join(genre_strs.dropna())))
users_liked_genres_75pct = {user: sorted({genre for genre in joined.split(', ') if genre})
                            for user, joined in liked_75pct_joined.items()}

In [89]:
# Broadcast each user's liked-genre list (one per threshold) onto their rows
for liked_column, liked_lookup in (('liked_genres_avg', users_liked_genres_avg),
                                   ('liked_genres_75pct', users_liked_genres_75pct)):
    users_db[liked_column] = users_db['user_id'].map(liked_lookup)

#### Choosing top 3 or 5 most common genres based on average rating and 75th percentile rating

In [104]:
# user_id -> alphabetical list of every genre occurrence (duplicates kept, so
# they can be counted later) on the rows that pass `mask_avg` for that user.
# Changes from the earlier version of this cell:
#   * the joined-genres Series is built once (the groupby/apply used to be
#     evaluated twice, once for the index and once for the values);
#   * missing genre strings are skipped, so str.join cannot fail on None;
#   * empty fragments (rows whose genre string is empty) are dropped.
watched_avg_joined = (users_db[mask_avg]
                      .groupby('user_id')['genres_str']
                      .apply(lambda genre_strs: ', '.join(genre_strs.dropna())))
watched_genres_avg = {user: sorted(genre for genre in joined.split(', ') if genre)
                      for user, joined in watched_avg_joined.items()}

In [105]:
# user_id -> alphabetical list of every genre occurrence (duplicates kept, so
# they can be counted later) on the rows that pass `mask_75pct` for that user.
# Changes from the earlier version of this cell:
#   * the joined-genres Series is built once (the groupby/apply used to be
#     evaluated twice, once for the index and once for the values);
#   * missing genre strings are skipped, so str.join cannot fail on None;
#   * empty fragments (rows whose genre string is empty) are dropped.
watched_75pct_joined = (users_db[mask_75pct]
                        .groupby('user_id')['genres_str']
                        .apply(lambda genre_strs: ', '.join(genre_strs.dropna())))
watched_genres_75pct = {user: sorted(genre for genre in joined.split(', ') if genre)
                        for user, joined in watched_75pct_joined.items()}

In [106]:
# Broadcast each user's watched-genre list (one per threshold) onto their rows
for watched_column, watched_lookup in (('watched_genres_avg', watched_genres_avg),
                                       ('watched_genres_75pct', watched_genres_75pct)):
    users_db[watched_column] = users_db['user_id'].map(watched_lookup)

In [107]:
# Genre catalogue used for the per-genre counts below (presumably the list of
# all genre names; verify against the notebook that writes this file).
# The context manager closes the file handle, which the bare open() call never
# did.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# produced by this project.
with open('../data/anime_genres.pkl', 'rb') as genres_file:
    genres = pickle.load(genres_file)

In [108]:
def get_top_n_genres(watched_genres, n, all_genres=None):
    """Return the ``n`` most frequent genres in ``watched_genres``, sorted by name.

    Parameters
    ----------
    watched_genres : iterable of str
        Genre names, one entry per occurrence; duplicates are what gets counted.
        Names that do not appear in the catalogue are ignored.
    n : int
        Number of genres to keep.
    all_genres : iterable of str, optional
        Genre catalogue to count against. When omitted, the notebook-level
        ``genres`` object is used, so existing two-argument calls are unchanged.

    Returns
    -------
    list of str
        The selected genre names in ascending alphabetical order.

    Notes
    -----
    * Ties in frequency are broken by genre name in descending order, because
      the ``(count, name)`` pairs are sorted in reverse.
    * Catalogue genres with a count of zero still take part in the ranking, so
      genres that never occur in ``watched_genres`` can be returned when fewer
      than ``n`` distinct genres were counted.
    """
    catalogue = list(genres if all_genres is None else all_genres)

    # Index of each genre's first occurrence in the catalogue. One dict lookup
    # per watched genre replaces a linear scan over the whole catalogue.
    first_position = {}
    for position, genre in enumerate(catalogue):
        first_position.setdefault(genre, position)

    # Sized from the catalogue instead of a hard-coded 18, which raised
    # IndexError as soon as the catalogue held more than 18 genres.
    genres_count = [0] * len(catalogue)
    for watched_genre in watched_genres:
        position = first_position.get(watched_genre)
        if position is not None:
            genres_count[position] += 1

    top_n = sorted(zip(genres_count, catalogue), reverse=True)[:n]
    return sorted(genre for _, genre in top_n)

In [110]:
# For each threshold (average, 75th percentile) derive the top-3 and top-5
# genre columns from the matching watched-genres column.
for threshold_name in ('avg', '75pct'):
    watched_column = 'watched_genres_' + threshold_name
    for top_size in (3, 5):
        target_column = 'top_' + str(top_size) + '_genres_' + threshold_name
        users_db[target_column] = users_db[watched_column].apply(get_top_n_genres, args=(top_size,))

### Export

In [118]:
# Intermediate helper columns are removed before the table is exported
helper_columns = [
    'genres_str',
    'user_avg_rating',
    'user_75pct_rating',
    'watched_genres_avg',
    'watched_genres_75pct',
]
users_db.drop(columns=helper_columns, inplace=True)

In [120]:
# Preview the engineered table (first five rows)
users_db.head(n=5)

Unnamed: 0,user_id,media_id,rating,genres,liked_genres_avg,liked_genres_75pct,top_3_genres_avg,top_5_genres_avg,top_3_genres_75pct,top_5_genres_75pct
0,710080,1535,70,"[Mystery, Psychological, Supernatural, Thriller]","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Supernatural]","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Slice of Life]"
1,710080,21459,70,"[Action, Adventure, Comedy]","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Supernatural]","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Slice of Life]"
2,710080,113415,70,"[Action, Drama, Supernatural]","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Supernatural]","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Slice of Life]"
3,710080,11757,70,"[Action, Adventure, Fantasy, Romance]","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Supernatural]","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Slice of Life]"
4,710080,5114,70,"[Action, Adventure, Drama, Fantasy]","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Adventure, Comedy, Drama, Ecchi, Fant...","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Supernatural]","[Action, Comedy, Drama]","[Action, Comedy, Drama, Fantasy, Slice of Life]"


In [121]:
# Persist the engineered user table
feateng_path = "../data/users_db_feateng.pkl"
users_db.to_pickle(feateng_path)