In [23]:
import json
from collections import defaultdict
import tqdm

import pandas as pd
import numpy as np

import gensim.downloader as api
from gensim.parsing import preprocess_string, strip_non_alphanum, strip_punctuation, strip_multiple_whitespaces

from deeprec import ROOT

In [2]:
# Directory under the project ROOT (imported from deeprec) that the cells and
# helpers below read their input from and write their parquet/JSON output to.
DATA_DIR = ROOT.joinpath('data')

In [50]:
def clean_series(series):
    """Lazily clean every text entry of ``series``.

    Each entry is lower-cased and run through gensim's ``preprocess_string``
    with three filters, applied in this order: ``strip_non_alphanum``,
    ``strip_punctuation`` and ``strip_multiple_whitespaces``.

    Args:
        series: Iterable of strings (e.g. a pandas Series of titles).

    Returns:
        A generator yielding one ``preprocess_string`` result per entry, in
        the order of ``series``.
    """
    pipeline = [strip_non_alphanum, strip_punctuation, strip_multiple_whitespaces]

    def tokenize(text):
        # Lower-casing happens before the gensim filters are applied.
        return preprocess_string(text.lower(), filters=pipeline)

    return (tokenize(raw) for raw in series)


def safe_embed(model, token):
    """Look up ``token`` in ``model``, falling back to a zero vector.

    Args:
        model: Object supporting ``model[token]`` and exposing a
            ``vector_size`` attribute.
        token: Key to look up.

    Returns:
        ``model[token]`` when the lookup succeeds; otherwise, if the lookup
        raises ``KeyError``, a zero array of length ``model.vector_size``.
    """
    try:
        vector = model[token]
    except KeyError:
        # Unknown key: contribute nothing instead of failing.
        vector = np.zeros(model.vector_size)
    return vector


def embed_title(model, title):
    """Sum the per-token vectors of a tokenized title into a single vector.

    Every token is looked up with ``safe_embed``, so tokens unknown to
    ``model`` contribute a zero vector. A title with no tokens at all yields a
    zero vector of length ``model.vector_size`` (previously this case raised a
    misleading "expected 2 dims" ``ValueError``).

    Args:
        model: Object supporting ``model[token]`` and exposing ``vector_size``.
        title: Sequence of tokens.

    Returns:
        1-D array: the element-wise sum of the token vectors.

    Raises:
        ValueError: If the stacked token vectors do not form a 2-D array.
    """
    if len(title) == 0:
        # Nothing to sum; treat it like a title made only of unknown tokens.
        return np.zeros(model.vector_size)
    # One row per token. A single-token title gives a (1, vector_size) array,
    # so it needs no special case.
    embeds = np.array([safe_embed(model, t) for t in title])
    if embeds.ndim != 2:
        raise ValueError(f'Expected array with 2 dims, got {embeds.ndim}')
    return embeds.sum(0)


def get_embeds(data, model, col='title'):
    """Build a frame of summed word embeddings for one text column.

    The column is cleaned with ``clean_series`` and each cleaned entry is
    embedded with ``embed_title``; a tqdm progress bar tracks the rows.

    Args:
        data: DataFrame holding the text column.
        model: Word-vector model exposing ``vector_size``.
        col: Name of the text column (default ``'title'``).

    Returns:
        DataFrame with one row per input row and columns
        ``embed_0 .. embed_{vector_size - 1}``. No index is passed, so the
        frame gets pandas' default index.
    """
    tokenized = tqdm.tqdm(clean_series(data[col]), total=len(data[col]))
    vectors = [embed_title(model, tokens) for tokens in tokenized]
    names = [f'embed_{i}' for i in range(model.vector_size)]
    return pd.DataFrame(vectors, columns=names)


def preprocess(data, model, write_embeds=False, dummies=None):
    """Replace the ``title`` column of ``data`` with title-embedding columns.

    Args:
        data: DataFrame containing a ``title`` column.
        model: Word-vector model forwarded to ``get_embeds``.
        write_embeds: If true, also write the embedding frame to
            ``embeds.parq.gzip`` (gzip-compressed parquet) under ``DATA_DIR``.
        dummies: Optional list of column names to one-hot encode with
            ``pd.get_dummies``.

    Returns:
        A new DataFrame with ``title`` dropped and the embedding columns
        appended. It carries the same index as ``data``.
    """
    embeds = get_embeds(data, model)
    if write_embeds:
        embeds.to_parquet(DATA_DIR.joinpath('embeds.parq.gzip'), compression='gzip')
    if dummies:
        data = pd.get_dummies(data, columns=dummies)
    features = data.drop('title', axis=1)
    # ``get_embeds`` returns its rows in input order but on a fresh default
    # index. Joining by label would therefore attach the wrong embeddings (or
    # NaN) whenever ``data`` has been sorted, filtered or otherwise lacks a
    # 0..n-1 index. Align by position instead, then restore the caller's index.
    combined = features.reset_index(drop=True).join(embeds)
    combined.index = features.index
    return combined


def make_train_test(data, write=False, split_year=1999):
    """Split ``data`` into train and test frames on the ``year`` column.

    Rows are sorted by ``year``, ``month`` and ``hour``. Rows whose ``year``
    is greater than ``split_year`` form the test set; all remaining rows form
    the training set.

    Args:
        data: DataFrame with ``year``, ``month`` and ``hour`` columns.
        write: If true, write the two splits as gzip-compressed parquet files
            (``train.parq.gzip`` and ``test.parq.gzip``) under ``DATA_DIR``
            instead of returning them.
        split_year: Highest ``year`` value kept in the training set. Defaults
            to 1999, the cut-off that used to be hard-coded.

    Returns:
        ``(train, test)`` when ``write`` is false, otherwise ``None``.
    """
    ordered = data.sort_values(by=['year', 'month', 'hour'])
    is_test = ordered['year'] > split_year
    train, test = ordered[~is_test], ordered[is_test]
    if write:
        train.to_parquet(DATA_DIR.joinpath('train.parq.gzip'), compression='gzip')
        test.to_parquet(DATA_DIR.joinpath('test.parq.gzip'), compression='gzip')
        return None
    return train, test


def make_metadata(data, write=False, cats=('user', 'movie'), max_unique=25, coverage=0.8):
    """Collect dataset metadata: embedding size, genres, ages and so on.

    For every column in ``cats`` the normalized value frequencies are stored.
    If a column has more than ``max_unique`` distinct values, only its most
    frequent values are kept: values are added in descending frequency until
    the running share exceeds ``coverage``.

    Args:
        data: DataFrame with ``age`` and ``occupation`` columns plus every
            column named in ``cats``.
        write: If true, dump the metadata to ``metadata.json`` under
            ``DATA_DIR`` instead of returning it.
        cats: Names of the categorical columns to summarize.
        max_unique: Cardinality above which a column is truncated to its most
            frequent values (default 25, the previously hard-coded value).
        coverage: Cumulative frequency share after which truncation stops
            (default 0.8, the previously hard-coded value).

    Returns:
        The metadata ``defaultdict`` when ``write`` is false, otherwise
        ``None``.
    """
    metadata = defaultdict(dict)
    metadata['title_emb_size'] = 25  # size of embedding of glove-twitter-25
    metadata['string_na'] = 'XX'  # Defined in project 1.1
    metadata['genres'] = [c for c in data.columns if 'genre' in c]
    metadata['ages'] = data['age'].unique().tolist()
    metadata['occupations'] = data['occupation'].unique().tolist()
    for cat in cats:
        # value_counts orders values from most to least frequent.
        freqs = data[cat].value_counts(normalize=True).to_dict()
        if data[cat].nunique() > max_unique:
            covered = 0
            for value, share in freqs.items():
                # The check runs before the insert, so the value that pushes
                # the running share past ``coverage`` is still included.
                if covered > coverage:
                    break
                metadata[cat][value] = share
                covered += share
        else:
            metadata[cat] = freqs
    if write:
        with open(DATA_DIR.joinpath('metadata.json'), 'w') as fp:
            json.dump(metadata, fp)
        return None
    return metadata

In [56]:
# Read the dataset parquet file that lives under DATA_DIR.
file = DATA_DIR.joinpath('dataset.parq.gzip')
data = pd.read_parquet(file)
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0_level_0,user,movie,rating,hour,day_of_week,month,gender,age,occupation,city,...,genre_fantasy,genre_filmnoir,genre_horror,genre_musical,genre_mystery,genre_romance,genre_scifi,genre_thriller,genre_war,genre_western
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1193,5,22,6,12,F,1,10,Royal Oak,...,0,0,0,0,0,0,0,0,0,0
1,2,1193,5,21,6,12,M,56,16,Marrero,...,0,0,0,0,0,0,0,0,0,0
2,12,1193,4,23,5,12,M,25,12,Winter Park,...,0,0,0,0,0,0,0,0,0,0
3,15,1193,4,18,5,12,M,25,7,Charlottesville,...,0,0,0,0,0,0,0,0,0,0
4,17,1193,5,6,5,12,M,50,1,Modesto,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Summary statistics of the loaded frame, rounded to two decimals.
data.describe().round(2)

Unnamed: 0,user,movie,rating,hour,day_of_week,month,age,occupation,year,genre_action,...,genre_fantasy,genre_filmnoir,genre_horror,genre_musical,genre_mystery,genre_romance,genre_scifi,genre_thriller,genre_war,genre_western
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,...,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.51,1865.54,3.58,11.92,2.8,8.71,29.74,8.04,1986.7,0.26,...,0.04,0.02,0.08,0.04,0.04,0.15,0.16,0.19,0.07,0.02
std,1728.41,1096.04,1.12,7.89,2.04,2.72,11.75,6.53,14.35,0.44,...,0.19,0.13,0.27,0.2,0.2,0.35,0.36,0.39,0.25,0.14
min,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1919.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1506.0,1030.0,3.0,4.0,1.0,7.0,25.0,2.0,1982.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3070.0,1835.0,4.0,14.0,3.0,9.0,25.0,7.0,1992.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4476.0,2770.0,4.0,19.0,5.0,11.0,35.0,14.0,1997.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6040.0,3952.0,5.0,23.0,6.0,12.0,56.0,20.0,2000.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [58]:
# Column names of the loaded frame.
data.columns

Index(['user', 'movie', 'rating', 'hour', 'day_of_week', 'month', 'gender',
       'age', 'occupation', 'city', 'state', 'zip', 'title', 'year',
       'genre_action', 'genre_adventure', 'genre_animation', 'genre_childrens',
       'genre_comedy', 'genre_crime', 'genre_documentary', 'genre_drama',
       'genre_fantasy', 'genre_filmnoir', 'genre_horror', 'genre_musical',
       'genre_mystery', 'genre_romance', 'genre_scifi', 'genre_thriller',
       'genre_war', 'genre_western'],
      dtype='object')

In [59]:
drop_cols = ['zip']
# Report columns that are already gone (e.g. when this cell is re-run), then
# drop whatever is still present without mutating the frame in place.
for col in drop_cols:
    if col not in data.columns:
        print(f'Column {col} already dropped.')
data = data.drop(columns=drop_cols, errors='ignore')

In [70]:
# Load the "glove-twitter-25" vectors through gensim's downloader API.
model = api.load("glove-twitter-25")
# Swap the title column for its embedding columns, then recode gender as an
# integer flag: 1 where the value is 'F', 0 otherwise.
final = preprocess(data, model).assign(
    gender=lambda frame: (frame['gender'] == 'F').astype(int)
)
# Persist the fully preprocessed frame as gzip-compressed parquet.
final.to_parquet(DATA_DIR.joinpath('final.parq.gzip'), compression='gzip')

100%|██████████| 1000209/1000209 [00:20<00:00, 48939.38it/s]


In [61]:
# Column names after preprocessing.
final.columns

Index(['user', 'movie', 'rating', 'hour', 'day_of_week', 'month', 'gender',
       'age', 'occupation', 'city', 'state', 'year', 'genre_action',
       'genre_adventure', 'genre_animation', 'genre_childrens', 'genre_comedy',
       'genre_crime', 'genre_documentary', 'genre_drama', 'genre_fantasy',
       'genre_filmnoir', 'genre_horror', 'genre_musical', 'genre_mystery',
       'genre_romance', 'genre_scifi', 'genre_thriller', 'genre_war',
       'genre_western', 'embed_0', 'embed_1', 'embed_2', 'embed_3', 'embed_4',
       'embed_5', 'embed_6', 'embed_7', 'embed_8', 'embed_9', 'embed_10',
       'embed_11', 'embed_12', 'embed_13', 'embed_14', 'embed_15', 'embed_16',
       'embed_17', 'embed_18', 'embed_19', 'embed_20', 'embed_21', 'embed_22',
       'embed_23', 'embed_24'],
      dtype='object')

In [62]:
# (rows, columns) of the preprocessed frame.
final.shape

(1000209, 55)

In [64]:
# Preview the first rows of the preprocessed frame.
final.head()

Unnamed: 0_level_0,user,movie,rating,hour,day_of_week,month,gender,age,occupation,city,...,embed_15,embed_16,embed_17,embed_18,embed_19,embed_20,embed_21,embed_22,embed_23,embed_24
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1193,5,22,6,12,1,1,10,Royal Oak,...,2.17616,2.86298,2.238394,-2.15952,1.21048,-3.436165,-0.329595,3.61988,-2.47053,1.399963
1,2,1193,5,21,6,12,0,56,16,Marrero,...,2.17616,2.86298,2.238394,-2.15952,1.21048,-3.436165,-0.329595,3.61988,-2.47053,1.399963
2,12,1193,4,23,5,12,0,25,12,Winter Park,...,2.17616,2.86298,2.238394,-2.15952,1.21048,-3.436165,-0.329595,3.61988,-2.47053,1.399963
3,15,1193,4,18,5,12,0,25,7,Charlottesville,...,2.17616,2.86298,2.238394,-2.15952,1.21048,-3.436165,-0.329595,3.61988,-2.47053,1.399963
4,17,1193,5,6,5,12,0,50,1,Modesto,...,2.17616,2.86298,2.238394,-2.15952,1.21048,-3.436165,-0.329595,3.61988,-2.47053,1.399963


In [65]:
# Write the train/test splits to parquet files under DATA_DIR; with write=True
# the function writes the files rather than returning the frames.
make_train_test(final, write=True)

In [66]:
# Categorical columns whose value frequencies go into the metadata.
cats = ['user', 'movie', 'city', 'state']
# Build the metadata in memory first (write=False) so it can be inspected.
res = make_metadata(final, write=False, cats=cats)

In [67]:
# Display the metadata mapping held in `res`.
res

defaultdict(dict,
            {'title_emb_size': 25,
             'string_na': 'XX',
             'genres': ['genre_action',
              'genre_adventure',
              'genre_animation',
              'genre_childrens',
              'genre_comedy',
              'genre_crime',
              'genre_documentary',
              'genre_drama',
              'genre_fantasy',
              'genre_filmnoir',
              'genre_horror',
              'genre_musical',
              'genre_mystery',
              'genre_romance',
              'genre_scifi',
              'genre_thriller',
              'genre_war',
              'genre_western'],
             'ages': [1, 56, 25, 50, 18, 45, 35],
             'occupations': [10,
              16,
              12,
              7,
              1,
              3,
              4,
              8,
              17,
              0,
              2,
              9,
              19,
              18,
              15,
              11,
  

In [69]:
# Recompute the metadata with the same columns and dump it to metadata.json
# under DATA_DIR.
make_metadata(final, write=True, cats=cats)