In [21]:
import re

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from uszipcode import SearchEngine

from deeprec import ROOT

In [22]:
# Directory holding the input CSV files; the processed dataset is written here as well.
DATA_DIR = ROOT.joinpath('data')
# Text encoding passed to pandas when reading the CSV files.
ENCODING = "latin-1"

In [32]:
def load_data():
    """Read the raw ``users``, ``movies`` and ``ratings`` tables from disk.

    Each table is read from ``DATA_DIR/<name>.csv`` using ``ENCODING``.

    Returns:
        dict: Table name mapped to the ``pandas.DataFrame`` loaded from its file.
    """
    return {
        file: pd.read_csv(DATA_DIR.joinpath(f'{file}.csv'), encoding=ENCODING)
        for file in ('users', 'movies', 'ratings')
    }


def get_title(text):
    """Return the movie title without its trailing release-year suffix.

    Surrounding whitespace is removed first, then the last seven characters
    are dropped. This assumes the text ends with a suffix shaped like
    ``" (1995)"`` (a space plus a parenthesised four-digit year).

    Args:
        text (str): Raw title, e.g. ``"Toy Story (1995)"``.

    Returns:
        str: The title with the final seven characters removed.
    """
    stripped = text.strip()
    # " (YYYY)" is seven characters wide.
    return stripped[:-7]


def get_year(text):
    """Extract the release year from a raw movie title.

    Every four-digit run starting with ``19`` or ``20`` is collected and the
    last one is returned, so a year-like number earlier in the title (as in
    ``"2001: A Space Odyssey (1968)"``) does not win over the trailing year.

    Args:
        text (str): Raw title containing at least one such year.

    Returns:
        int: The last matching year.

    Raises:
        IndexError: If the text contains no matching year.
    """
    year_candidates = re.findall(r'(19\d{2}|20\d{2})', text)
    latest = year_candidates[-1]
    return int(latest)


def clean_genres(genres):
    """Turn genre labels into ``genre_``-prefixed column names.

    Each label is lower-cased and every character outside ``a-z`` / ``0-9``
    is removed, e.g. ``"Children's"`` becomes ``"genre_childrens"``.

    Args:
        genres: Iterable of genre label strings.

    Returns:
        list[str]: One cleaned column name per input label, in input order.
    """
    cleaned = []
    for label in genres:
        slug = re.sub(r'[^a-z0-9]+', '', label.lower())
        cleaned.append('genre_' + slug)
    return cleaned


def split_genres(series):
    """One-hot encode a pipe-delimited genre column.

    Each entry of ``series`` is split on ``'|'`` and binarised with
    ``MultiLabelBinarizer``; the resulting columns are named via
    ``clean_genres``.

    Args:
        series (pandas.Series): Strings such as ``"Comedy|Romance"``.

    Returns:
        pandas.DataFrame: One 0/1 indicator column per genre, indexed like
        ``series``.
    """
    mlb = MultiLabelBinarizer()
    indicators = mlb.fit_transform(series.str.split('|'))
    # Keep the caller's index: the result is joined back onto the source frame
    # by index, so a fresh 0..n-1 index would misalign rows whenever the input
    # is filtered, shuffled or otherwise not default-indexed.
    return pd.DataFrame(indicators, columns=clean_genres(mlb.classes_), index=series.index)


def convert_timestamps(series, use_names=True):
    """Expand epoch-second timestamps into calendar features.

    Args:
        series (pandas.Series): Timestamps expressed in seconds (parsed with
            ``unit='s'``).
        use_names (bool): When truthy, ``day_of_week`` and ``month`` hold
            names (``day_name()`` / ``month_name()``); otherwise they hold the
            numeric ``dayofweek`` and ``month`` values.

    Returns:
        pandas.DataFrame: Columns ``hour``, ``day_of_week``, ``month`` and
        ``ts`` (the parsed datetime), in that order.
    """
    ts = pd.to_datetime(series, unit='s')
    accessor = ts.dt

    # Only the representation of weekday and month depends on the flag.
    if use_names:
        weekday, month = accessor.day_name(), accessor.month_name()
    else:
        weekday, month = accessor.dayofweek, accessor.month

    return pd.DataFrame({'hour': accessor.hour, 'day_of_week': weekday, 'month': month, 'ts': ts})


def std_zip(zipcode):
    """Normalise a ZIP code to its five-character base form.

    Numeric ZIP codes are left-padded with zeros to five digits, which
    restores leading zeros lost when the value was stored as a number
    (``"2460"`` -> ``"02460"``). ZIP+4 values keep only the part before the
    hyphen. Non-string input such as an ``int`` is converted with ``str``.
    Values whose leading part is not purely numeric are only stripped and
    truncated to five characters.

    Args:
        zipcode: ZIP code as a string or integer.

    Returns:
        str: At most five characters; exactly five for numeric ZIP codes.
    """
    text = str(zipcode).strip()
    base = text.split('-', 1)[0]
    if base.isdigit():
        # Pad before truncating so short codes regain their leading zeros
        # while 9-digit unhyphenated codes still collapse to the first five.
        return base.zfill(5)[:5]
    return text[:5]


def get_city_state(zipcode, engine):
    """Look up the city and state abbreviation for a ZIP code.

    The ZIP code is normalised with ``std_zip`` before being passed to
    ``engine.by_zipcode``.

    Args:
        zipcode: Raw ZIP code value.
        engine: Object exposing ``by_zipcode`` (a ``SearchEngine`` in this
            notebook).

    Returns:
        tuple: ``(major_city, state_abbr)`` from the lookup result, or
        ``(None, None)`` when the result lacks those attributes (for example
        when the lookup returned ``None``).
    """
    match = engine.by_zipcode(std_zip(zipcode))
    try:
        city, state = match.major_city, match.state_abbr
    except AttributeError:
        # No usable result for this ZIP code.
        return None, None
    return city, state


def expand_zips(series, engine):
    """Derive ``city``, ``state`` and a normalised ``zip`` from raw ZIP codes.

    Args:
        series (pandas.Series): Raw ZIP code values.
        engine: Lookup engine forwarded to ``get_city_state``.

    Returns:
        pandas.DataFrame: Columns ``city``, ``state`` and ``zip``, indexed
        like ``series`` so the result can be joined straight back onto the
        frame the series came from.
    """
    records = [get_city_state(x, engine) for x in series]
    # Build the frame on the caller's index. The 'zip' assignment below aligns
    # by index, so a default 0..n-1 index would leave NaN or misplaced values
    # whenever ``series`` carries a different index.
    df = pd.DataFrame(records, columns=['city', 'state'], index=series.index)
    df['zip'] = series.apply(std_zip)
    return df


def preprocess(dd, use_names=True, na_val='XX'):
    """Merge ratings, users and movies into one feature table.

    Steps:
      * movies: keep ``movie``, derive ``title`` and ``year`` from the raw
        title, and append one indicator column per genre;
      * ratings: replace the raw ``timestamp`` column with the calendar
        features produced by ``convert_timestamps``;
      * users: replace the raw ``zip`` column with ``city``, ``state`` and a
        normalised ``zip`` from ``expand_zips``;
      * merge ratings with users on ``user`` and with movies on ``movie``
        (pandas' default merge type), then fill missing values.

    Args:
        dd (dict): Frames keyed by ``'movies'``, ``'ratings'`` and
            ``'users'``, as returned by ``load_data``.
        use_names (bool): Forwarded to ``convert_timestamps``; selects named
            versus numeric weekday/month values.
        na_val: Replacement for every missing value in the merged frame
            (e.g. city/state of ZIP codes that could not be resolved).

    Returns:
        pandas.DataFrame: One row per rating that has a matching user and movie.
    """
    movies = pd.DataFrame({
        'movie': dd['movies']['movie'],
        'title': dd['movies']['title'].apply(get_title),
        'year': dd['movies']['title'].apply(get_year),
    }).join(split_genres(dd['movies']['genres']))

    # Drop the raw columns by name rather than by position so an extra or
    # reordered column in the input cannot cause the wrong column to be removed.
    ratings = dd['ratings'].drop(columns='timestamp').join(
        convert_timestamps(dd['ratings']['timestamp'], use_names))

    # The search engine holds a database session; the context manager closes
    # it even if a lookup raises.
    with SearchEngine() as search:
        users = dd['users'].drop(columns='zip').join(expand_zips(dd['users']['zip'], search))

    return ratings.merge(users, on='user').merge(movies, on='movie').fillna(na_val)

In [33]:
# Load the raw users, movies and ratings tables into a dict of DataFrames.
data = load_data()

In [34]:
# Preview the raw movies table.
data['movies'].head()

Unnamed: 0,movie,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
# Preview the raw users table.
data['users'].head()

Unnamed: 0,user,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [36]:
# Preview the raw ratings table.
data['ratings'].head()

Unnamed: 0,user,movie,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [37]:
# Build the merged table; use_names=False keeps day_of_week and month numeric.
proc = preprocess(data, use_names=False)

In [38]:
# Number of rows and columns in the merged table.
proc.shape

(1000209, 33)

In [39]:
# Column names of the merged table.
proc.columns

Index(['user', 'movie', 'rating', 'hour', 'day_of_week', 'month', 'ts',
       'gender', 'age', 'occupation', 'city', 'state', 'zip', 'title', 'year',
       'genre_action', 'genre_adventure', 'genre_animation', 'genre_childrens',
       'genre_comedy', 'genre_crime', 'genre_documentary', 'genre_drama',
       'genre_fantasy', 'genre_filmnoir', 'genre_horror', 'genre_musical',
       'genre_mystery', 'genre_romance', 'genre_scifi', 'genre_thriller',
       'genre_war', 'genre_western'],
      dtype='object')

In [40]:
# Preview the merged table.
proc.head()

Unnamed: 0,user,movie,rating,hour,day_of_week,month,ts,gender,age,occupation,...,genre_fantasy,genre_filmnoir,genre_horror,genre_musical,genre_mystery,genre_romance,genre_scifi,genre_thriller,genre_war,genre_western
0,1,1193,5,22,6,12,2000-12-31 22:12:40,F,1,10,...,0,0,0,0,0,0,0,0,0,0
1,2,1193,5,21,6,12,2000-12-31 21:33:33,M,56,16,...,0,0,0,0,0,0,0,0,0,0
2,12,1193,4,23,5,12,2000-12-30 23:49:39,M,25,12,...,0,0,0,0,0,0,0,0,0,0
3,15,1193,4,18,5,12,2000-12-30 18:01:19,M,25,7,...,0,0,0,0,0,0,0,0,0,0
4,17,1193,5,6,5,12,2000-12-30 06:41:11,M,50,1,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Persist the merged table as a gzip-compressed Parquet file inside DATA_DIR.
proc.to_parquet(DATA_DIR.joinpath('dataset.parq.gzip'), compression='gzip')

In [42]:
# Report how many distinct users and movies appear in the merged table.
for col in ['user', 'movie']:
    print(f'Number of unique {col} obs: {proc[col].nunique()}')

Number of unique user obs: 6040
Number of unique movie obs: 3706
