In [1]:
import datetime
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

np.random.seed(1337)

%matplotlib inline

sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

## Movies

In [2]:
movies = pd.read_csv('data/ml-20m/movies.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
dtypes: int64(1), object(2)
memory usage: 639.4+ KB


In [3]:
movies.columns = ['movie_id', 'title', 'genres']

In [4]:
year_pattern = re.compile(r'.*\((\d+)\)')
def parse_year(title):
    try:
        return int(year_pattern.match(title).group(1))
    except:
        return None
movies['year'] = movies.title.apply(parse_year)

In [5]:
all_genres = set()
def parse_genres(genres):
    ret = []
    for genre in genres.split('|'):
        genre = re.sub('[^a-z]', '', genre.lower())
        ret.append(genre)
        all_genres.add(genre)
    return ret
movies['parsed_genres'] = movies.genres.apply(parse_genres)
del movies['genres']
for genre in all_genres:
    movies['genre_{0}'.format(genre)] = movies.parsed_genres.apply(lambda genres: int(genre in genres))

In [6]:
movies.sample(10)

Unnamed: 0,movie_id,title,year,parsed_genres,genre_adventure,genre_children,genre_animation,genre_thriller,genre_crime,genre_action,...,genre_filmnoir,genre_romance,genre_western,genre_nogenreslisted,genre_horror,genre_imax,genre_documentary,genre_comedy,genre_drama,genre_mystery
1877,1961,Rain Man (1988),1988.0,[drama],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
25233,119051,Crime at the Chinese Resta...,1981.0,"[comedy, mystery, thriller]",0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
9187,27074,Brave New World (1998),1998.0,[scifi],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22618,108332,Mystery of the Yellow Room...,2003.0,"[comedy, crime, mystery]",0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
25252,119139,Ascension (2014),2014.0,"[drama, scifi]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
25382,119948,Let's Kill Ward's Wife (2014),2014.0,[comedy],0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
23159,110366,Jean-Michel Basquiat: The ...,2010.0,[documentary],0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6017,6116,"Pirate Movie, The (1982)",1982.0,"[adventure, comedy, musical]",1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
24133,114417,"Average Little Man, An (Un...",1977.0,[drama],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7139,7251,Where the Day Takes You (1...,1992.0,[drama],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Tags

In [7]:
tags = pd.read_csv('data/ml-20m/tags.csv')
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
userId       465564 non-null int64
movieId      465564 non-null int64
tag          465548 non-null object
timestamp    465564 non-null int64
dtypes: int64(3), object(1)
memory usage: 14.2+ MB


In [8]:
tags.columns = ['user_id', 'movie_id', 'tag', 'timestamp']

In [9]:
tags['year'] = tags.timestamp.apply(lambda ts: datetime.datetime.fromtimestamp(ts).year)

In [10]:
tags.sample(10)

Unnamed: 0,user_id,movie_id,tag,timestamp,year
56826,11248,97752,multiple storylines,1355438408,2012
442181,130827,38824,traumatic childhood,1227626017,2008
36710,9815,1343,serial killer,1400935071,2014
114935,28599,94150,based on a true story,1394668108,2014
329673,97198,3527,classic,1311157805,2011
393226,122523,26082,realistic action,1420576822,2015
86252,22074,48780,Scarlett Johansson,1368850974,2013
53707,11081,8507,TREACHEROUS SPOUSES,1172502842,2007
58579,12271,4621,John Travolta,1165704878,2006
138307,38615,1206,Nudity (Topless - Brief),1205616101,2008


## Ratings

In [11]:
ratings = pd.read_csv('data/ml-20m/ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 610.4 MB


In [12]:
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

In [13]:
ratings['year'] = ratings.timestamp.apply(lambda ts: datetime.datetime.fromtimestamp(ts).year)

In [14]:
ratings.sample(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,year
5811064,40008,2959,4.0,1231036229,2009
2895131,19630,1472,3.0,945068532,1999
15624435,108063,4235,5.0,1170622829,2007
14179476,97891,292,3.0,1233752577,2009
14113568,97491,594,3.0,1102513683,2004
6937387,47816,708,3.5,1096917985,2004
4668987,32053,3827,3.0,1035986204,2002
2568462,17404,1909,3.0,1249324050,2009
18110383,125268,2734,1.0,943456957,1999
14339942,99067,3111,3.0,986016633,2001


## Write to disk

In [15]:
movies.to_csv('data/movies_clean.csv', index=False)
ratings.to_csv('data/ratings_clean.csv', index=False)
tags.to_csv('data/tags_clean.csv', index=False)

In [16]:
ratings = ratings.sample(frac=1.0)
train_lim = int(0.8 * len(ratings))
ratings[:train_lim].to_csv('data/ratings_train.csv', index=False)
ratings[train_lim:].to_csv('data/ratings_test.csv', index=False)