In [1]:
# we are using the movielens latest dataset, available here 
# http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('precision',2)

In [4]:
data_path = '../ml-latest'
all_files = [i for i in os.listdir(data_path) if i.endswith('.csv')]
print(all_files)

['tags.csv', 'links.csv', 'movies.csv', 'ratings.csv']


In [5]:
def get_data(fn):
    df = pd.read_csv(os.path.join(data_path, '{}.csv'.format(fn)))
    print('{} total: {} rows.'.format(fn, len(df)))
    return df

In [6]:
movies = get_data('movies')
movies.head()

movies total: 9125 rows.


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# extract the year from movie title
movies.title = movies.title.apply(lambda x: x.split(' ('))
movies['year'] = movies.title.apply(lambda x: x[1][:4] if len(x)==2 else 0).astype(int)
movies.title = movies.title.apply(lambda x: x[0])

In [7]:
# split genres and binarize them

movies.genres = movies.genres.apply(lambda x: x.split('|'))

all_genres = list(set([item for sublist in movies.genres.tolist() for item in sublist]))

from sklearn.preprocessing import label_binarize
movies.genres = movies.genres.apply(lambda x: np.sum(label_binarize(x, all_genres), axis=0))

movies = pd.concat([movies, pd.DataFrame(movies.genres.tolist(), columns=all_genres)], axis=1)
movies.head()

Unnamed: 0,movieId,title,genres,Animation,Horror,Adventure,Sci-Fi,IMAX,Drama,Mystery,...,Children,Documentary,War,Musical,Film-Noir,Thriller,(no genres listed),Action,Fantasy,Romance
0,1,Toy Story (1995),"[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...",1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,2,Jumanji (1995),"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,3,Grumpier Old Men (1995),"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,Waiting to Exhale (1995),"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5,Father of the Bride Part II (1995),"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
tags = get_data('tags')
tags.head()

tags total: 1296 rows.


Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [18]:
tags['tag_length'] = tags.tag.apply(lambda x: len(str(x)))
tags.sort_values(by='tag_length', ascending=False).head(20)

Unnamed: 0,userId,movieId,tag,timestamp,tag_length
133,212,5507,play enough video games and you can become an ...,1253930345,55
1289,660,260,"imaginary world, characters, story, philosophical",1436680217,49
772,531,1028,villain nonexistent or not needed for good story,1243454382,48
336,353,31221,Try not to mistake this for an episode of Alias,1140389595,47
333,353,4721,As historicaly correct as Germany winning WW2,1140389056,45
539,402,260,"space epic, science fiction, hero's journey",1443393664,43
841,531,64969,easily confused with other movie(s) (title),1243454548,43
335,353,7376,"The Rocks ""finest"" work need I say more?",1140389511,40
668,431,6539,I loved it! Seen it five times already!,1140454336,39
148,212,27904,interesting concept - bad execution,1253931234,35


In [19]:
tags.tag.value_counts().head(20)

getdvd        33
Ei muista     29
tivo          26
toplist07     26
toplist12     20
tcm           20
toplist11     20
toplist08     19
toplist15     19
toplist10     18
toplist09     18
toplist14     18
holes00s      16
holes70s      16
toplist06     16
funny         16
toplist13     14
holes80s      14
sightsound    14
holes40s      13
Name: tag, dtype: int64

In [None]:
len(tags.tag.unique())

In [None]:
tags.tag.value_counts().plot.bar()

In [None]:
ratings = get_data('ratings')
ratings.head()

In [None]:
ratings.groupby('movieId').rating.mean().hist(bins=100)
# -> right skewed!

In [None]:
ratings.groupby('userId').rating.mean().hist(bins=100)
# right skewed! some users only rate 4* or 5*

In [None]:
links = get_data('links')
links.head()

In [None]:
gtags = get_data('genome-tags')
gtags.head()

In [None]:
gscores = get_data('genome-scores')
gscores.head()