In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
print('movies: ', movies.shape)
print('ratings: ', ratings.shape)
print('tags: ', tags.shape)

movies:  (9742, 3)
ratings:  (100836, 4)
tags:  (3683, 4)


In [3]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
df = pd.merge(ratings, movies, on='movieId' , how='left')
df = df.drop('title', axis=1)
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Comedy|Romance
2,1,6,4.0,964982224,Action|Crime|Thriller
3,1,47,5.0,964983815,Mystery|Thriller
4,1,50,5.0,964982931,Crime|Mystery|Thriller


In [7]:
df['genres'] = df['genres'].str.split('|')

In [8]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,964981247,"[Comedy, Romance]"
2,1,6,4.0,964982224,"[Action, Crime, Thriller]"
3,1,47,5.0,964983815,"[Mystery, Thriller]"
4,1,50,5.0,964982931,"[Crime, Mystery, Thriller]"


In [9]:
tags['tag'] = tags['tag'].str.split('|')
tags.drop('timestamp', axis=1, inplace=True)

In [10]:
tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()
tags.head(5)

Unnamed: 0,userId,movieId,tag
0,2,60756,"['funny'],['Highly quotable'],['will ferrell']"
1,2,89774,"['Boxing story'],['MMA'],['Tom Hardy']"
2,2,106782,"['drugs'],['Leonardo DiCaprio'],['Martin Scors..."
3,7,48516,['way too long']
4,18,431,"['Al Pacino'],['gangster'],['mafia']"


In [11]:
df = pd.merge(df, tags, on=['userId','movieId'], how='left')

In [12]:
df.shape

(100836, 6)

In [13]:
df['tag'] = df['tag'].apply(lambda d: d if isinstance(d, list) else [])
df['genres'] = df['genres'].apply(lambda d: d if isinstance(d, list) else [])

In [14]:
df

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"[Adventure, Animation, Children, Comedy, Fantasy]",[]
1,1,3,4.0,964981247,"[Comedy, Romance]",[]
2,1,6,4.0,964982224,"[Action, Crime, Thriller]",[]
3,1,47,5.0,964983815,"[Mystery, Thriller]",[]
4,1,50,5.0,964982931,"[Crime, Mystery, Thriller]",[]
5,1,70,3.0,964982400,"[Action, Comedy, Horror, Thriller]",[]
6,1,101,5.0,964980868,"[Adventure, Comedy, Crime, Romance]",[]
7,1,110,4.0,964982176,"[Action, Drama, War]",[]
8,1,151,5.0,964984041,"[Action, Drama, Romance, War]",[]
9,1,157,5.0,964984100,"[Comedy, War]",[]


In [16]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.2, stratify=df.userId)

In [26]:
train_data = train_data.sort_values(['userId', 'movieId'])
train_data

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"[Adventure, Animation, Children, Comedy, Fantasy]",[]
2,1,6,4.0,964982224,"[Action, Crime, Thriller]",[]
3,1,47,5.0,964983815,"[Mystery, Thriller]",[]
4,1,50,5.0,964982931,"[Crime, Mystery, Thriller]",[]
6,1,101,5.0,964980868,"[Adventure, Comedy, Crime, Romance]",[]
8,1,151,5.0,964984041,"[Action, Drama, Romance, War]",[]
9,1,157,5.0,964984100,"[Comedy, War]",[]
10,1,163,5.0,964983650,"[Action, Romance, Western]",[]
11,1,216,5.0,964981208,[Comedy],[]
12,1,223,3.0,964980985,[Comedy],[]


In [27]:
test_data = test_data.sort_values(['userId','movieId'])
test_data

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
1,1,3,4.0,964981247,"[Comedy, Romance]",[]
5,1,70,3.0,964982400,"[Action, Comedy, Horror, Thriller]",[]
7,1,110,4.0,964982176,"[Action, Drama, War]",[]
16,1,296,3.0,964982967,"[Comedy, Crime, Drama, Thriller]",[]
24,1,441,4.0,964980868,[Comedy],[]
25,1,457,5.0,964981909,[Thriller],[]
31,1,553,5.0,964984153,"[Action, Drama, Western]",[]
32,1,590,4.0,964982546,"[Adventure, Drama, Western]",[]
36,1,608,5.0,964982931,"[Comedy, Crime, Drama, Thriller]",[]
39,1,673,3.0,964981775,"[Adventure, Animation, Children, Comedy, Fanta...",[]


In [28]:
train_data.to_csv('training_data.csv')
test_data.to_csv('testing_data.csv')