In [87]:
import numpy as np
import pandas as pd

from datetime import datetime

# Loading Data

In [88]:
## read the movie catalogue from disk and preview its first rows
movies = pd.read_csv(filepath_or_buffer="data/movies.csv")
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [89]:
## read the per-user rating events from disk and preview the first rows
users = pd.read_csv(filepath_or_buffer="data/ratings.csv")
users.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Data Cleaning

In [90]:
## Movies
## Adds four derived columns to `movies`: "movie", "name", "year", "old".

## remove movies with no listed genres; .copy() makes the result an
## independent frame so the column assignments below do not write to a
## slice of the original (avoids SettingWithCopyWarning)
movies = movies[movies["genres"] != "(no genres listed)"].copy()

## assign new contiguous movie id (0 .. len-1) to "movie"
movies["movie"] = np.arange(len(movies))

## trailing release-year suffix of a title: "(YYYY)" or "(YYYY-YYYY)"
## (hyphen or en dash), optionally surrounded by whitespace; the first
## year is captured
year_suffix = r"\s*\((\d{4})(?:[-\u2013]\d{4})?\)\s*$"

## extract the movie's name from the "title" column by removing only the
## trailing year suffix, so titles that start with or contain other
## parentheses (e.g. "(500) Days of Summer (2009)") keep their full name
movies["name"] = (
    movies["title"]
    .str.replace(year_suffix, "", regex=True)
    .str.strip()
)

## extract the movie's year from the "title" column; titles without a
## year suffix get NaN instead of raising, hence the float dtype
movies["year"] = (
    movies["title"]
    .str.extract(year_suffix, expand=False)
    .astype(float)
)

## new feature: "old" is 1 if year is < 2000, o/w 0
## (a missing year compares False, so it is flagged 0)
movies["old"] = (movies["year"] < 2000).astype(int)
movies.head()

Unnamed: 0,movieId,title,genres,movie,name,year,old
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,Toy Story,1995.0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,Jumanji,1995.0,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2,Grumpier Old Men,1995.0,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,Waiting to Exhale,1995.0,1
4,5,Father of the Bride Part II (1995),Comedy,4,Father of the Bride Part II,1995.0,1


In [91]:
## Users
## Adds "user", "daytime" and "weekend" to `users` and converts
## "timestamp" from epoch seconds to datetimes.

## scale new user id starting at 0 and assign to "user"
users["user"] = users["userId"] - 1

## converting timestamp int into Timestamp
## Only convert while the column is still numeric, so re-running this cell
## after the column has already been converted does not raise.
## NOTE(review): datetime.fromtimestamp returns local time, so "daytime"
## and "weekend" depend on the timezone of the machine running the
## notebook -- pin an explicit timezone if reproducibility matters.
if pd.api.types.is_numeric_dtype(users["timestamp"]):
    users["timestamp"] = users["timestamp"].apply(datetime.fromtimestamp)

## flagging if timestamp is during the daytime (6 < hour < 20, i.e. 07..19)
users["daytime"] = users["timestamp"].dt.hour.between(7, 19).astype(int)

## flagging if timestamp is on the weekend (Monday=0, so 5/6 are Sat/Sun)
users["weekend"] = users["timestamp"].dt.dayofweek.isin([5, 6]).astype(int)

users.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,daytime,weekend
0,1,1,4.0,2000-07-30 11:45:03,0,1,1
1,1,3,4.0,2000-07-30 11:20:47,0,1,1
2,1,6,4.0,2000-07-30 11:37:04,0,1,1
3,1,47,5.0,2000-07-30 12:03:35,0,1,1
4,1,50,5.0,2000-07-30 11:48:51,0,1,1


Unnamed: 0,userId,movieId,rating,timestamp,user,daytime,weekend
0,1,1,4.0,2000-07-30 11:45:03,0,1,1
1,1,3,4.0,2000-07-30 11:20:47,0,1,1
2,1,6,4.0,2000-07-30 11:37:04,0,1,1
3,1,47,5.0,2000-07-30 12:03:35,0,1,1
4,1,50,5.0,2000-07-30 11:48:51,0,1,1
...,...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03 14:53:22,609,1,0
100832,610,168248,5.0,2017-05-03 15:21:31,609,1,0
100833,610,168250,5.0,2017-05-08 12:50:47,609,1,0
100834,610,168252,5.0,2017-05-03 14:19:12,609,1,0
