# Exploratory Data Analysis (EDA) of the MovieLens Dataset

This notebook is an exploratory data analysis (EDA) of the MovieLens dataset. The ratings are integers between 1 and 5. The dataset also contains the genres of each movie (a movie can belong to several genres).

In [1]:
import pandas as pd

In [2]:
# The CSV files are expected to already be in the data folder.
# If they are not there yet, run prepare_data() on a datamodule to load the data.

ratings_csv = 'data/ratings.csv'
movies_csv = 'data/movies.csv'

ratings = pd.read_csv(ratings_csv)
movie_meta = pd.read_csv(movies_csv)

In [3]:
# Drop movies never rated: keep only the movies whose movieId occurs in ratings.
# The explicit .copy() makes the filtered frame an independent object, so any
# later in-place column assignment on movie_meta does not raise pandas'
# SettingWithCopyWarning (relevant on pandas versions without copy-on-write).
print(f"Before: {len(movie_meta)}")
movie_meta = movie_meta[movie_meta['movieId'].isin(ratings['movieId'])].copy()
print(f"After: {len(movie_meta)}")

Before: 86537
After: 83239


In [4]:
# Add a "datetime" column by converting the numeric "timestamp" column,
# with the values interpreted in seconds (unit='s').
ratings["datetime"] = ratings["timestamp"].pipe(pd.to_datetime, unit='s')


In [5]:
# A four-digit number in parentheses inside the title, e.g. "(1995)".
year_pattern = r"\((\d{4})\)"

# expand=False returns a Series (the default returns a one-column DataFrame);
# pd.to_numeric keeps the column numeric instead of mixing strings and ints.
title_year = pd.to_numeric(movie_meta["title"].str.extract(year_pattern, expand=False))

# Fill missing year values with the year of the movie's earliest rating.
# groupby().min() gives the same per-movie first datetime as sorting the whole
# ratings table and keeping the first row per movie, without the full sort and
# without a Python-level apply per row.
missing_mask = title_year.isna()
mapping = ratings.groupby("movieId")["datetime"].min().dt.year
title_year[missing_mask] = movie_meta.loc[missing_mask, "movieId"].map(mapping)

print(f"Dropping {title_year.isna().sum()} movies with missing year")
# assign() returns a new frame, so no column is set on a filtered slice.
movie_meta = movie_meta.assign(year=title_year).dropna(subset=["year"])

# Final cleanup: integer year, title with the "(YYYY)" part removed and
# surrounding whitespace stripped, and movieId renamed to movie_id.
movie_meta = movie_meta.assign(
    year=movie_meta["year"].astype(int),
    title=movie_meta["title"].str.replace(year_pattern, "", regex=True).str.strip(),
).rename(columns={"movieId": "movie_id"})

Dropping 0 movies with missing year


In [6]:
# Make genre dummies: split the "|"-separated genres string into one 0/1
# indicator column per genre, then normalise the column names to
# lower-case with "-" replaced by "_".
genre_dummies = (
    movie_meta.genres.str.get_dummies(sep='|')
    # errors="ignore": the placeholder column only exists when at least one
    # movie has no genre listed; without it drop() would raise a KeyError.
    .drop(columns="(no genres listed)", errors="ignore")
    .rename(columns=lambda x: x.lower().replace("-", "_"))
)

In [8]:
# Keep only the identifying columns and place the genre indicator columns
# next to them, side by side.
id_columns = ["movie_id", "title", "year"]
movie_meta = pd.concat((movie_meta[id_columns], genre_dummies), axis="columns")


In [9]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
movie_meta.info()
display(movie_meta.head())

<class 'pandas.core.frame.DataFrame'>
Index: 83239 entries, 0 to 86536
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movie_id     83239 non-null  int64 
 1   title        83239 non-null  object
 2   year         83239 non-null  int64 
 3   action       83239 non-null  int64 
 4   adventure    83239 non-null  int64 
 5   animation    83239 non-null  int64 
 6   children     83239 non-null  int64 
 7   comedy       83239 non-null  int64 
 8   crime        83239 non-null  int64 
 9   documentary  83239 non-null  int64 
 10  drama        83239 non-null  int64 
 11  fantasy      83239 non-null  int64 
 12  film_noir    83239 non-null  int64 
 13  horror       83239 non-null  int64 
 14  imax         83239 non-null  int64 
 15  musical      83239 non-null  int64 
 16  mystery      83239 non-null  int64 
 17  romance      83239 non-null  int64 
 18  sci_fi       83239 non-null  int64 
 19  thriller     83239 non-null  i

None

Unnamed: 0,movie_id,title,year,action,adventure,animation,children,comedy,crime,documentary,...,film_noir,horror,imax,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
