# Loading and Preprocessing the data from MovieLens

In [19]:
# Libraries
import pandas as pd
import os

In [6]:
# Input locations: the folder holding the raw data files, followed by the
# names of the three raw '.dat' files that are read from that folder.
data_directory = 'data'
users_data, movies_data, rating_data = 'users.dat', 'movies.dat', 'ratings.dat'

In [7]:
# CSV file names the processed tables are saved under
users_to_csv, movies_to_csv, ratings_to_csv = 'users.csv', 'movies.csv', 'ratings.csv'

In [24]:
# Loading the Ratings file data
# The fields are separated by the two-character string '::'. pandas treats a
# multi-character separator as a regular expression, which only the Python
# parsing engine supports, hence engine='python'. Column names are supplied
# explicitly through `names`.
ratings = pd.read_csv(
    os.path.join(data_directory, rating_data),
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
# Largest user id and largest movie id that occur in the ratings.
# .max() is applied to the column directly: the maximum of a column is the same
# as the maximum of its unique values, so de-duplicating first adds nothing.
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

The following cell prepares the ratings data for training a deep learning model with the Keras library.
* It adds two new columns to the ratings dataframe: "user_emb_id" and "movie_emb_id".
* "user_emb_id" is set to the corresponding user_id value minus one. The model will use embedding layers to represent the users and movies as numerical vectors, and an embedding layer in Keras expects its integer inputs to be zero-indexed.
* "movie_emb_id" is set to the corresponding movie_id value minus one, again so that the inputs are zero-indexed.
* The extended dataframe is then saved to ratings.csv as a tab-separated file, and the last line prints the number of ratings that were loaded.

In [25]:
# Add user_emb_id column whose values == user_id - 1
ratings['user_emb_id'] = ratings['user_id'].sub(1)
# Add movie_emb_id column whose values == movie_id - 1
ratings['movie_emb_id'] = ratings['movie_id'].sub(1)

# Save into ratings.csv: tab-separated, with a header row, containing the four
# original columns followed by the two new zero-based id columns.
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id']
ratings.to_csv(ratings_to_csv, sep='\t', header=True, encoding='latin-1', columns=ratings_columns)

# Report how many rating rows the dataframe holds.
print('In total', len(ratings), 'ratings were loaded.')

In total 1000209 ratings were loaded.


In [22]:
# Updating the Users file data with the full information
# Lookup tables that translate the numeric codes stored in the 'age' and
# 'occupation' columns into human-readable labels.
ages = { 1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 45: "45-49", 50: "50-55", 56: "56+" }
occupations = { 0: "other or not specified", 1: "academic/educator", 2: "artist", 3: "clerical/admin",
                4: "college/grad student", 5: "customer service", 6: "doctor/health care",
                7: "executive/managerial", 8: "farmer", 9: "homemaker", 10: "K-12 student", 11: "lawyer",
                12: "programmer", 13: "retired", 14: "sales/marketing", 15: "scientist", 16: "self-employed",
                17: "technician/engineer", 18: "tradesman/craftsman", 19: "unemployed", 20: "writer" }

# Loading the Users file data
# '::' is a multi-character separator, which requires the Python parsing engine.
users = pd.read_csv(os.path.join(data_directory, users_data),
                    sep='::',
                    engine='python',
                    encoding='latin-1',
                    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'])
# Attach the readable labels. The dictionaries are indexed directly so that a
# code missing from a lookup table raises KeyError instead of going unnoticed.
users['age_desc'] = users['age'].apply(lambda x: ages[x])
users['occ_desc'] = users['occupation'].apply(lambda x: occupations[x])

# Save to users.csv
# The file name comes from the users_to_csv constant, like the ratings and
# movies exports, rather than from a second hard-coded copy of the name.
users.to_csv(path_or_buf=users_to_csv,
             sep='\t',
             header=True,
             encoding='latin-1',
             columns=['user_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])

In [23]:
# Read the Movies File
# The same three column names are used both to label the parsed fields and to
# select the columns written back out.
movie_columns = ['movie_id', 'title', 'genres']
movies_path = os.path.join(data_directory, movies_data)
movies = pd.read_csv(movies_path, sep='::', engine='python', encoding='latin-1', names=movie_columns)

# Save into movies.csv (tab-separated, with a header row)
# NOTE(review): unlike the ratings and users exports in this notebook, no
# `encoding` is passed here, so pandas' default encoding is used for this file.
# Confirm that whatever reads movies.csv expects that.
movies.to_csv(movies_to_csv, sep='\t', header=True, columns=movie_columns)