In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


# Loading in the data

In [3]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive so that the dataset files stored on
# the Drive can be read through that path by the loading cell below.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load the three separate ml-1m files (ratings, users, movies)
# Locations of the raw files on the mounted Drive
ratings_file = '/content/drive/My Drive/Thesis/Data/ml-1m/ratings.dat'
users_file = '/content/drive/My Drive/Thesis/Data/ml-1m/users.dat'
movies_file = '/content/drive/My Drive/Thesis/Data/ml-1m/movies.dat'

# Reader options shared by all three files: fields are delimited by '::',
# there is no header row, and the multi-character separator needs the
# python parsing engine.
dat_options = dict(sep='::', header=None, engine='python')

# Column names are supplied by hand because the files carry no header
ratings = pd.read_csv(
    ratings_file,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_options,
)
users = pd.read_csv(
    users_file,
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **dat_options,
)
# Only the movies file is read with an explicit ISO-8859-1 encoding
movies = pd.read_csv(
    movies_file,
    names=['MovieID', 'Title', 'Genres'],
    encoding='ISO-8859-1',
    **dat_options,
)

In [22]:
# Combine the separate tables into one frame with a row per rating:
# first attach the rater's attributes (key: UserID), then the rated
# movie's attributes (key: MovieID). Both joins use the default inner join.
ratings_plus_users = ratings.merge(users, on='UserID')
df = ratings_plus_users.merge(movies, on='MovieID')

# Feature Engineering

In [23]:
# Create time component features from the rating timestamp.
# The raw Timestamp column holds Unix epoch seconds. Converting with
# pd.to_datetime(unit='s') is vectorized and always yields UTC wall-clock
# values, so the derived features do not depend on the timezone of the
# machine running the notebook (datetime.fromtimestamp uses local time).
rated_at = pd.to_datetime(df['Timestamp'], unit='s')
df['Year'] = rated_at.dt.year
df['Month'] = rated_at.dt.month
df['Day'] = rated_at.dt.day
df['Hour'] = rated_at.dt.hour

# Removing the unnecessary features (Zip-code and the raw timestamp, which
# is now represented by the four components above)
df = df.drop(columns=["Timestamp", "Zip-code"])

In [24]:
# Work on an independent (deep) copy so that the merged frame `df` stays
# untouched by the feature engineering and encoding steps that follow.
df_temp = df.copy(deep=True)

In [25]:
# Replace the numeric occupation codes with readable labels so that the
# one-hot encoded columns created later get meaningful names.
# The position of a label in this list is its occupation code (0-20).
occupation_labels = [
    "Other or not specified",
    "Academic/educator",
    "Artist",
    "Clerical/admin",
    "College/grad student",
    "Customer service",
    "Doctor/health care",
    "Executive/managerial",
    "Farmer",
    "Homemaker",
    "K-12 student",
    "Lawyer",
    "Programmer",
    "Retired",
    "Sales/marketing",
    "Scientist",
    "Self-employed",
    "Technician/engineer",
    "Tradesman/craftsman",
    "Unemployed",
    "Writer",
]
occupation_mapping = dict(enumerate(occupation_labels))

df_temp['Occupation'] = df_temp['Occupation'].map(occupation_mapping)

In [26]:
# Extract the release year from the title and strip it from the title text.
# The pattern is a raw string (so `\(` and `\d` are not treated as invalid
# string escapes) and is anchored to the end of the title, matching the
# " (YYYY)" suffix that the slice below removes. The captured digits are
# cast straight to integers; a title without the suffix yields NaN and makes
# the cast fail loudly instead of producing a wrong year.
df_temp['Release_year'] = (
    df_temp['Title'].str.extract(r"\((\d{4})\)\s*$", expand=False).astype(int)
)
# Drop the trailing " (YYYY)" (7 characters) from the title
df_temp['Title'] = df_temp['Title'].str[:-7]

In [27]:
# The title text is not used as a feature; movies stay identifiable
# through MovieID, so the Title column is removed.
df_temp = df_temp.drop(columns=['Title'])

In [29]:
# Create the favourite genre feature: for every user, the genre with the
# highest mean rating over the movies that user rated.
# NOTE(review): the feature is derived from the Rating column of all rows;
# confirm this is acceptable if Rating is also the prediction target.

# One row per (rating, genre). Only the columns needed for the averages are
# exploded, so the full feature frame is not duplicated once per genre and
# no temporary list column has to be added to (and removed from) df_temp.
df_expanded = (
    df_temp[['UserID', 'Rating']]
    .assign(Genres_list=df_temp['Genres'].str.split('|'))
    .explode('Genres_list')
)

# Calculate the average rating per user and genre
user_genre_avg = (
    df_expanded.groupby(['UserID', 'Genres_list'])['Rating'].mean().reset_index()
)

# idxmax returns the first row holding the maximum, so when several genres
# tie for a user the first one in the sorted group order is kept.
best_rows = user_genre_avg.groupby('UserID')['Rating'].idxmax()

# rename returns a new frame; nothing is modified in place on the selection
favorite_genre = user_genre_avg.loc[best_rows].rename(
    columns={'Genres_list': 'Favourite_genre'}
)

# Attach the favourite genre to every rating row of the corresponding user
df_temp = df_temp.merge(
    favorite_genre[['UserID', 'Favourite_genre']], on='UserID', how='left'
)


In [30]:
# Number of years between the movie's release and the moment it was rated
df_temp['Time_release_to_rating'] = df_temp['Year'].sub(df_temp['Release_year'])

# How many rating rows exist for each movie, repeated on every row of
# that movie
ratings_per_movie = df_temp['MovieID'].value_counts()
df_temp['Total_ratings_per_movie'] = df_temp['MovieID'].map(ratings_per_movie)

# How many rating rows exist for each user, repeated on every row of
# that user
ratings_per_user = df_temp['UserID'].value_counts()
df_temp['Total_ratings_per_user'] = df_temp['UserID'].map(ratings_per_user)

In [31]:
# Preview the first rows to inspect the engineered feature columns
df_temp.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,Genres,Year,Month,Day,Hour,Release_year,Favourite_genre,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user
0,1,1193,5,F,1,K-12 student,Drama,2000,12,31,22,1975,War,25,1725,53
1,1,661,3,F,1,K-12 student,Animation|Children's|Musical,2000,12,31,22,1996,War,4,525,53
2,1,914,3,F,1,K-12 student,Musical|Romance,2000,12,31,22,1964,War,36,636,53
3,1,3408,4,F,1,K-12 student,Drama,2000,12,31,22,2000,War,0,1315,53
4,1,2355,5,F,1,K-12 student,Animation|Children's|Comedy,2001,1,6,23,1998,War,3,1703,53


# Encoding features

Features that need encoding: Gender, Age (because it is categorical), Occupation, Genres and Favourite_genre.

In [33]:
# One-hot encode Gender. The empty prefix makes the indicator columns carry
# the raw category values ('F' / 'M'), which are then given readable names.
df_temp = (
    pd.get_dummies(df_temp, columns=['Gender'], prefix='', prefix_sep='')
    .rename(columns={'F': 'Female', 'M': 'Male'})
)

In [34]:
# Label encode Age: the distinct Age values are replaced by consecutive
# integer codes. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder().fit(df_temp['Age'])
df_temp['Age'] = label_encoder.transform(df_temp['Age'])

In [35]:
# One-Hot encode Occupation. With an empty prefix and separator, each
# indicator column is named after the occupation label itself.
df_temp = pd.get_dummies(
    df_temp,
    columns=['Occupation'],
    prefix='',
    prefix_sep='',
)

In [36]:
# Multi-hot encode Genres: a movie can list several genres separated by '|',
# so each genre becomes its own boolean column.
genres_split = df_temp['Genres'].str.get_dummies(sep='|').astype(bool)

# Swap the original Genres string column for the indicator columns
# (both frames share the same row index).
df_temp = df_temp.drop(columns=['Genres']).join(genres_split)


In [37]:
# One-Hot encode Favourite Genre. The 'Favourite_' prefix keeps these
# columns distinct from the per-movie genre indicator columns.
df_temp = pd.get_dummies(
    df_temp,
    columns=['Favourite_genre'],
    prefix='Favourite',
    prefix_sep='_',
)

Creating a subset of the dataframe due to resource constraints

In [40]:
# Down-sample the data for efficiency: the split assigns 60% of the rows to
# the discarded part, so df_final keeps the remaining 40%.
from sklearn.model_selection import train_test_split

df_final, _ = train_test_split(
    df_temp,
    test_size=0.6,
    # stratify on Rating so the sample keeps the rating distribution
    stratify=df_temp[['Rating']],
    # fixed seed so the same rows are selected on every run
    random_state=42,
)

# Check for missing values

In [45]:
# Total number of null cells in df_final: flatten the boolean null mask to
# an array and add up its True entries.
null_mask = df_final.isnull()
null_values = null_mask.to_numpy().sum()
print(f"Number of null values: {null_values}")

Number of null values: 0


In [44]:
# Total number of NaN cells in df_final: count them per row first, then add
# up the per-row counts.
nan_per_row = df_final.isna().sum(axis=1)
nan_values = nan_per_row.sum()
print(f"Number of NaN values: {nan_values}")

Number of NaN values: 0


No missing values were found. However, not every user rated every movie, so the unobserved user–movie pairs could be considered missing values.

# Saving the dataframe

In [47]:
# Save df_final for model input - feature engineered, encoded and
# down-sampled. NOTE(review): no scaling step is applied in the cells shown
# here (StandardScaler is imported but not used) - confirm whether scaling
# is meant to happen before saving or in a later step.
# index=False leaves the row index out of the CSV file.
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_2.csv', index=False)