In [38]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the previously saved, unscaled dataframes from disk
# (df_final_unscaled.csv and df_sampled_unscaled.csv).
df_final_unscaled = pd.read_csv(
    filepath_or_buffer='/Users/femke/Documents/Uni/DSS/Thesis/Data/df_final_unscaled.csv'
)
df_sampled_unscaled = pd.read_csv(
    filepath_or_buffer="/Users/femke/Documents/Uni/DSS/Thesis/Data/df_sampled_unscaled.csv"
)


# Loading in the data

In [None]:
# Load the separate ml-1m datasets (ratings, users, movies)

# Locations of the raw files
ratings_file = '/Users/femke/Documents/Uni/DSS/Thesis/Data/ml-1m/ratings.dat'
users_file = '/Users/femke/Documents/Uni/DSS/Thesis/Data/ml-1m/users.dat'
movies_file = '/Users/femke/Documents/Uni/DSS/Thesis/Data/ml-1m/movies.dat'

# Reader options shared by all three files: '::' separated, no header row.
# A multi-character separator needs the python parsing engine.
dat_options = {'sep': '::', 'header': None, 'engine': 'python'}

ratings = pd.read_csv(
    ratings_file,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_options,
)
users = pd.read_csv(
    users_file,
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **dat_options,
)
movies = pd.read_csv(
    movies_file,
    names=['MovieID', 'Title', 'Genres'],
    encoding='ISO-8859-1',
    **dat_options,
)

In [39]:
# Merge the separate datasets into one frame:
# ratings + user attributes (on UserID), then + movie attributes (on MovieID).
ratings_plus_users = ratings.merge(users, on='UserID')
df = ratings_plus_users.merge(movies, on='MovieID')

# Feature Engineering

In [40]:
# Create time component features (Month, Day, Hour) from the rating timestamp.
#
# The timestamps are Unix epoch seconds. pd.to_datetime(..., unit='s') interprets them
# as UTC, so the derived features no longer depend on the time zone of the machine that
# runs the notebook (datetime.fromtimestamp converts to *local* time), and the
# conversion is vectorised instead of one Python call per row.
rating_time = pd.to_datetime(df['Timestamp'], unit='s')
#df['Year'] = rating_time.dt.year
df['Month'] = rating_time.dt.month
df['Day'] = rating_time.dt.day
df['Hour'] = rating_time.dt.hour

# Removing the unnecessary features (Zip-code and the raw timestamp)
df = df.drop(columns=["Timestamp", "Zip-code"])

In [41]:
# Work on a separate copy so later assignments to df_temp do not touch df
df_temp = df.copy(deep=True)

In [None]:
# # Prepare df_temp (used for processing)
# df_temp = df.copy()

# # Change the names of the categories for the Occupation feature
# occupation_mapping = {
#     0: "Other or not specified", 1: "Academic/educator", 2: "Artist",
#     3: "Clerical/admin", 4: "College/grad student", 5: "Customer service",
#     6: "Doctor/health care", 7: "Executive/managerial", 8: "Farmer",
#     9: "Homemaker", 10: "K-12 student", 11: "Lawyer", 12: "Programmer",
#     13: "Retired", 14: "Sales/marketing", 15: "Scientist", 16: "Self-employed",
#     17: "Technician/engineer", 18: "Tradesman/craftsman", 19: "Unemployed",
#     20: "Writer"
# }
# df_temp['Occupation'] = df_temp['Occupation'].map(occupation_mapping)

In [42]:
# Create the Release_year feature and strip the year from the title.
# Titles end in " (YYYY)": the raw-string pattern captures the four digits of that
# trailing group (a non-raw "\(" / "\d" is an invalid escape sequence in a normal
# string literal), and the captured text is converted straight to an integer instead
# of going through a datetime round trip.
release_year = df_temp['Title'].str.extract(r"\((\d{4})\)\s*$", expand=False)
df_temp['Release_year'] = release_year.astype('int32')
# Drop the trailing " (YYYY)" (7 characters) so only the title text remains
df_temp['Title'] = df_temp['Title'].str[:-7]

# # Sort df_temp
# column_order = ['UserID', 'MovieID', 'Rating', 'Year', 'Month', 'Day', 'Hour','Gender', 'Age', 'Occupation',
#                 'Title', 'Release_year', 'Genres']
# df_temp = df_temp[column_order]

In [43]:
# Drop the Title column; movies stay identifiable through MovieID
df_temp = df_temp.drop(columns=['Title'])

In [44]:
# Preview the first rows of df_temp after adding Release_year and dropping Title
df_temp.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,Genres,Month,Day,Hour,Release_year
0,1,1193,5,F,1,10,Drama,12,31,22,1975
1,1,661,3,F,1,10,Animation|Children's|Musical,12,31,22,1996
2,1,914,3,F,1,10,Musical|Romance,12,31,22,1964
3,1,3408,4,F,1,10,Drama,12,31,22,2000
4,1,2355,5,F,1,10,Animation|Children's|Comedy,1,6,23,1998


In [None]:
# # Create the average rating per movie feature
# df_temp['Avg_rating_movie'] = df_temp.groupby('MovieID')['Rating'].transform('mean').round(2)

In [None]:
# # Create the deviation each rating is for the average rating of that movie feature
# df_temp['Dev_movie_avg'] = df_temp['Rating'] - df_temp['Avg_rating_movie'].round()

In [None]:
# # Create the average each user differs from the mean Rating given to a movie feature
# df_temp['Avg_dev_movie_avg'] = df_temp.groupby('UserID')['Dev_movie_avg'].transform('mean').round(2)

In [None]:
# # Check the range of the average deviation from the movie average
# max_value = df_temp['Avg_dev_movie_avg'].max()
# min_value = df_temp['Avg_dev_movie_avg'].min()

# print("Highest value in Avg_dev_movie_avg:", max_value)
# print("Lowest value in Avg_dev_movie_avg:", min_value)

Highest value in Avg_dev_movie_avg: 1.73
Lowest value in Avg_dev_movie_avg: -2.32


In [45]:
# Create the total ratings per user feature:
# count the rows of each UserID and attach that count to every row of the user.
ratings_per_user = df_temp['UserID'].value_counts()
df_temp['Total_ratings_per_user'] = df_temp['UserID'].map(ratings_per_user)

In [46]:
# Create the favourite genre feature: per user, the genre with the highest mean rating.

# One row per (rating, genre). The list-of-genres helper column lives only on the
# exploded copy, so df_temp itself is never given a temporary column to clean up.
df_expanded = df_temp.assign(Genres_list=df_temp['Genres'].str.split('|')).explode('Genres_list')

# Mean rating per user and genre
user_genre_avg = df_expanded.groupby(['UserID', 'Genres_list'])['Rating'].mean().reset_index()

# Row label of the best-rated genre of every user. idxmax returns the first maximum,
# and the groupby result is sorted by genre, so ties go to the alphabetically first genre.
best_rows = user_genre_avg.groupby('UserID')['Rating'].idxmax()

# rename() returns a new frame; renaming a .loc selection in place would raise
# SettingWithCopyWarning.
favorite_genre = user_genre_avg.loc[best_rows].rename(columns={'Genres_list': 'Favourite_genre'})

# Attach the favourite genre to every rating of the user
df_temp = df_temp.merge(favorite_genre[['UserID', 'Favourite_genre']], on='UserID', how='left')


In [47]:
# Result so far: first rows of df_temp including Total_ratings_per_user and Favourite_genre
df_temp.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,Genres,Month,Day,Hour,Release_year,Total_ratings_per_user,Favourite_genre
0,1,1193,5,F,1,10,Drama,12,31,22,1975,53,War
1,1,661,3,F,1,10,Animation|Children's|Musical,12,31,22,1996,53,War
2,1,914,3,F,1,10,Musical|Romance,12,31,22,1964,53,War
3,1,3408,4,F,1,10,Drama,12,31,22,2000,53,War
4,1,2355,5,F,1,10,Animation|Children's|Comedy,1,6,23,1998,53,War


In [None]:
# Reload df_temp from disk (the matching save is kept commented out).
# Note: this replaces the df_temp built by the cells above with the CSV contents.
#df_temp.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/df_temp_features_included.csv', index=False)
df_temp = pd.read_csv(
    filepath_or_buffer="/Users/femke/Documents/Uni/DSS/Thesis/Data/df_temp_features_included.csv"
)


# Encoding features

In [48]:
# Inspect the datatype of every column to decide which features still need encoding
df_temp.dtypes

Unnamed: 0,0
UserID,int64
MovieID,int64
Rating,int64
Gender,object
Age,int64
Occupation,int64
Genres,object
Month,int32
Day,int32
Hour,int32


Need Encoding: Gender, Age (due to it being categorical), Occupation, Genres, Favourite_genre

In [49]:
# One-hot encode Gender: empty prefix/separator yields columns 'F' and 'M',
# which are then given readable names.
gender_encoded = pd.get_dummies(df_temp, columns=['Gender'], prefix='', prefix_sep='')
df_temp = gender_encoded.rename(columns={'F': 'Female', 'M': 'Male'})

In [None]:
# # Label encode Age
# label_encoder = LabelEncoder()
# df_temp['Age'] = label_encoder.fit_transform(df_temp['Age'])

In [None]:
# # One-Hot encode Occupation
# df_temp = pd.get_dummies(df_temp, columns=['Occupation'], prefix='', prefix_sep='')

In [50]:
# One-Hot encode Genres: one boolean column per genre found in the '|'-separated string
genres_split = df_temp['Genres'].str.get_dummies(sep='|').astype(bool)
df_without_genres = df_temp.drop(columns=['Genres'])
df_temp = pd.concat([df_without_genres, genres_split], axis=1)


In [51]:
# One-Hot encode Favourite Genre into columns named 'Favourite_<genre>'
df_temp = pd.get_dummies(
    df_temp,
    columns=['Favourite_genre'],
    prefix='Favourite',
    prefix_sep='_',
)

In [52]:
# Result so far: lift the column display limit so every encoded column is shown
pd.options.display.max_columns = None
df_temp.head()

Unnamed: 0,UserID,MovieID,Rating,Age,Occupation,Month,Day,Hour,Release_year,Total_ratings_per_user,Female,Male,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Favourite_Action,Favourite_Adventure,Favourite_Animation,Favourite_Children's,Favourite_Comedy,Favourite_Crime,Favourite_Documentary,Favourite_Drama,Favourite_Fantasy,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western
0,1,1193,5,1,10,12,31,22,1975,53,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,1,661,3,1,10,12,31,22,1996,53,True,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,1,914,3,1,10,12,31,22,1964,53,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
3,1,3408,4,1,10,12,31,22,2000,53,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4,1,2355,5,1,10,1,6,23,1998,53,True,False,False,False,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False


In [None]:
# # Save and load df_temp
# df_temp.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/df_final_unscaled.csv', index=False)
# #df_temp = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/df_temp_before_scaling.csv")

In [53]:
# Create a smaller dataset (40% of the rows) for improved efficiency.
# The split is stratified on UserID so every user keeps the same share of ratings,
# and random_state is fixed so re-running the notebook reproduces the same sample.
from sklearn.model_selection import train_test_split
df_sampled, _ = train_test_split(
    df_temp,
    test_size=0.6,
    stratify=df_temp['UserID'],
    random_state=42,
)

In [54]:
# Preview the first rows of the sampled frame
df_sampled.head()

Unnamed: 0,UserID,MovieID,Rating,Age,Occupation,Month,Day,Hour,Release_year,Total_ratings_per_user,Female,Male,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Favourite_Action,Favourite_Adventure,Favourite_Animation,Favourite_Children's,Favourite_Comedy,Favourite_Crime,Favourite_Documentary,Favourite_Drama,Favourite_Fantasy,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western
955177,5763,474,5,25,1,5,20,14,1993,677,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
459353,2828,2188,4,18,4,10,26,21,1998,58,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
729712,4363,589,5,35,14,8,2,3,1991,192,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
841564,5054,3386,3,35,2,8,1,17,1991,715,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
474146,2909,2792,2,25,7,10,19,16,1982,1258,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [None]:
# # Save and load df_sampled (before scaling)
# df_sampled.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/df_sampled_unscaled.csv', index=False)
# #df_sampled_unscaled = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/df_sampled_unscaled.csv")

In [55]:
# Save df_sampled (before scaling) to Google Drive, without the index column
df_sampled.to_csv(
    path_or_buf='/content/drive/My Drive/Thesis/Data/sample_df_3010.csv',
    index=False,
)

# Final check for missing values

In [None]:
# Check for missing values: number of null entries per column
# NOTE(review): `df_final` is not assigned by any cell above this one; in this notebook it is
# only created by a read_csv in the "OLD CODE" section — confirm which frame should be checked.
missing_values_null = df_final.isnull().sum()
print(missing_values_null)

UserID                0
MovieID               0
Rating                0
Year                  0
Month                 0
                     ..
Favourite_Romance     0
Favourite_Sci-Fi      0
Favourite_Thriller    0
Favourite_War         0
Favourite_Western     0
Length: 73, dtype: int64


In [None]:
# Check for missing values (NaN or None): number of missing entries per column
# NOTE(review): DataFrame.isna is an alias of DataFrame.isnull, so this yields the same counts
# as an isnull().sum() check on the same frame. `df_final` is not assigned by any cell above
# this one — confirm which frame should be checked.
missing_values_NaN_None = df_final.isna().sum()
print(missing_values_NaN_None)

UserID                0
MovieID               0
Rating                0
Year                  0
Month                 0
                     ..
Favourite_Romance     0
Favourite_Sci-Fi      0
Favourite_Thriller    0
Favourite_War         0
Favourite_Western     0
Length: 73, dtype: int64


No missing values were found. Note, however, that not every user rated every movie; those unobserved user-movie combinations could be considered missing values.

# Saving the file

In [None]:
# Save df_final for model input - Feature engineered, encoded, scaled
# NOTE(review): no cell above assigns `df_final`, and no scaling step is visible in this
# notebook — confirm which frame is meant to be written to df_final_scaled.csv.
df_final.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/df_final_scaled.csv', index=False)

# OLD CODE


In [None]:
# Save file with time component, movie dummy
# NOTE(review): `df_expanded_movies` is only built in commented-out code or loaded from this
# same CSV further down; running this cell on a fresh kernel presumably fails — confirm.
df_expanded_movies.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_expanded_movies.csv', index=False)

In [None]:
# #OLD WAY TO GET TO df_expanded_movies_category and explore the movies dataset

# # Splitting the release year from the title for movies dataset
# movies['Release_year'] = movies.Title.str.extract("\((\d{4})\)", expand=True).astype(str)
# movies['Release_year'] = pd.to_datetime(movies.Release_year, format='%Y')
# movies['Release_year'] = movies.Release_year.dt.year
# movies['Title'] = movies.Title.str[:-7]

# # Separating the genres and creating the dummies
# # Creating all unique genres
# genres_unique = pd.DataFrame(movies.Genres.str.split('|').tolist()).stack().unique()
# genres_unique = pd.DataFrame(genres_unique, columns=['Genre'])

# # Remove old Genres column and get dummies
# movies = movies.join(movies.Genres.str.get_dummies().astype(bool))
# movies.drop('Genres', inplace=True, axis=1)

# # Merge expanded movie dataset (with dummies) with ratings and user dataset
# df_expanded_movies = pd.merge(ratings_plus_users, movies, on='MovieID')

# # Creating time components in df_expanded_movies
# df_expanded_movies['Timestamp'] = df_expanded_movies['Timestamp'].apply(datetime.fromtimestamp)
# df_expanded_movies['Year'] = df_expanded_movies['Timestamp'].dt.year
# df_expanded_movies['Month'] = df_expanded_movies['Timestamp'].dt.month
# df_expanded_movies['Day'] = df_expanded_movies['Timestamp'].dt.day
# df_expanded_movies['Hour'] = df_expanded_movies['Timestamp'].dt.hour

# # Creating df_expanded_movies_category (Occupation and Age in full + movie dummy)

# # Create a copy of the dataframe
# df_expanded_movies_category = df_expanded_movies.copy()

# # Change the names of the categories for the Age feature
# age_mapping = {
#     1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44",
#     45: "45-49", 50: "50-55", 56: "56+"
# }
# df_expanded_movies_category['Age'] = df_expanded_movies_category['Age'].map(age_mapping)

# # Change the names of the categories for the Occupation feature
# occupation_mapping = {
#     0: "other or not specified", 1: "academic/educator", 2: "artist",
#     3: "clerical/admin", 4: "college/grad student", 5: "customer service",
#     6: "doctor/health care", 7: "executive/managerial", 8: "farmer",
#     9: "homemaker", 10: "K-12 student", 11: "lawyer", 12: "programmer",
#     13: "retired", 14: "sales/marketing", 15: "scientist", 16: "self-employed",
#     17: "technician/engineer", 18: "tradesman/craftsman", 19: "unemployed",
#     20: "writer"
# }
# df_expanded_movies_category['Occupation'] = df_expanded_movies_category['Occupation'].map(occupation_mapping)

# # Sorting df_expanded_movies_category
# desired_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp', 'Year', 'Month', 'Day', 'Hour','Gender', 'Age', 'Occupation', 'Zip-code',
#                    'Title', 'Release_year', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
#                    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
#                    'Sci-Fi', 'Thriller', 'War', 'Western']

# # Now, we can reorder the dataframe and drop the 'Timestamp' column
# df_expanded_movies_category = df_expanded_movies_category[desired_columns]

In [None]:
# Build df_category: a copy of df in which the Age and Occupation codes are replaced
# by their full category names (for EDA purposes).

# Age code -> age group label
age_mapping = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Occupation labels listed in code order; the position in the list is the code (0-20)
occupation_labels = [
    "Other or not specified", "Academic/educator", "Artist",
    "Clerical/admin", "College/grad student", "Customer service",
    "Doctor/health care", "Executive/managerial", "Farmer",
    "Homemaker", "K-12 student", "Lawyer", "Programmer",
    "Retired", "Sales/marketing", "Scientist", "Self-employed",
    "Technician/engineer", "Tradesman/craftsman", "Unemployed",
    "Writer",
]
occupation_mapping = dict(enumerate(occupation_labels))

# Copy first so df keeps its numeric codes, then translate both columns
df_category = df.copy()
for column, labels in (('Age', age_mapping), ('Occupation', occupation_mapping)):
    df_category[column] = df_category[column].map(labels)

In [None]:
# Split the release year from the title and order the columns of df_category.
# Titles end in " (YYYY)": the raw-string pattern captures the four digits of that
# trailing group (a non-raw "\(" / "\d" is an invalid escape sequence in a normal
# string literal), and the captured text is converted straight to an integer instead
# of going through a datetime round trip.
title_year = df_category['Title'].str.extract(r"\((\d{4})\)\s*$", expand=False)
df_category['Release_year'] = title_year.astype('int32')
# Drop the trailing " (YYYY)" (7 characters) so only the title text remains
df_category['Title'] = df_category['Title'].str[:-7]

column_order = ['UserID', 'MovieID', 'Rating', 'Year', 'Month', 'Day', 'Hour','Gender', 'Age', 'Occupation',
                'Title', 'Release_year', 'Genres']
# The 'Year' column is only present in older versions of the data (its creation is
# commented out in the feature-engineering cell), so select the listed columns that
# actually exist instead of failing with a KeyError.
df_category = df_category[[col for col in column_order if col in df_category.columns]]

In [None]:
# Genre dummies for df_temp: one boolean column per genre replaces the Genres string
genres_split_2 = df_temp['Genres'].str.get_dummies(sep='|').astype(bool)
df_temp_without_genres = df_temp.drop(columns=['Genres'])
df_temp = pd.concat([df_temp_without_genres, genres_split_2], axis=1)

# Genre dummies for df_category -> df_expanded_movies_category
# (kept as a separate dataframe so df_category stays available for EDA purposes)
genres_split = df_category['Genres'].str.get_dummies(sep='|').astype(bool)
df_category_without_genres = df_category.drop(columns=['Genres'])
df_expanded_movies_category = pd.concat([df_category_without_genres, genres_split], axis=1)

In [None]:
# One-Hot encode Genres for the variant in which each Genres entry is first
# joined with "|" into a single string.
df_temp['Genres'] = df_temp['Genres'].map('|'.join)

# Split the joined string into one 0/1 column per genre
genres_split = df_temp['Genres'].str.get_dummies(sep='|')

# Replace the Genres column by the dummy columns
df_temp_without_genres = df_temp.drop(columns=['Genres'])
df_temp = pd.concat([df_temp_without_genres, genres_split], axis=1)

In [None]:
# Saving all dataframes as CSV files (index column not written)
# NOTE(review): `df_final` is not assigned by any cell above this one, so the last
# statement presumably fails unless the frames were loaded from disk first — confirm.

# Save file with time component
df.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df.csv', index=False)

# Save file with time component, categories renamed
df_category.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_category.csv', index=False)

# Save file with time component, categories renamed, movie dummy
df_expanded_movies_category.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_expanded_movies_category.csv', index=False)

# Save file with time component, categories renamed, movie dummy, ordered right
df_final.to_csv('/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_final.csv', index=False)

In [None]:
# Reload the previously saved dataframes from their CSV files.
# Each statement overwrites the in-memory variable of the same name.

# Load file with time component
df = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df.csv")

# Load file with time component, categories renamed
df_category = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_category.csv")

# Load file with time component, movie dummy
df_expanded_movies = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_expanded_movies.csv")

# Load file with time component, categories renamed, movie dummy
df_expanded_movies_category = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_expanded_movies_category.csv")

# Load FINAL file with time component, categories renamed, movie dummy, ordered right
df_final = pd.read_csv("/Users/femke/Documents/Uni/DSS/Thesis/Data/ml1m_df_final.csv")

# 30 OCTOBER: TRY-OUT OF ARTICLE

In [1]:
# Google Colab version: mount Google Drive at /content/drive so the
# '/content/drive/My Drive/...' paths used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Locations of the raw ml-1m files on the mounted Google Drive
ratings_file = '/content/drive/My Drive/Thesis/Data/ml-1m/ratings.dat'
users_file = '/content/drive/My Drive/Thesis/Data/ml-1m/users.dat'
movies_file = '/content/drive/My Drive/Thesis/Data/ml-1m/movies.dat'

# Reader options shared by all three files: '::' separated, no header row.
# A multi-character separator needs the python parsing engine.
dat_options = {'sep': '::', 'header': None, 'engine': 'python'}

# Load datasets
ratings = pd.read_csv(
    ratings_file,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_options,
)
users = pd.read_csv(
    users_file,
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **dat_options,
)
movies = pd.read_csv(
    movies_file,
    names=['MovieID', 'Title', 'Genres'],
    encoding='ISO-8859-1',
    **dat_options,
)

In [19]:
# Merge the separate datasets into one frame:
# ratings + user attributes (on UserID), then + movie attributes (on MovieID).
ratings_plus_users = ratings.merge(users, on='UserID')
df = ratings_plus_users.merge(movies, on='MovieID')

In [20]:
# Label encode Age on a scale of 1-7
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# fit_transform yields 0-based codes; shift them by one so the scale starts at 1
age_codes = label_encoder.fit_transform(df['Age'])
df['Age'] = age_codes + 1

In [21]:
# Genre names in the order of their numeric code (the first name gets code 1)
_genre_names = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Genre name -> numeric code (1-18)
genre_mapping = {name: code for code, name in enumerate(_genre_names, start=1)}


def convert_genres_to_numeric(genre_string):
    """Translate a '|'-separated genre string into the list of its numeric codes.

    Parameters
    ----------
    genre_string : str
        Genres joined by '|', e.g. "Animation|Children's|Musical".

    Returns
    -------
    list of int
        The genre_mapping code of every genre, in the order they appear.

    Raises
    ------
    KeyError
        If a genre is not a key of genre_mapping.
    """
    return list(map(genre_mapping.__getitem__, genre_string.split('|')))

# Convert every 'Genres' string into its list of numeric genre codes
df['Genres_Numeric'] = df['Genres'].map(convert_genres_to_numeric)



In [22]:
# Create time component features (Month, Day, Hour) from the rating timestamp.
#
# The timestamps are Unix epoch seconds. pd.to_datetime(..., unit='s') interprets them
# as UTC, so the derived features no longer depend on the time zone of the machine that
# runs the notebook (datetime.fromtimestamp converts to *local* time), the conversion
# is vectorised, and this cell no longer needs the `datetime` import.
rating_time = pd.to_datetime(df['Timestamp'], unit='s')
df['Month'] = rating_time.dt.month
df['Day'] = rating_time.dt.day
df['Hour'] = rating_time.dt.hour

# Removing the unnecessary features (Zip-code and the raw timestamp)
df = df.drop(columns=["Timestamp", "Zip-code"])

In [23]:
# Create the Release_year feature of df and strip the year from the title.
# Titles end in " (YYYY)": the raw-string pattern captures the four digits of that
# trailing group (a non-raw "\(" / "\d" is an invalid escape sequence in a normal
# string literal), and the captured text is converted straight to an integer instead
# of going through a datetime round trip.
release_year = df['Title'].str.extract(r"\((\d{4})\)\s*$", expand=False)
df['Release_year'] = release_year.astype('int32')
# Drop the trailing " (YYYY)" (7 characters) so only the title text remains
df['Title'] = df['Title'].str[:-7]

In [32]:
# One-hot encode Gender: empty prefix/separator yields columns 'F' and 'M',
# which are then given readable names.
gender_encoded = pd.get_dummies(df, columns=['Gender'], prefix='', prefix_sep='')
df = gender_encoded.rename(columns={'F': 'Female', 'M': 'Male'})

In [25]:
# Drop the Title column from df
df = df.drop(columns=['Title'])

In [33]:
# Preview the first rows of df
df.head()

Unnamed: 0,UserID,MovieID,Rating,Age,Occupation,Genres,Genres_Numeric,Month,Day,Hour,Release_year,Female,Male
0,1,1193,5,1,10,Drama,[8],12,31,22,1975,True,False
1,1,661,3,1,10,Animation|Children's|Musical,"[3, 4, 12]",12,31,22,1996,True,False
2,1,914,3,1,10,Musical|Romance,"[12, 14]",12,31,22,1964,True,False
3,1,3408,4,1,10,Drama,[8],12,31,22,2000,True,False
4,1,2355,5,1,10,Animation|Children's|Comedy,"[3, 4, 5]",1,6,23,1998,True,False


In [28]:
# Create a smaller dataset (40% of the rows) for improved efficiency.
# The split is stratified on Rating so the sample keeps the rating distribution,
# and random_state is fixed so re-running the notebook reproduces the same sample.
from sklearn.model_selection import train_test_split
df_sampled, _ = train_test_split(
    df,
    test_size=0.6,
    stratify=df['Rating'],
    random_state=42,
)

In [37]:
# Inspect the datatype of every column of df
df.dtypes

Unnamed: 0,0
UserID,int64
MovieID,int64
Rating,int64
Age,int64
Occupation,int64
Genres_Numeric,object
Month,int32
Day,int32
Hour,int32
Release_year,int32


In [36]:
# One-Hot encode Genres: one boolean column per genre found in the '|'-separated string
genres_split = df['Genres'].str.get_dummies(sep='|').astype(bool)
df_without_genres = df.drop(columns=['Genres'])
df = pd.concat([df_without_genres, genres_split], axis=1)