In [1]:
import pandas as pd
from datetime import datetime

In [4]:
# Load the previously merged datasets from disk.
# NOTE: these CSV files are the ones written by the "Saving all dataframes"
# cell further down, so this cell only works once that cell has been run.

# Single place to change when the data folder moves
DATA_DIR = "/Users/femke/Documents/Uni/DSS/Thesis/Data"

# File with time component
df = pd.read_csv(f"{DATA_DIR}/ml1m_df.csv")

# File with time component, categories renamed
df_category = pd.read_csv(f"{DATA_DIR}/ml1m_df_category.csv")

# File with time component, movie dummy
df_expanded_movies = pd.read_csv(f"{DATA_DIR}/ml1m_df_expanded_movies.csv")

# File with time component, categories renamed, movie dummy
df_expanded_movies_category = pd.read_csv(f"{DATA_DIR}/ml1m_df_expanded_movies_category.csv")

# FINAL file with time component, categories renamed, movie dummy, ordered right
df_final = pd.read_csv(f"{DATA_DIR}/ml1m_df_final.csv")

# Loading in the data

In [33]:
# Loading the separate raw datasets (ratings, users, movies)

# Define file paths relative to a single data folder
DATA_DIR = "/Users/femke/Documents/Uni/DSS/Thesis/Data"
ratings_file = f'{DATA_DIR}/ml-1m/ratings.dat'
users_file = f'{DATA_DIR}/ml-1m/users.dat'
movies_file = f'{DATA_DIR}/ml-1m/movies.dat'

# Options shared by all three reads: fields are separated by '::' and the
# files have no header row. A multi-character separator needs the python engine.
raw_read_options = dict(sep='::', header=None, engine='python')

# Load datasets
ratings = pd.read_csv(ratings_file, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], **raw_read_options)
users = pd.read_csv(users_file, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], **raw_read_options)
# The movies file is read as ISO-8859-1 rather than the default UTF-8
movies = pd.read_csv(movies_file, names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1', **raw_read_options)

In [51]:
# Merging the separate datasets
# Step 1: attach the user attributes to every rating (joined on UserID)
ratings_plus_users = ratings.merge(users, on='UserID')

# Step 2: attach the movie attributes to the result (joined on MovieID)
df = ratings_plus_users.merge(movies, on='MovieID')

# First data handling

In [52]:
# Splitting 'Timestamp' (epoch seconds) into calendar components.
# The conversion uses an explicit time zone, so Year/Month/Day/Hour no longer
# depend on the local zone of the machine that runs the notebook, and it is
# vectorised instead of calling a Python function once per row.
# NOTE(review): the previous code used the machine's local zone;
# "Europe/Amsterdam" is an assumption - confirm it reproduces the stored outputs.
TIMESTAMP_TZ = "Europe/Amsterdam"

df['Timestamp'] = (
    pd.to_datetime(df['Timestamp'], unit='s', utc=True)
    .dt.tz_convert(TIMESTAMP_TZ)
    .dt.tz_localize(None)  # back to naive wall-clock time, as before
)
df['Year'] = df['Timestamp'].dt.year
df['Month'] = df['Timestamp'].dt.month
df['Day'] = df['Timestamp'].dt.day
df['Hour'] = df['Timestamp'].dt.hour

# Removing the unnecessary features (Zip-code and Timestamp)
df = df.drop(columns=["Timestamp", "Zip-code"])

In [54]:
# Creating df_category: a copy of df with Age and Occupation written out in full

# Readable labels for the Age codes
age_mapping = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Readable labels for the Occupation codes
occupation_mapping = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}

# assign() returns a new frame, so df itself keeps its numeric codes
df_category = df.assign(
    Age=df['Age'].map(age_mapping),
    Occupation=df['Occupation'].map(occupation_mapping),
)

In [55]:
# Splitting the release year from the title and sorting the columns.
# The pattern is a raw string (the old literal relied on invalid escape
# sequences) and is anchored to the end of the title, so the year that is
# extracted is the same "(YYYY)" suffix that the slice below removes.
# int32 matches the dtype that the former datetime round trip produced.
df_category['Release_year'] = (
    df_category['Title'].str.extract(r"\((\d{4})\)$", expand=False).astype('int32')
)
# Strip the trailing " (YYYY)" (7 characters) from the title
df_category['Title'] = df_category['Title'].str[:-7]

column_order = ['UserID', 'MovieID', 'Rating', 'Year', 'Month', 'Day', 'Hour','Gender', 'Age', 'Occupation', 
                'Title', 'Release_year', 'Genres']
df_category = df_category[column_order]

In [56]:
# Preview the first rows to check the release-year split and the column order
df_category.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Gender,Age,Occupation,Title,Release_year,Genres
0,1,1193,5,2000,12,31,23,F,Under 18,K-12 student,One Flew Over the Cuckoo's Nest,1975,Drama
1,2,1193,5,2000,12,31,22,M,56+,self-employed,One Flew Over the Cuckoo's Nest,1975,Drama
2,12,1193,4,2000,12,31,0,M,25-34,programmer,One Flew Over the Cuckoo's Nest,1975,Drama
3,15,1193,4,2000,12,30,19,M,25-34,executive/managerial,One Flew Over the Cuckoo's Nest,1975,Drama
4,17,1193,5,2000,12,30,7,M,50-55,academic/educator,One Flew Over the Cuckoo's Nest,1975,Drama


# Movies dataset exploration

In [62]:
# Show df_category again as the starting point for the movies exploration
df_category.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Gender,Age,Occupation,Title,Release_year,Genres
0,1,1193,5,2000,12,31,23,F,Under 18,K-12 student,One Flew Over the Cuckoo's Nest,1975,Drama
1,2,1193,5,2000,12,31,22,M,56+,self-employed,One Flew Over the Cuckoo's Nest,1975,Drama
2,12,1193,4,2000,12,31,0,M,25-34,programmer,One Flew Over the Cuckoo's Nest,1975,Drama
3,15,1193,4,2000,12,30,19,M,25-34,executive/managerial,One Flew Over the Cuckoo's Nest,1975,Drama
4,17,1193,5,2000,12,30,7,M,50-55,academic/educator,One Flew Over the Cuckoo's Nest,1975,Drama


In [57]:
# Splitting the release year from the title for the movies dataset.
# The pattern is a raw string (the old literal relied on invalid escape
# sequences) and is anchored to the end of the title, so the year that is
# extracted is the same "(YYYY)" suffix that the slice below removes.
# int32 matches the dtype that the former datetime round trip produced.
movies['Release_year'] = (
    movies['Title'].str.extract(r"\((\d{4})\)$", expand=False).astype('int32')
)
# Strip the trailing " (YYYY)" (7 characters) from the title
movies['Title'] = movies['Title'].str[:-7]

In [59]:
# Separating the genres and creating the dummies

# All unique genres, in order of first appearance
all_genres = movies['Genres'].str.split('|', expand=True).stack().unique()
genres_unique = pd.DataFrame({'Genre': all_genres})

# One boolean column per genre, then remove the old Genres column
genre_flags = movies['Genres'].str.get_dummies().astype(bool)
movies = movies.join(genre_flags).drop(columns='Genres')

In [8]:
# Join the expanded movies table (genre dummies) onto the ratings + users table
df_expanded_movies = ratings_plus_users.merge(movies, on='MovieID')

In [None]:
# Creating time components in df_expanded_movies from 'Timestamp' (epoch seconds).
# The conversion uses an explicit time zone, so Year/Month/Day/Hour no longer
# depend on the local zone of the machine that runs the notebook, and it is
# vectorised instead of calling a Python function once per row.
# NOTE(review): the previous code used the machine's local zone;
# "Europe/Amsterdam" is an assumption - confirm it reproduces the stored outputs.
TIMESTAMP_TZ = "Europe/Amsterdam"

df_expanded_movies['Timestamp'] = (
    pd.to_datetime(df_expanded_movies['Timestamp'], unit='s', utc=True)
    .dt.tz_convert(TIMESTAMP_TZ)
    .dt.tz_localize(None)  # back to naive wall-clock time, as before
)
df_expanded_movies['Year'] = df_expanded_movies['Timestamp'].dt.year
df_expanded_movies['Month'] = df_expanded_movies['Timestamp'].dt.month
df_expanded_movies['Day'] = df_expanded_movies['Timestamp'].dt.day
df_expanded_movies['Hour'] = df_expanded_movies['Timestamp'].dt.hour

In [9]:
# Creating df_expanded_movies_category: df_expanded_movies with Age and
# Occupation written out in full (genre dummies are kept)

# Readable labels for the Age codes
age_mapping = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Readable labels for the Occupation codes
occupation_mapping = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}

# assign() returns a new frame, so df_expanded_movies keeps its numeric codes
df_expanded_movies_category = df_expanded_movies.assign(
    Age=df_expanded_movies['Age'].map(age_mapping),
    Occupation=df_expanded_movies['Occupation'].map(occupation_mapping),
)

In [11]:
# Putting the columns of df_expanded_movies_category in a fixed order
desired_columns = [
    'UserID', 'MovieID', 'Rating', 'Timestamp', 'Year', 'Month', 'Day', 'Hour',
    'Gender', 'Age', 'Occupation', 'Zip-code', 'Title', 'Release_year',
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Keep exactly the listed columns, in this order.
# NOTE(review): an earlier comment here said 'Timestamp' is dropped, but the
# list keeps both 'Timestamp' and 'Zip-code' - confirm which one is intended.
df_expanded_movies_category = df_expanded_movies_category.loc[:, desired_columns]

# Preparing final dataframe for use in model 

In [12]:
# Inspect the column dtypes before building the model-ready dataframe
df_expanded_movies.dtypes

UserID                   int64
MovieID                  int64
Rating                   int64
Timestamp       datetime64[ns]
Gender                  object
Age                      int64
Occupation               int64
Zip-code                object
Title                   object
Release_year             int32
Action                    bool
Adventure                 bool
Animation                 bool
Children's                bool
Comedy                    bool
Crime                     bool
Documentary               bool
Drama                     bool
Fantasy                   bool
Film-Noir                 bool
Horror                    bool
Musical                   bool
Mystery                   bool
Romance                   bool
Sci-Fi                    bool
Thriller                  bool
War                       bool
Western                   bool
Year                     int32
Month                    int32
Day                      int32
Hour                     int32
dtype: o

In [13]:
# Creating a temporary df: df_expanded_movies with Occupation written out in
# full. The labels are capitalised here, unlike the lower-case labels used for
# the *_category frames.

# Readable (capitalised) labels for the Occupation codes
occupation_mapping = {
    0: "Other or not specified",
    1: "Academic/educator",
    2: "Artist",
    3: "Clerical/admin",
    4: "College/grad student",
    5: "Customer service",
    6: "Doctor/health care",
    7: "Executive/managerial",
    8: "Farmer",
    9: "Homemaker",
    10: "K-12 student",
    11: "Lawyer",
    12: "Programmer",
    13: "Retired",
    14: "Sales/marketing",
    15: "Scientist",
    16: "Self-employed",
    17: "Technician/engineer",
    18: "Tradesman/craftsman",
    19: "Unemployed",
    20: "Writer",
}

# assign() returns a new frame, so df_expanded_movies keeps its numeric codes
df_temp = df_expanded_movies.assign(
    Occupation=df_expanded_movies['Occupation'].map(occupation_mapping)
)

In [14]:
# One-hot encode Gender: drop_first leaves a single 'Gender_M' indicator,
# which is renamed to 'Male'
df_temp = (
    pd.get_dummies(df_temp, columns=['Gender'], drop_first=True)
    .rename(columns={'Gender_M': 'Male'})
)

In [16]:
# Putting the columns of df_temp in a fixed order; columns that are not listed
# (such as 'Timestamp') are left out
desired_columns = [
    'UserID', 'MovieID', 'Rating', 'Year', 'Month', 'Day', 'Hour',
    'Male', 'Age', 'Occupation', 'Zip-code', 'Title', 'Release_year',
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Reorganize the temporary dataset
df_temp = df_temp.loc[:, desired_columns]

In [17]:
# One-hot encode Occupation (drop_first omits one category as the baseline)
# and strip the 'Occupation_' prefix from the resulting column names
df_temp = (
    pd.get_dummies(df_temp, columns=['Occupation'], drop_first=True)
    .rename(columns=lambda name: name.replace('Occupation_', ''))
)

In [18]:
# Zip-code is not used in the final dataframe, so drop it
df_temp = df_temp.drop(columns=['Zip-code'])

In [19]:
# df_final is an independent (deep) copy of the fully encoded df_temp
df_final = df_temp.copy(deep=True)

In [22]:
# Show every column of the wide frame (this display setting stays active for
# the rest of the session), then preview the first rows
pd.options.display.max_columns = None
df_final.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Male,Age,Title,Release_year,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Artist,Clerical/admin,College/grad student,Customer service,Doctor/health care,Executive/managerial,Farmer,Homemaker,K-12 student,Lawyer,Other or not specified,Programmer,Retired,Sales/marketing,Scientist,Self-employed,Technician/engineer,Tradesman/craftsman,Unemployed,Writer
0,1,1193,5,2000,12,31,23,False,1,One Flew Over the Cuckoo's Nest,1975,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
1,2,1193,5,2000,12,31,22,True,56,One Flew Over the Cuckoo's Nest,1975,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
2,12,1193,4,2000,12,31,0,True,25,One Flew Over the Cuckoo's Nest,1975,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,15,1193,4,2000,12,30,19,True,25,One Flew Over the Cuckoo's Nest,1975,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,17,1193,5,2000,12,30,7,True,50,One Flew Over the Cuckoo's Nest,1975,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# Feature engineering

In [25]:
# Preview df_category before feature engineering.
# NOTE(review): the stored output of this cell still shows 'Timestamp',
# 'Zip-code' and titles that include the year, which the cells above remove
# from df_category - the output looks stale; re-run from a fresh kernel.
df_category.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,Year,Month,Day,Hour
0,1,1193,5,2000-12-31 23:12:40,F,Under 18,K-12 student,48067,One Flew Over the Cuckoo's Nest (1975),Drama,2000,12,31,23
1,2,1193,5,2000-12-31 22:33:33,M,56+,self-employed,70072,One Flew Over the Cuckoo's Nest (1975),Drama,2000,12,31,22
2,12,1193,4,2000-12-31 00:49:39,M,25-34,programmer,32793,One Flew Over the Cuckoo's Nest (1975),Drama,2000,12,31,0
3,15,1193,4,2000-12-30 19:01:19,M,25-34,executive/managerial,22903,One Flew Over the Cuckoo's Nest (1975),Drama,2000,12,30,19
4,17,1193,5,2000-12-30 07:41:11,M,50-55,academic/educator,95350,One Flew Over the Cuckoo's Nest (1975),Drama,2000,12,30,7


# Saving the files

In [None]:
# Saving all dataframes as CSV files (without the index column)

# Single place to change when the data folder moves
DATA_DIR = "/Users/femke/Documents/Uni/DSS/Thesis/Data"

# Save file with time component
df.to_csv(f'{DATA_DIR}/ml1m_df.csv', index=False)

# Save file with time component, categories renamed
df_category.to_csv(f'{DATA_DIR}/ml1m_df_category.csv', index=False)

# Save file with time component, movie dummy
df_expanded_movies.to_csv(f'{DATA_DIR}/ml1m_df_expanded_movies.csv', index=False)

# Save file with time component, categories renamed, movie dummy
df_expanded_movies_category.to_csv(f'{DATA_DIR}/ml1m_df_expanded_movies_category.csv', index=False)

# Save file with time component, categories renamed, movie dummy, ordered right
df_final.to_csv(f'{DATA_DIR}/ml1m_df_final.csv', index=False)


In [67]:
# Loading the saved datasets back in under separate *_good names, so the
# in-memory frames built above are not overwritten

# Single place to change when the data folder moves
DATA_DIR = "/Users/femke/Documents/Uni/DSS/Thesis/Data"

# Load file with time component
df_good = pd.read_csv(f"{DATA_DIR}/ml1m_df.csv")

# Load file with time component, categories renamed
df_category_good = pd.read_csv(f"{DATA_DIR}/ml1m_df_category.csv")

# Load file with time component, movie dummy
df_expanded_movies_good = pd.read_csv(f"{DATA_DIR}/ml1m_df_expanded_movies.csv")

# Load file with time component, categories renamed, movie dummy
df_expanded_movies_category_good = pd.read_csv(f"{DATA_DIR}/ml1m_df_expanded_movies_category.csv")
