# Recommender Systems

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import chardet
%config Completer.use_jedi = False

## Loading data

The data files do not all use the same text encoding, so we detect each file's encoding before reading it to make sure the contents are decoded correctly.

In [1]:
def get_file_encoding(file_path):
    """
    Detect the text encoding of a file.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    :param file_path: Path of the file whose encoding should be detected
    :return: The ``'encoding'`` entry of chardet's detection result
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']

In [3]:
# Read the ratings file. Fields are separated by "::"; a separator longer than
# one character is treated as a regular expression and needs the python engine.
ratings_path = "./data/ratings.dat"
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    header=None,
    engine="python",
    encoding=get_file_encoding(ratings_path),
).rename(
    # The file has no header row, so map the positional labels to column names
    columns={
        0: "UserID",
        1: "MovieID",
        2: "Rating",
        3: "Timestamp",
    }
)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Read the movies file. Fields are separated by "::"; a separator longer than
# one character is treated as a regular expression and needs the python engine.
movies_path = "./data/movies.dat"
movies = pd.read_csv(
    movies_path,
    sep="::",
    header=None,
    engine="python",
    encoding=get_file_encoding(movies_path),
).rename(
    # The file has no header row, so map the positional labels to column names
    columns={
        0: "MovieID",
        1: "Title",
        2: "Genres",
    }
)

movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Read the users file. Fields are separated by "::"; a separator longer than
# one character is treated as a regular expression and needs the python engine.
users_path = "./data/users.dat"
users = pd.read_csv(
    users_path,
    sep="::",
    header=None,
    engine="python",
    encoding=get_file_encoding(users_path),
).rename(
    # The file has no header row, so map the positional labels to column names
    columns={
        0: "UserID",
        1: "Gender",
        2: "Age",
        3: "Occupation",
        4: "Zip-code",
    }
)

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Pre-processing Data

### Preparing Users

In [6]:
# One-hot encode Gender
encoder = OneHotEncoder(sparse_output=False)

# Fit on the single Gender column; with sparse_output=False the result is a
# dense array with one column per gender category
encoded_gender = encoder.fit_transform(users[['Gender']])

# Reuse the index of `users`: pd.concat(axis=1) aligns rows by index label, so
# a frame with a fresh default index would misalign (and introduce NaN rows)
# if `users` were ever filtered or re-indexed before this cell
encoded_gender_df = pd.DataFrame(
    encoded_gender,
    columns=encoder.get_feature_names_out(['Gender']),
    index=users.index,
)

# Replace the raw Gender column with its one-hot columns (appended at the end).
# drop() without inplace returns a new frame instead of mutating the existing one.
users = pd.concat([users.drop(columns=['Gender']), encoded_gender_df], axis=1)

In [7]:
# Label-encode Zip-code: each distinct zip code is replaced by an integer class index
le = LabelEncoder()

# Learn the set of distinct zip codes first; the fitted encoder stays available as `le`
le.fit(users['Zip-code'])

# Overwrite the column with the encoded values
users['Zip-code'] = le.transform(users['Zip-code'])

In [8]:
# Preview the first rows of the users table after the encoding steps
users.head()

Unnamed: 0,UserID,Age,Occupation,Zip-code,Gender_F,Gender_M
0,1,1,10,1588,1.0,0.0
1,2,56,16,2248,0.0,1.0
2,3,25,15,1863,0.0,1.0
3,4,45,7,140,0.0,1.0
4,5,25,20,1938,0.0,1.0


In [11]:
# Summarise the column dtypes and non-null counts of the processed users table
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   UserID      6040 non-null   int64  
 1   Age         6040 non-null   int64  
 2   Occupation  6040 non-null   int64  
 3   Zip-code    6040 non-null   int64  
 4   Gender_F    6040 non-null   float64
 5   Gender_M    6040 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 283.3 KB
