In [1]:
# Import dependency libraries.
import pandas as pd

In [20]:
# The three MovieLens-1M tables are headerless text files whose fields are
# separated by '::'; the multi-character separator is parsed with the Python
# engine. These options are shared by every read below.
dat_options = dict(sep='::', header=None, engine='python')

# Basic user information, data from users.dat
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
user_df = pd.read_csv('./Data/ml-1m/users.dat', names=unames, **dat_options)

# Movie information, data from movies.dat (read as ISO-8859-1 text)
mnames = ['movie_id', 'title', 'genres']
movies_df = pd.read_csv('./Data/ml-1m/movies.dat',
                        names=mnames,
                        encoding='ISO-8859-1',
                        **dat_options)

# Rating information, data from ratings.dat
# NOTE(review): the third column is labelled 'imdbId', but the values shown in
# the previews of this frame are small integers (1-5), which looks like a star
# rating rather than an IMDb id. Confirm, and consider renaming it to 'rating'
# together with every later cell that refers to it.
rnames = ['user_id', 'movie_id', 'imdbId', 'timestamp']
ratings_df = pd.read_csv('./Data/ml-1m/ratings.dat', names=rnames, **dat_options)

In [21]:
# Print a label, then preview the first rows of the users table.
print("user information dataframe")
user_df.head()

user information dataframe


Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [22]:
# Summarise the users table: column dtypes, non-null counts and memory usage.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     6040 non-null   int64 
 1   gender      6040 non-null   object
 2   age         6040 non-null   int64 
 3   occupation  6040 non-null   int64 
 4   zip         6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [23]:
# Print a label, then preview the first rows of the movies table.
print("movie information dataframe")
movies_df.head()

movie information dataframe


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
# Summarise the movies table: column dtypes, non-null counts and memory usage.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [25]:
# Print a label, then preview the first rows of the ratings table.
print("rating information dataframe")
ratings_df.head()

rating information dataframe


Unnamed: 0,user_id,movie_id,imdbId,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [26]:
# Summarise the ratings table: column dtypes, non-null counts and memory usage.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   imdbId     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


Remove the release year from the movie titles: each title loaded from movies.dat ends with the year in parentheses, e.g. "Toy Story (1995)". Strip that suffix with a regular expression.

In [27]:
import re

# Trailing "(digits)" year suffix, together with the whitespace around it.
patter = re.compile(r'\s*\(\d+\)\s*$')

# Map every distinct title to its year-less form.
# `sub` returns the title unchanged when there is no trailing year, so running
# this cell a second time is a no-op instead of failing on `None.group(...)`.
# Matching the surrounding whitespace also removes the space that would
# otherwise be left at the end of the title ("Toy Story " -> "Toy Story").
title = {raw: patter.sub('', raw) for raw in movies_df['title'].unique()}
movies_df['title'] = movies_df['title'].map(title)
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy


Convert the timestamps in ratings.dat to datetimes: the timestamp column stores seconds since the Unix epoch, so convert it with the pd.to_datetime function in Pandas, using unit='s'.



In [28]:
# The column holds epoch values in seconds; reinterpret them as datetimes.
epoch_seconds = ratings_df['timestamp']
ratings_df['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')
ratings_df.head()

Unnamed: 0,user_id,movie_id,imdbId,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


Change a DataFrame column name: rename the 'timestamp' column to 'time' with the pandas.DataFrame.rename function, as shown below.


In [29]:
# Rename the 'timestamp' column to 'time'.
# The result is bound back to the name rather than mutated with inplace=True:
# in-place renaming brings no performance benefit and prevents method chaining.
ratings_df = ratings_df.rename(columns={'timestamp': 'time'})
ratings_df.tail()

Unnamed: 0,user_id,movie_id,imdbId,time
1000204,6040,1091,1,2000-04-26 02:35:41
1000205,6040,1094,5,2000-04-25 23:21:27
1000206,6040,562,5,2000-04-25 23:19:06
1000207,6040,1096,4,2000-04-26 02:20:48
1000208,6040,1097,4,2000-04-26 02:19:29


Change the time format to 'year-month-day':
1. Use the to_datetime function in Pandas to make sure the time column is in datetime format (it already is after the conversion above, so the values are unchanged).
2. Format each value as year-month-day with strftime('%Y-%m-%d') and write the result back to the ratings_df['time'] column.

In [30]:
import datetime  # NOTE(review): not referenced by the cells visible here; confirm before removing

# Helper frame: a copy of the 'time' column plus a 'date' column holding the
# same values passed through pd.to_datetime.
date_df = (
    ratings_df[['time']]
    .copy()
    .assign(date=lambda frame: pd.to_datetime(frame['time']))
)
date_df.tail()


Unnamed: 0,time,date
1000204,2000-04-26 02:35:41,2000-04-26 02:35:41
1000205,2000-04-25 23:21:27,2000-04-25 23:21:27
1000206,2000-04-25 23:19:06,2000-04-25 23:19:06
1000207,2000-04-26 02:20:48,2000-04-26 02:20:48
1000208,2000-04-26 02:19:29,2000-04-26 02:19:29


In [31]:
# Keep only the calendar date, stored as 'YYYY-MM-DD' strings.
# The `.dt.strftime` accessor formats the whole column at once instead of
# calling a Python lambda for each of the ~1M rows, and it passes missing
# values through instead of raising on them.
ratings_df['time'] = date_df['date'].dt.strftime('%Y-%m-%d')
ratings_df.tail()

Unnamed: 0,user_id,movie_id,imdbId,time
1000204,6040,1091,1,2000-04-26
1000205,6040,1094,5,2000-04-25
1000206,6040,562,5,2000-04-25
1000207,6040,1096,4,2000-04-26
1000208,6040,1097,4,2000-04-26
