# 1. Download ML-1M dataset

In [2]:
from io import StringIO, BytesIO
import zipfile
import requests
import io

In [3]:
# Official GroupLens archive for MovieLens-1M. Fetched over HTTPS so the zip
# that gets extracted to disk cannot be altered in transit.
url = 'https://files.grouplens.org/datasets/movielens/ml-1m.zip'

In [4]:
# Fetch the archive; the timeout keeps a stalled connection from hanging the kernel.
request = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of handing an error page to ZipFile.
request.raise_for_status()

# Unpack straight from memory into ./data; the context manager closes the
# archive once extraction is done.
with zipfile.ZipFile(BytesIO(request.content)) as zipDocument:
    zipDocument.extractall('./data')

# 2. Preprocess Dataset

In [33]:
import numpy as np
import pandas as pd
import os

In [19]:
# ratings.dat has no header row and uses the two-character separator '::',
# which requires the python parsing engine. Columns are therefore positional.
data_path = './data/ml-1m/ratings.dat'
df = pd.read_csv(
    data_path,
    sep='::',
    engine='python',
    header=None,
)

In [26]:
# Inspect the ratings frame; columns are positional (0..3) because the file is read with header=None.
df

Unnamed: 0,0,1,2,3
0,1,3186,4,978300019
1,1,1270,5,978300055
2,1,1721,4,978300055
3,1,1022,5,978300055
4,1,2340,3,978300103
...,...,...,...,...
1000204,6040,2917,4,997454429
1000205,6040,1921,4,997454464
1000206,6040,1784,3,997454464
1000207,6040,161,3,997454486


## 2.0 Sorting by user, then timestamp

In [21]:
# Order rows by column 0 first and by column 3 within it (both ascending), then
# renumber the index 0..n-1. The result is rebound rather than mutated with
# inplace=True, so the cell is one chained expression producing a new frame.
df = (
    df
    .sort_values([0, 3], ascending=[True, True])
    .reset_index(drop=True)
)

## 2.1 Re-index the user and item ids

In [16]:
def re_index(s):
    """Map each distinct key in ``s`` to a dense 0-based index.

    Indices follow first-appearance order, so the returned values always
    cover ``0 .. n_distinct - 1`` without gaps, even if ``s`` contains
    repeated keys.

    Args:
        s: Iterable of hashable keys.

    Returns:
        dict mapping each key to its integer index.
    """
    s_map = {}

    for key in s:
        # Number a key only the first time it is seen: a repeat must neither
        # overwrite the existing index nor use up a new one.
        if key not in s_map:
            s_map[key] = len(s_map)

    return s_map

In [22]:
# Distinct raw ids in order of first appearance, then their dense replacements.
user_list, item_list = df[0].unique(), df[1].unique()

user_map, item_map = re_index(user_list), re_index(item_list)

In [23]:
# Start from an empty frame; the re-indexed 'user' and 'item' columns are attached to it afterwards.
new_df = pd.DataFrame()

In [24]:
# Translate raw ids to their dense indices. A plain dict lookup is used per
# value, so an id missing from the map raises KeyError.
new_df['user'] = df[0].map(user_map.__getitem__)
new_df['item'] = df[1].map(item_map.__getitem__)

In [25]:
# Inspect the re-indexed (user, item) pairs.
new_df

Unnamed: 0,user,item
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
1000204,6039,1248
1000205,6039,370
1000206,6039,89
1000207,6039,464


## 2.2 Split data into training and test sets (leave-one-out: each user's last item is held out)

In [27]:
# Re-indexed ids start at 0, so the largest id + 1 gives the number of ids.
# max() is taken on the column directly: de-duplicating first cannot change
# the maximum and only adds an extra pass over the data.
num_user = new_df['user'].max() + 1
num_item = new_df['item'].max() + 1

num_user, num_item

(6040, 3706)

In [29]:
# user id -> list of that user's items, in the row order of new_df.
u_dict = (
    new_df
    .groupby('user')['item']
    .apply(list)
    .to_dict()
)

In [31]:
# Parallel (user, item) lists for each split.
train_u, train_i = [], []
test_u, test_i = [], []

for key, value in u_dict.items():
    # Everything except the final item is training data for this user ...
    history, held_out = value[:-1], value[-1]
    train_u += [key] * len(history)
    train_i += history

    # ... and the final item is the single test interaction.
    test_u.append(key)
    test_i.append(held_out)

In [32]:
# Assemble each split from its parallel user/item lists.
train_df = pd.DataFrame({'user': train_u, 'item': train_i})

test_df = pd.DataFrame({'user': test_u, 'item': test_i})

## 2.3 Save

In [34]:
# Output directory for the processed splits. exist_ok=True makes the call a
# no-op when the directory is already there, so no separate existence check
# (and no gap between checking and creating) is needed.
processed_data_path = './processed_data/ml-1m/'
os.makedirs(processed_data_path, exist_ok=True)

In [35]:
# Write both splits as header-ed CSV files without the index column.
for frame, filename in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    frame.to_csv(os.path.join(processed_data_path, filename), index=False)