In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [85]:
dfm = pd.read_csv('/mnt/sda1/projects/rouse_ai/ml-1m/movies.dat', sep='::', header=None, encoding='ISO-8859-1', engine='python',
                  names=['movieID' , 'title' , 'genres_str'])

In [86]:
dfm['year'] = dfm['title'].map(lambda x: x[-5:-1])
dfm['year'] = dfm['year'].astype(int)

In [87]:
genres = ["Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western",]
dict_map_genres = dict(zip(genres, list(range(len(genres)))))

def replace_list_of_string_by_dict(l:list) -> list:
    result = []
    for single_str in l:
        replaced_str = single_str
        for word, replacement in dict_map_genres.items():
            replaced_str = replaced_str.replace(word, str(replacement))
        try:
            replaced_str = int(replaced_str)
        except ValueError:
            pass
        result.append(replaced_str)
    return result

dfm['genres'] = dfm['genres_str'].map(lambda x: x.split('|'))
dfm['genres'] = dfm['genres'].map(lambda x: replace_list_of_string_by_dict(x))    

In [88]:
dfm['year'] = pd.cut(dfm['year'], bins=[1918, 1922, 1927, 1931, 1936, 1940, 1945, 1949, 1954, 1958, 1963,
       1967, 1972, 1976, 1981, 1986, 1988, 1990, 1992, 1994, 1996, 1998, 2001], labels=list(range(22)))
dfm['year'] = pd.to_numeric(dfm['year']).astype(int)

In [89]:
movie_index_by_id = {id: idx for idx, id in enumerate(dfm["movieID"])}
dfm['movieIDX'] = dfm['movieID'].map(movie_index_by_id)
dfm = dfm[['movieIDX', 'year', 'genres']]

In [90]:
dfm

Unnamed: 0,movieIDX,year,genres
0,0,19,"[2, 3, 4]"
1,1,19,"[1, 3, 8]"
2,2,19,"[4, 13]"
3,3,19,"[4, 7]"
4,4,19,[4]
...,...,...,...
3878,3878,21,[4]
3879,3879,21,[7]
3880,3880,21,[7]
3881,3881,21,[7]


In [91]:
dfm.to_pickle('movie_info.pickle')

In [9]:
dfu = pd.read_csv('/mnt/sda1/projects/rouse_ai/ml-1m/users.dat', sep='::', header=None, encoding='ISO-8859-1', engine='python',
                  names=['userID', 'gender', 'age','occupation', 'zipcode'])

In [10]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [11]:
le.fit(dfu['gender'])
dfu['gender'] = le.transform(dfu['gender'])

le.fit(dfu['age'])
dfu['age'] = le.transform(dfu['age'])

In [12]:
dfu.drop(columns='zipcode', inplace=True)

In [13]:
user_index_by_id = {id: idx for idx, id in enumerate(dfu["userID"])}
dfu['userIDX'] = dfu['userID'].map(user_index_by_id)
dfu = dfu[['userIDX', 'gender', 'age', 'occupation']]

In [14]:
dfu

Unnamed: 0,userIDX,gender,age,occupation
0,0,0,0,10
1,1,1,6,16
2,2,1,2,15
3,3,1,4,7
4,4,1,2,20
...,...,...,...,...
6035,6035,0,2,15
6036,6036,0,4,1
6037,6037,0,6,1
6038,6038,0,4,0


In [15]:
dfu.to_pickle('user_info.pickle')

In [17]:
dfr = pd.read_csv('/mnt/sda1/projects/rouse_ai/ml-1m/ratings.dat', sep='::', header=None, names=['userID', 'movieID', 'rating', 'timestamp'], engine='python')
dfr.drop(columns='timestamp', inplace=True)

In [18]:
dfr['movieIDX'] = dfr['movieID'].map(movie_index_by_id)
dfr['userIDX'] = dfr['userID'].map(user_index_by_id)
dfr = dfr[['userIDX', 'movieIDX', 'rating']]

In [19]:
dfr

Unnamed: 0,userIDX,movieIDX,rating
0,0,1176,5
1,0,655,3
2,0,902,3
3,0,3339,4
4,0,2286,5
...,...,...,...
1000204,6039,1075,1
1000205,6039,1078,5
1000206,6039,558,5
1000207,6039,1080,4


In [20]:
dfr.to_pickle('ratings.pickle')

In [81]:
df = pd.concat([pd.merge(dfr[['userID']], dfm, on='userID', how='left'), 
           pd.merge(dfr[['movieID']], dfm, on='movieID', how='left'),
           dfr[['rating']]], axis=1)

In [82]:
df

Unnamed: 0,userID,gender,age,occupation,movieID,year,genres,rating
0,0,F,1,10,1192,13,[7],5
1,0,F,1,10,660,19,"[2, 3, 11]",3
2,0,F,1,10,913,11,"[11, 13]",3
3,0,F,1,10,3407,19,[7],4
4,0,F,1,10,2354,19,"[2, 3, 4]",5
...,...,...,...,...,...,...,...,...
1000204,6039,M,25,6,1090,17,[4],1
1000205,6039,M,25,6,1093,18,"[7, 13, 16]",5
1000206,6039,M,25,6,561,18,"[4, 7]",5
1000207,6039,M,25,6,1095,15,[7],4
