In [1]:
PATH='/home/kirana/Documents/phd/exp3_autoencoder'

In [2]:
DATAPATH='/home/kirana/Documents/final_dissertation_final/experiments/datasets/ml-1m'

In [3]:
import pandas as pd
import numpy as np
import os
import datetime
import sklearn
import torch

In [6]:
ls {DATAPATH}/*.dat

/home/kirana/Documents/final_dissertation_final/experiments/datasets/ml-1m/movies.dat
/home/kirana/Documents/final_dissertation_final/experiments/datasets/ml-1m/ratings.dat
/home/kirana/Documents/final_dissertation_final/experiments/datasets/ml-1m/users.dat


## Read Files

In [10]:
ratings=pd.read_csv(f'{DATAPATH}/ratings.dat',engine='python',sep='::',names=['userId','itemId','rating','timestamp'])
print (ratings.head())


   userId  itemId  rating  timestamp
0       1    1193       5  978300760
1       1     661       3  978302109
2       1     914       3  978301968
3       1    3408       4  978300275
4       1    2355       5  978824291


In [11]:
ratings.shape, ratings['userId'].nunique(),ratings['itemId'].nunique()

((1000209, 4), 6040, 3706)

In [None]:
users = pandas.read_csv('./ml-1m/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')

movies = pandas.read_csv('./ml-1m/movies.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre']).set_index('movieid')
movies['genre'] = movies.genre.str.split('|')

users.age = users.age.astype('category')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('category')
ratings.userid = ratings.userid.astype('category')

In [12]:
users=pd.read_csv(f'{DATAPATH}/users.dat',engine='python',sep='::',names=['userId','gender','age','occupation','zip'])
print (users.head())

   userId gender  age  occupation    zip
0       1      F    1          10  48067
1       2      M   56          16  70072
2       3      M   25          15  55117
3       4      M   45           7  02460
4       5      M   25          20  55455


In [13]:
items=pd.read_csv(f'{DATAPATH}/movies.dat',engine='python',sep='::',names=['itemId','title','genre'])
print (items.head())

   itemId                               title                         genre
0       1                    Toy Story (1995)   Animation|Children's|Comedy
1       2                      Jumanji (1995)  Adventure|Children's|Fantasy
2       3             Grumpier Old Men (1995)                Comedy|Romance
3       4            Waiting to Exhale (1995)                  Comedy|Drama
4       5  Father of the Bride Part II (1995)                        Comedy


In [14]:
ratings.shape

(1000209, 4)

In [15]:
ratings=pd.merge(ratings,users,on='userId',how='left')

In [16]:
ratings.shape

(1000209, 8)

In [17]:
ratings=pd.merge(ratings,items,on='itemId',how='left')

In [18]:
ratings.shape

(1000209, 10)

In [20]:
ratings.head().T

Unnamed: 0,0,1,2,3,4
userId,1,1,1,1,1
itemId,1193,661,914,3408,2355
rating,5,3,3,4,5
timestamp,978300760,978302109,978301968,978300275,978824291
gender,F,F,F,F,F
age,1,1,1,1,1
occupation,10,10,10,10,10
zip,48067,48067,48067,48067,48067
title,One Flew Over the Cuckoo's Nest (1975),James and the Giant Peach (1996),My Fair Lady (1964),Erin Brockovich (2000),"Bug's Life, A (1998)"
genre,Drama,Animation|Children's|Musical,Musical|Romance,Drama,Animation|Children's|Comedy


## Random Cross-Validation 90-10 split to compare to papers

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
ratings['rating'].value_counts()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: rating, dtype: int64

In [23]:
temptrain,tempvalid=train_test_split(ratings,test_size=0.1,train_size=0.9,random_state=11,shuffle=True)

In [24]:
temptrain['random_dstype']='train'
tempvalid['random_dstype']='test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [25]:
df=pd.concat([temptrain,tempvalid],axis=0)

In [26]:
df.shape

(1000209, 11)

In [27]:
df.head()

Unnamed: 0,userId,itemId,rating,timestamp,gender,age,occupation,zip,title,genre,random_dstype
648724,3908,2347,4,965941231,M,25,0,10021,"Pope of Greenwich Village, The (1984)",Action,train
203881,1254,1541,2,974823280,F,18,1,2144,Addicted to Love (1997),Comedy|Romance,train
243394,1467,2617,3,977173059,M,25,5,97007,"Mummy, The (1999)",Action|Adventure|Horror|Thriller,train
788507,4717,3754,3,963502544,M,45,17,48310,"Adventures of Rocky and Bullwinkle, The (2000)",Animation|Children's|Comedy,train
301066,1789,2997,5,974700402,M,1,10,19116,Being John Malkovich (1999),Comedy,train


## Prepare data

In [28]:
user_to_idx={j:i+1 for i, j in enumerate(df['userId'].unique())}
item_to_idx={j:i+1 for i,j in enumerate(df['itemId'].unique())}
idx_to_user={i+1:j for i, j in enumerate(df['userId'].unique())}
idx_to_item={i+1:j for i,j in enumerate(df['itemId'].unique())}

In [29]:
min(user_to_idx.values()),max(user_to_idx.values())

(1, 6040)

In [30]:
df['user_idx']=[user_to_idx.get(i) for i in df['userId']]
df['item_idx']=[item_to_idx.get(i) for i in df['itemId']]

In [31]:
df['dstype_random_train']=np.where(df['random_dstype']=='train',1,0)

In [32]:
df['dstype_random_valid']=np.where(df['random_dstype']!='train',1,0)

In [34]:
gender_to_idx={j:i+1 for i, j in enumerate(df['gender'].unique())}
age_to_idx={j:i+1 for i,j in enumerate(df['age'].unique())}
title_to_idx={j:i+1 for i,j in enumerate(df['title'].unique())}
genre_to_idx={j:i+1 for i,j in enumerate(df['genre'].unique())}
zip_to_idx={j:i+1 for i,j in enumerate(df['zip'].unique())}

In [35]:
min(gender_to_idx.values()),max(gender_to_idx.values())

(1, 2)

In [36]:
min(age_to_idx.values()),max(age_to_idx.values())

(1, 7)

In [37]:
min(title_to_idx.values()),max(title_to_idx.values())

(1, 3706)

In [38]:
min(genre_to_idx.values()),max(genre_to_idx.values())

(1, 301)

In [39]:
min(zip_to_idx.values()),max(zip_to_idx.values())

(1, 3439)

In [41]:
df['gender_idx']=[gender_to_idx.get(i) for i in df['gender']]
df['age_idx']=[age_to_idx.get(i) for i in df['age']]
df['title_idx']=[title_to_idx.get(i) for i in df['title']]
df['genre_idx']=[genre_to_idx.get(i) for i in df['genre']]
df['zip_idx']=[zip_to_idx.get(i) for i in df['zip']]

In [42]:
df[['gender_idx','age_idx','title_idx','genre_idx','zip_idx']].describe()

Unnamed: 0,gender_idx,age_idx,title_idx,genre_idx,zip_idx
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,1.246389,2.949683,938.8417,57.99979,1015.646
std,0.4309076,2.119191,740.0308,61.91348,795.1754
min,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,344.0,14.0,363.0
50%,1.0,2.0,750.0,36.0,833.0
75%,1.0,5.0,1395.0,80.0,1497.0
max,2.0,7.0,3706.0,301.0,3439.0


In [43]:
df[['gender_idx','age_idx','title_idx','genre_idx','zip_idx']].isnull().sum()

gender_idx    0
age_idx       0
title_idx     0
genre_idx     0
zip_idx       0
dtype: int64

In [44]:
df[['gender_idx','age_idx','title_idx','genre_idx','zip_idx']].describe()

Unnamed: 0,gender_idx,age_idx,title_idx,genre_idx,zip_idx
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,1.246389,2.949683,938.8417,57.99979,1015.646
std,0.4309076,2.119191,740.0308,61.91348,795.1754
min,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,344.0,14.0,363.0
50%,1.0,2.0,750.0,36.0,833.0
75%,1.0,5.0,1395.0,80.0,1497.0
max,2.0,7.0,3706.0,301.0,3439.0


In [46]:
df[['gender_idx','age_idx','title_idx','genre_idx','zip_idx']].dtypes

gender_idx    int64
age_idx       int64
title_idx     int64
genre_idx     int64
zip_idx       int64
dtype: object

In [47]:
df[['gender_idx','age_idx','title_idx','genre_idx','zip_idx']].dtypes

gender_idx    int64
age_idx       int64
title_idx     int64
genre_idx     int64
zip_idx       int64
dtype: object

In [48]:
from scipy.sparse import csr_matrix

In [49]:
from scipy.sparse import csr_matrix

def df_to_sm(data, vars_i, vars_j, var_val):
    grpr_i = data.groupby(vars_i).grouper

    idx_i = grpr_i.group_info[0]

    grpr_j = data.groupby(vars_j).grouper

    idx_j = grpr_j.group_info[0]

    data_sm = csr_matrix((data[var_val].values, (idx_i, idx_j)),
                         shape=(grpr_i.ngroups+1, grpr_j.ngroups+1))

    return data_sm, grpr_i, grpr_j



In [50]:
df.head()

Unnamed: 0,userId,itemId,rating,timestamp,gender,age,occupation,zip,title,genre,random_dstype,user_idx,item_idx,dstype_random_train,dstype_random_valid,gender_idx,age_idx,title_idx,genre_idx,zip_idx
648724,3908,2347,4,965941231,M,25,0,10021,"Pope of Greenwich Village, The (1984)",Action,train,1,1,1,0,1,1,1,1,1
203881,1254,1541,2,974823280,F,18,1,2144,Addicted to Love (1997),Comedy|Romance,train,2,2,1,0,2,2,2,2,2
243394,1467,2617,3,977173059,M,25,5,97007,"Mummy, The (1999)",Action|Adventure|Horror|Thriller,train,3,3,1,0,1,1,3,3,3
788507,4717,3754,3,963502544,M,45,17,48310,"Adventures of Rocky and Bullwinkle, The (2000)",Animation|Children's|Comedy,train,4,4,1,0,1,3,4,4,4
301066,1789,2997,5,974700402,M,1,10,19116,Being John Malkovich (1999),Comedy,train,5,5,1,0,1,4,5,5,5


In [51]:
df_ratings,_,_=df_to_sm(df,['user_idx'],['item_idx'],'rating')

In [52]:
type(df_ratings), df_ratings.shape

(scipy.sparse.csr.csr_matrix, (6041, 3707))

In [53]:
pd.__version__

'0.24.2'

In [54]:
df_ratings.shape

(6041, 3707)

In [55]:
dfflagtrain,_,_=df_to_sm(df,['user_idx'],['item_idx'],'dstype_random_train')


In [56]:
dfflagvalid,_,_=df_to_sm(df,['user_idx'],['item_idx'],'dstype_random_valid')



In [57]:
df_ratings.shape,dfflagtrain.shape,dfflagvalid.shape

((6041, 3707), (6041, 3707), (6041, 3707))

In [58]:
df_ratings.shape,dfflagtrain.shape

((6041, 3707), (6041, 3707))

In [59]:
df_train=df_ratings.multiply(dfflagtrain)

In [60]:
df_valid=df_ratings.multiply(dfflagvalid)

In [61]:
import pickle

In [62]:
pickle.dump([df,df_train,df_valid,df,df_ratings,idx_to_user,\
             idx_to_item,item_to_idx,user_to_idx],open(f'{DATAPATH}/reads.pkl','wb'))

In [63]:
len(user_to_idx), len(item_to_idx)

(6040, 3706)

In [64]:
df_ratings,_,_=df_to_sm(df,['item_idx'],['user_idx'],'rating')

In [65]:
type(df_ratings), df_ratings.shape

(scipy.sparse.csr.csr_matrix, (3707, 6041))

In [66]:
dfflagtrain,_,_=df_to_sm(df,['item_idx'],['user_idx'],'dstype_random_train')


In [67]:
dfflagvalid,_,_=df_to_sm(df,['item_idx'],['user_idx'],'dstype_random_valid')



In [68]:
df_ratings.shape,dfflagtrain.shape,dfflagvalid.shape

((3707, 6041), (3707, 6041), (3707, 6041))

In [69]:
df_train=df_ratings.multiply(dfflagtrain)

In [70]:
df_valid=df_ratings.multiply(dfflagvalid)

In [71]:
pickle.dump([df,df_train,df_valid,df,df_ratings,idx_to_user,\
             idx_to_item,item_to_idx,user_to_idx],open(f'{DATAPATH}/itemreads.pkl','wb'))

In [72]:
ls {DATAPATH}

[0m[01;34minter[0m/  itemreads.pkl  movies.dat  ratings.dat  README  reads.pkl  users.dat
