In [1]:
PATH='/home/kirana/Documents/phd/exp3_autoencoder'

In [6]:
DATAPATH='/home/kirana/Documents/final_dissertation_final/experiments/datasets/filmtrust'

In [3]:
import pandas as pd
import numpy as np
import os
import datetime
import sklearn
import torch

In [7]:
ls {DATAPATH}/*

[0m[01;31m/home/kirana/Documents/final_dissertation_final/experiments/datasets/filmtrust/filmtrust.zip[0m[K
/home/kirana/Documents/final_dissertation_final/experiments/datasets/filmtrust/ratings.txt
/home/kirana/Documents/final_dissertation_final/experiments/datasets/filmtrust/readme.txt
/home/kirana/Documents/final_dissertation_final/experiments/datasets/filmtrust/trust.txt

/home/kirana/Documents/final_dissertation_final/experiments/datasets/filmtrust/filmtrust:


## Read Files

In [9]:
ratings=pd.read_csv(f'{DATAPATH}/ratings.txt',sep=' ',names=['userId','itemId','rating'])
print (ratings.head())


   userId  itemId  rating
0       1       1     2.0
1       1       2     4.0
2       1       3     3.5
3       1       4     3.0
4       1       5     4.0


In [29]:
ratings.shape

(35497, 3)

In [30]:
ratings.drop_duplicates(inplace=True)

In [33]:
ratings[['userId','itemId']].duplicated().value_counts()

False    35494
True         2
dtype: int64

In [34]:
ratings=ratings.groupby(['userId','itemId'],as_index=False)['rating'].mean()

In [35]:
ratings.shape

(35494, 3)

In [36]:
trust=pd.read_csv(f'{DATAPATH}/trust.txt',sep=' ',names=['userId','userId_1','trust'])
print (trust.head())

   userId  userId_1  trust
0       2       966      1
1       2       104      1
2       5      1509      1
3       6      1192      1
4       7      1510      1


In [37]:
ratings.shape, trust.shape

((35494, 3), (1853, 3))

In [53]:
trust_agg=trust.groupby('userId')['trust'].agg(['mean','min','max'])

In [55]:
trust_agg.columns+='_on_other'

In [57]:
trust_agg.head()

Unnamed: 0_level_0,mean_on_other,min_on_other,max_on_other
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,1,1,1
5,1,1,1
6,1,1,1
7,1,1,1
12,1,1,1


In [62]:
trust_agg_on_self=trust.groupby('userId_1')['trust'].agg(['mean','min','max'])

In [63]:
trust_agg_on_self.columns+='_on_self'

In [65]:
trust_agg_on_self.head()

Unnamed: 0_level_0,mean_on_self,min_on_self,max_on_self
userId_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,1,1,1
4,1,1,1
6,1,1,1
13,1,1,1
16,1,1,1


In [66]:
trust_agg_on_self.reset_index(inplace=True)

In [68]:
trust_agg.reset_index(inplace=True)

In [69]:
trust_agg_on_self.rename({'userId_1':'userId'},axis=1,inplace=True)

In [70]:
trust_agg_on_self.head()

Unnamed: 0,userId,mean_on_self,min_on_self,max_on_self
0,2,1,1,1
1,4,1,1,1
2,6,1,1,1
3,13,1,1,1
4,16,1,1,1


In [71]:
ratings.shape

(35494, 3)

In [73]:
ratings=pd.merge(ratings,trust_agg_on_self,on='userId',how='left')

In [74]:
ratings.shape

(35494, 6)

In [76]:
ratings=pd.merge(ratings,trust_agg,on='userId',how='left')

In [77]:
ratings.shape

(35494, 9)

In [78]:
ratings.head()

Unnamed: 0,userId,itemId,rating,mean_on_self,min_on_self,max_on_self,mean_on_other,min_on_other,max_on_other
0,1,1,2.0,,,,,,
1,1,2,4.0,,,,,,
2,1,3,3.5,,,,,,
3,1,4,3.0,,,,,,
4,1,5,4.0,,,,,,


## Random Cross-Validation 90-10 split to compare to papers

In [79]:
from sklearn.model_selection import train_test_split

In [80]:
ratings['rating'].value_counts()

4.00    9169
3.00    7876
3.50    7141
2.50    4392
2.00    3113
1.50    1600
1.00    1141
0.50    1060
3.25       1
2.75       1
Name: rating, dtype: int64

In [81]:
temptrain,tempvalid=train_test_split(ratings,test_size=0.1,train_size=0.9,random_state=11,shuffle=True)

In [82]:
temptrain['random_dstype']='train'
tempvalid['random_dstype']='test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [83]:
df=pd.concat([temptrain,tempvalid],axis=0)

In [84]:
df.shape

(35494, 10)

## Prepare data

In [85]:
user_to_idx={j:i+1 for i, j in enumerate(df['userId'].unique())}
item_to_idx={j:i+1 for i,j in enumerate(df['itemId'].unique())}
idx_to_user={i+1:j for i, j in enumerate(df['userId'].unique())}
idx_to_item={i+1:j for i,j in enumerate(df['itemId'].unique())}

In [86]:
df['user_idx']=[user_to_idx[i] for i in df['userId']]
df['item_idx']=[item_to_idx[i] for i in df['itemId']]

In [87]:
df['dstype_random_train']=np.where(df['random_dstype']=='train',1,0)

In [88]:
df['dstype_random_valid']=np.where(df['random_dstype']!='train',1,0)

In [89]:
df_ratings=df.pivot(index='userId',columns='itemId',values='rating')
df_ratings.fillna(0,inplace=True)

In [90]:
dfflagtrain=df.pivot(index='userId',columns='itemId',values='dstype_random_train')
dfflagtrain.head()
dfflagtrain.fillna(0,inplace=True)
df_train=df_ratings*dfflagtrain

In [92]:
dfflagvalid=df.pivot(index='userId',columns='itemId',values='dstype_random_valid')
dfflagvalid.head()
dfflagvalid.fillna(0,inplace=True)
df_valid=df_ratings*dfflagvalid

In [93]:
df.head()

Unnamed: 0,userId,itemId,rating,mean_on_self,min_on_self,max_on_self,mean_on_other,min_on_other,max_on_other,random_dstype,user_idx,item_idx,dstype_random_train,dstype_random_valid
3297,159,241,0.5,1.0,1.0,1.0,,,,train,1,1,1,0
16463,673,7,3.0,,,,,,,train,2,2,1,0
35390,1502,255,3.5,,,,,,,train,3,3,1,0
5367,224,10,4.0,1.0,1.0,1.0,,,,train,4,4,1,0
13666,557,9,4.0,,,,,,,train,5,5,1,0


In [95]:
import pickle

In [97]:
pickle.dump([df,df_train,df_valid,df,df_ratings,dfflagtrain,dfflagvalid,idx_to_user,\
             idx_to_item,item_to_idx,user_to_idx],open(f'{DATAPATH}/reads.pkl','wb'))

In [99]:
df_ratings=df.pivot(index='itemId',columns='userId',values='rating')
df_ratings.fillna(0,inplace=True)

In [100]:
dfflagtrain=df.pivot(index='itemId',columns='userId',values='dstype_random_train')
dfflagtrain.head()
dfflagtrain.fillna(0,inplace=True)
df_train=df_ratings*dfflagtrain

In [101]:
dfflagvalid=df.pivot(index='itemId',columns='userId',values='dstype_random_valid')
dfflagvalid.head()
dfflagvalid.fillna(0,inplace=True)
df_valid=df_ratings*dfflagvalid

In [102]:
pickle.dump([df,df_train,df_valid,df,df_ratings,dfflagtrain,dfflagvalid,idx_to_user,\
             idx_to_item,item_to_idx,user_to_idx],open(f'{DATAPATH}/itemreads.pkl','wb'))