In [56]:
library(arules)
library(arulesViz)
library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)

In [57]:
check_preds=function(preds,user_id) {
    plist = preds[[1]]
    ulist = users_movies[users_movies$user_id == user_id,"movie_id"][[1]]
    cnt = 0 
    
    for (p in plist) {
        if (grepl(p, ulist, fixed=TRUE)) cnt = cnt+1
    }
        
    return(cnt)
}

In [58]:
create_matrix=function(table,column1,column2){
    return(as(split(table[,column1],table[,column2]),'transactions'))
}

In [59]:
count_preds=function(preds) {
  len=length(preds)
    
  if(len>0 && (preds[[1]]==''))
      0 #avoid counting an empty list
  else
      len
}

In [60]:
format_dataset=function(dataset,columns){
    if(ncol(dataset)==1)
        return(str_split_fixed(dataset$V1,'::',columns))
    else
        return(dataset)  
}

In [61]:
make_pred=function(item,rules_df){
    antecedent=paste('{',item,'} =>',sep='')
    firingrules=rules_df[grep(antecedent,rules_df$rules,fixed=TRUE),1]
    gsub(' ','',toString(sub('\\}','',sub('.* =>\\{','',firingrules))))
}

In [62]:
make_unique=function(items){
    unique(as.list(strsplit(gsub(' ','',items),','))[[1]])
}

In [63]:
directory='data\\1millionfile\\ml-1m\\'

movies='movies.dat'
ratings='ratings.dat'
users='users.dat'

movies=read.csv(paste(c(directory,movies),collapse=''),sep='\n',stringsAsFactors=FALSE,header=FALSE)
ratings=read.csv(paste(c(directory,ratings),collapse=''),sep='\n',stringsAsFactors=FALSE,header=FALSE)
users=read.csv(paste(c(directory,users),collapse=''),sep='\n',stringsAsFactors=FALSE,header=FALSE)

In [64]:
head(movies)

V1
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller


In [65]:
movies=as.data.frame(format_dataset(movies,3))
ratings=as.data.frame(format_dataset(ratings,4))
users=as.data.frame(format_dataset(users,5))

In [66]:
colnames(movies)=c('id','title','genre')
colnames(ratings)=c('user_id','movie_id','rating','timestamp')
colnames(users)=c('id','gender','age','occupation','zip_code')

ratings$user_id=as.integer(ratings$user_id)

In [67]:
movies$year=as.numeric(str_sub(str_trim(movies$title),start=-5,end=-2))
movies$title=substr(movies$title,1,nchar(as.character(movies$title))-7)

In [68]:
genres=unique(unlist(strsplit(as.character(movies$genre),('\\|'))))

In [69]:
for(genre in genres){
  movies[str_c('genre_',genre)]=ifelse((str_detect(movies$genre,genre)|str_detect(movies$genre,'no genre')),1,0)
}

In [70]:
user_item_matrix=create_matrix(ratings,'movie_id','user_id')

In [71]:
summary(ratings)

    user_id        movie_id      rating          timestamp      
 Min.   :   1   2858   :  3428   1: 56174   975528402 :     30  
 1st Qu.:1377   260    :  2991   2:107557   975440712 :     28  
 Median :2987   1196   :  2990   3:261197   975527781 :     28  
 Mean   :2975   1210   :  2883   4:348971   1025585635:     27  
 3rd Qu.:4485   480    :  2672   5:226310   975528243 :     27  
 Max.   :6040   2028   :  2653              975280276 :     26  
                (Other):982592              (Other)   :1000043  

In [72]:
train_ratings=ratings %>%
    select(user_id,movie_id) %>%
    filter(user_id<11)

In [73]:
test_ratings=ratings %>%
    select(user_id,movie_id) %>%
    filter(user_id>10 & user_id<16) %>%
    arrange(order(user_id))

In [74]:
user_movie_matrix=create_matrix(train_ratings,'movie_id','user_id')

In [75]:
rule_param=list(
  supp=0.001,
  conf=0.7,
  maxlen=2
)

In [76]:
rules=apriori(user_movie_matrix,parameter=rule_param)

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.7    0.1    1 none FALSE            TRUE       5   0.001      1
 maxlen target   ext
      2  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 0 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[1080 item(s), 10 transaction(s)] done [0.00s].
sorting and recoding items ... [1080 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2

"Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!"

 done [0.49s].
writing ... [288258 rule(s)] done [0.08s].
creating S4 object  ... done [0.04s].


In [77]:
rules_df=as(rules,'data.frame')

In [78]:
test_ratings$preds=apply(test_ratings,1,function(X) make_pred(X['movie_id'],rules_df))
test_ratings

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [79]:
users_preds=as.data.frame(aggregate(preds~user_id,data=test_ratings,paste,collapse=','))
users_preds$preds=apply(users_preds,1,function(X) make_unique(X['preds']))

In [80]:
users_movies=as.data.frame(aggregate(movie_id~user_id,data=test_ratings,paste,collapse=','))
users_movies$movies=apply(users_movies,1,function(X) make_unique(X['movie_id']))

In [81]:
correct_preds=sum(apply(users_preds,1,function(X) check_preds(X["preds"],X["user_id"])))

In [82]:
total_preds=sum(apply(users_preds,1,function(X) count_preds(X['preds'][[1]])))

In [83]:
precision=correct_preds*100/total_preds

In [84]:
cat("precision=", precision, "corr=",correct_preds,"total=",total_preds)

precision= 0.003107443 corr= 5 total= 160904

In [85]:
inspect(head(sort(rules, by="lift"),3));

    lhs       rhs    support confidence lift count
[1] {281}  => {1952} 0.1     1          10   1    
[2] {1952} => {281}  0.1     1          10   1    
[3] {281}  => {29}   0.1     1          10   1    


In [None]:
plot(rules);
 
head(quality(rules));
 
plot(rules, measure=c("support","lift"), shading="confidence");
 
plot(rules, shading="order", control=list(main ="Two-key plot"));

support,confidence,lift,count
0.7,0.7,1,7
0.8,0.8,1,8
0.8,0.8,1,8
0.1,1.0,10,1
0.1,1.0,10,1
0.1,1.0,10,1
