In [87]:
library(arules)
library(arulesViz)
library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)

In [88]:
#' Count how many predicted movies a user actually rated.
#'
#' @param preds A length-one container (list or vector slice) whose first
#'   element holds the predicted movie ids for one user.
#' @param user_id Id of the user. It is looked up in the global
#'   `users_movies` data frame, whose `movie_id` column holds the user's
#'   movie ids as one comma-separated string.
#' @return The number of predictions that are exactly equal to one of the
#'   user's movie ids; 0 when there is nothing to compare.
check_preds <- function(preds, user_id) {
  if (length(preds) == 0) {
    return(0)
  }

  predicted <- as.character(unlist(preds[[1]]))
  # Empty strings stand for "no rule fired"; they are not predictions.
  predicted <- predicted[!is.na(predicted) & nzchar(predicted)]
  if (length(predicted) == 0) {
    return(0)
  }

  # Compare ids as trimmed text so values that were padded or wrapped while
  # passing through apply()/as.matrix() still match.
  wanted <- trimws(as.character(unlist(user_id)))[1]
  rows <- which(trimws(as.character(users_movies$user_id)) == wanted)
  if (length(rows) == 0) {
    return(0)
  }

  # Split the id string into tokens: a substring search would treat "1" as a
  # hit inside "10" or "21".
  watched_raw <- as.character(users_movies[["movie_id"]][rows[1]])
  watched <- strsplit(gsub(" ", "", watched_raw), ",", fixed = TRUE)[[1]]

  sum(predicted %in% watched)
}

In [89]:
#' Build an arules `transactions` object from two columns of a table.
#'
#' @param table Table to read from.
#' @param column1 Column whose values become the items.
#' @param column2 Column whose values define the groups (one transaction
#'   per distinct value).
#' @return The grouped items coerced to class `transactions`.
create_matrix <- function(table, column1, column2) {
  items <- table[, column1]
  groups <- table[, column2]
  baskets <- split(items, groups)
  as(baskets, "transactions")
}

In [90]:
#' Count the real predictions in a prediction vector.
#'
#' Empty strings (and NA) are placeholders for "no prediction" and are
#' ignored wherever they occur, not only in the first position.
#'
#' @param preds Character vector (or list) of predicted ids.
#' @return Number of non-empty, non-missing entries.
count_preds <- function(preds) {
  entries <- as.character(unlist(preds))
  sum(!is.na(entries) & nzchar(entries))
}

In [91]:
#' Split a raw single-column dataset into its '::'-delimited fields.
#'
#' @param dataset Data frame. When it has exactly one column, that column
#'   (whatever its name) is split on '::'.
#' @param columns Number of fields to split each line into.
#' @return A character matrix with `columns` columns when `dataset` has a
#'   single column; otherwise `dataset` unchanged.
format_dataset <- function(dataset, columns) {
  if (ncol(dataset) != 1) {
    return(dataset)
  }

  # Use the only column by position rather than assuming it is named V1.
  raw_lines <- as.character(dataset[[1]])
  str_split_fixed(raw_lines, "::", columns)
}

In [92]:
#' Predict items from the rules whose antecedent is exactly `item`.
#'
#' @param item Item id to look up.
#' @param rules_df Data frame with a `rules` column holding rule labels of
#'   the form `{lhs} => {rhs}`.
#' @return A single string with the consequents of all firing rules,
#'   comma-separated and without spaces; "" when no rule fires.
make_pred <- function(item, rules_df) {
  antecedent <- paste0("{", item, "} =>")
  labels <- as.character(rules_df$rules)
  firing <- labels[grep(antecedent, labels, fixed = TRUE)]

  # Drop everything up to the opening brace of the consequent. Whitespace
  # after "=>" is optional so both "=>{" and "=> {" labels are handled.
  consequents <- sub(".*=>[[:space:]]*\\{", "", firing)
  consequents <- sub("\\}", "", consequents)

  gsub(" ", "", toString(consequents))
}

In [93]:
#' Split a comma-separated string into its distinct tokens.
#'
#' @param items Comma-separated string; only the first element is used.
#' @return Character vector of tokens with spaces removed and duplicates
#'   dropped, in order of first appearance.
make_unique <- function(items) {
  compact <- gsub(" ", "", items)
  tokens <- strsplit(compact, ",")[[1]]
  tokens[!duplicated(tokens)]
}

In [94]:
# Folder holding the ml-1m data files. Forward slashes are accepted on
# Windows as well as on Unix-likes, unlike backslash separators.
directory <- "data/1millionfile/ml-1m/"

# Read each file with newline as the field separator, so every row is one
# raw line in a single column (V1); the lines are split into fields later.
movies <- read.csv(
  paste0(directory, "movies.dat"),
  sep = "\n", stringsAsFactors = FALSE, header = FALSE
)
ratings <- read.csv(
  paste0(directory, "ratings.dat"),
  sep = "\n", stringsAsFactors = FALSE, header = FALSE
)
users <- read.csv(
  paste0(directory, "users.dat"),
  sep = "\n", stringsAsFactors = FALSE, header = FALSE
)

In [95]:
# Preview the first rows: at this point each row is one raw '::'-delimited
# line in the single column V1.
head(movies)

V1
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller


In [96]:
# Split the raw lines into 3, 4 and 5 fields respectively. Keep the fields
# as character: on R < 4.0 the default would turn them into factors, and a
# later as.integer() on a factor yields level codes instead of the ids.
movies <- as.data.frame(format_dataset(movies, 3), stringsAsFactors = FALSE)
ratings <- as.data.frame(format_dataset(ratings, 4), stringsAsFactors = FALSE)
users <- as.data.frame(format_dataset(users, 5), stringsAsFactors = FALSE)

In [None]:
# Give the split fields their column names.
colnames(movies) <- c("id", "title", "genre")
colnames(ratings) <- c("user_id", "movie_id", "rating", "timestamp")
colnames(users) <- c("id", "gender", "age", "occupation", "zip_code")

# Go through character first: as.integer() on a factor would return the
# internal level codes rather than the user ids.
ratings$user_id <- as.integer(as.character(ratings$user_id))

In [None]:
# The year is the four characters just before the closing parenthesis at
# the end of the (trimmed) title.
movies$year <- as.numeric(
  str_sub(str_trim(movies$title), start = -5, end = -2)
)
# Drop the last seven characters of the title (the " (yyyy)" suffix).
movies$title <- substr(
  movies$title,
  1,
  nchar(as.character(movies$title)) - 7
)

In [None]:
# Distinct genre names: every movie lists its genres separated by '|'.
genres <- unique(
  unlist(strsplit(as.character(movies$genre), "|", fixed = TRUE))
)

In [None]:
# Add one 0/1 indicator column per genre, named genre_<name>. Genre names
# are matched literally with fixed() so any regex metacharacters in a name
# cannot change the match.
for (genre in genres) {
  has_genre <- str_detect(movies$genre, fixed(genre)) |
    str_detect(movies$genre, fixed("no genre"))
  movies[str_c("genre_", genre)] <- ifelse(has_genre, 1, 0)
}

In [None]:
# Transactions over the full ratings table: one per user, items = movies.
user_item_matrix <- create_matrix(
  table = ratings,
  column1 = "movie_id",
  column2 = "user_id"
)

In [None]:
# Per-column summary of the ratings table.
summary(ratings)

In [None]:
# Training set: (user, movie) pairs for users with id below 11.
train_ratings <- ratings %>%
  filter(user_id < 11) %>%
  select(user_id, movie_id)

In [None]:
# Test set: (user, movie) pairs for users 11 to 15, sorted by user id.
# Sort on the column itself: arrange(order(user_id)) would sort by the
# permutation indices, which is not an ordering by user_id in general.
test_ratings <- ratings %>%
  select(user_id, movie_id) %>%
  filter(user_id > 10 & user_id < 16) %>%
  arrange(user_id)

In [None]:
# Transactions for the training users only.
user_movie_matrix <- create_matrix(
  table = train_ratings,
  column1 = "movie_id",
  column2 = "user_id"
)

In [None]:
# Mining parameters handed to apriori(): support and confidence settings.
rule_param <- list(supp = 0.01, conf = 0.75)

In [None]:
# Mine association rules from the training transactions.
rules <- apriori(data = user_movie_matrix, parameter = rule_param)

In [None]:
# Data-frame view of the rules, used by make_pred() for text lookups.
rules_df <- as(object = rules, Class = "data.frame")

In [None]:
# One prediction string per test rating. Iterate over the movie_id column
# directly: apply() on a data frame first coerces it to a matrix, which can
# reformat ids, and it does not return a character vector for zero rows.
test_ratings$preds <- vapply(
  as.character(test_ratings$movie_id),
  function(movie) make_pred(movie, rules_df),
  character(1),
  USE.NAMES = FALSE
)
test_ratings

In [None]:
# Join every user's prediction strings into one comma-separated string.
users_preds <- as.data.frame(
  aggregate(preds ~ user_id, data = test_ratings, paste, collapse = ",")
)
# Always store the distinct tokens as a list column. apply() would return a
# matrix whenever every user happens to have the same number of tokens,
# which cannot be assigned as one value per row.
users_preds$preds <- lapply(as.character(users_preds$preds), make_unique)

In [None]:
# Join every test user's rated movie ids into one comma-separated string.
users_movies <- as.data.frame(
  aggregate(movie_id ~ user_id, data = test_ratings, paste, collapse = ",")
)
# Distinct ids per user as a list column. lapply() keeps one entry per row
# even when all users have the same number of movies, where apply() would
# return a matrix instead.
users_movies$movies <- lapply(as.character(users_movies$movie_id), make_unique)

In [None]:
# Total number of correct predictions over all test users. Rows are visited
# by index so the columns keep their types; apply() would push the data
# frame through as.matrix() first.
correct_preds <- sum(vapply(
  seq_len(nrow(users_preds)),
  function(i) {
    as.numeric(check_preds(users_preds$preds[i], users_preds$user_id[i]))
  },
  numeric(1)
))

In [None]:
# Total number of predictions made over all test users. Rows are visited by
# index so the prediction column is read as stored; apply() would push the
# data frame through as.matrix() first.
total_preds <- sum(vapply(
  seq_len(nrow(users_preds)),
  function(i) as.numeric(count_preds(users_preds$preds[[i]])),
  numeric(1)
))

In [None]:
# Share of predictions that were correct, as a percentage.
precision <- 100 * correct_preds / total_preds

In [None]:
# Print the precision together with the two counts it was computed from.
cat(
  "precision=", precision,
  "corr=", correct_preds,
  "total=", total_preds
)

In [None]:
# Show the first three rules after sorting by lift.
inspect(head(sort(rules, by = "lift"), n = 3))

In [None]:
# Default plot of the rule set.
plot(rules)

# First rows of the rules' quality measures.
head(quality(rules))

# Support against lift, shaded by confidence.
plot(rules, measure = c("support", "lift"), shading = "confidence")

# Same plot type shaded by rule order, with a custom title.
plot(rules, shading = "order", control = list(main = "Two-key plot"))