In [None]:
library(arules)
library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)

In [358]:
# Build an arules 'transactions' object from a long-format table.
#   table   : data frame with one row per (group, item) pair
#   column1 : name of the column holding the items
#   column2 : name of the column identifying the transaction (group)
# Returns the values of `column1` grouped by `column2`, coerced to
# 'transactions'.
create_matrix <- function(table, column1, column2) {
    items <- table[, column1]
    groups <- table[, column2]
    baskets <- split(items, groups)
    as(baskets, 'transactions')
}

In [359]:
# Split a raw single-column dataset into its '::'-separated fields.
#   dataset : data frame; when it has exactly one column, that column is
#             read as `V1` and holds the raw lines
#   columns : number of fields to split each raw line into
# Returns a character matrix with `columns` columns when `dataset` has a
# single column; otherwise returns `dataset` unchanged.
format_dataset <- function(dataset, columns) {
    # More than one column means the data is already split: pass it through.
    if (ncol(dataset) != 1) {
        return(dataset)
    }
    str_split_fixed(dataset$V1, '::', columns)
}

In [426]:
# Collect the consequents of every rule whose antecedent is exactly `{item}`.
#   item     : item label as it appears inside the braces of a rule
#   rules_df : data frame whose first column, named `rules`, holds rule
#              strings of the form "{lhs} => {rhs}"
# Returns one comma-separated string of consequents with all spaces removed
# ("" when no rule fires).
make_pred <- function(item, rules_df) {
  # Literal text to look for, e.g. "{42} =>"; the braces keep "{4}" from
  # matching "{42}".
  lhs_text <- paste0("{", item, "} =>")
  fired <- grep(lhs_text, rules_df$rules, fixed = TRUE)
  matched_rules <- rules_df[fired, 1]

  # Drop everything up to the opening brace of the consequent, then the
  # closing brace.
  rhs_only <- sub(".*=> \\{", "", matched_rules)
  rhs_only <- sub("\\}", "", rhs_only)

  gsub(" ", "", toString(rhs_only))
}

In [360]:
# Folder holding the ml-1m data files. Forward slashes are used because R
# resolves them on every platform (Windows included); the escaped backslashes
# used previously only worked on Windows. The trailing slash is kept so the
# file names can simply be appended.
directory <- 'data/1millionfile/ml-1m/'

# File names; each variable is overwritten below with the file's contents.
movies <- 'movies.dat'
ratings <- 'ratings.dat'
users <- 'users.dat'

# Read each file with the newline as the only separator, so every raw line
# lands unsplit in a single column (V1). The '::'-delimited fields are split
# afterwards by format_dataset().
movies <- read.csv(paste0(directory, movies), sep = '\n',
                   stringsAsFactors = FALSE, header = FALSE)
ratings <- read.csv(paste0(directory, ratings), sep = '\n',
                    stringsAsFactors = FALSE, header = FALSE)
users <- read.csv(paste0(directory, users), sep = '\n',
                  stringsAsFactors = FALSE, header = FALSE)

In [292]:
# Preview the first rows of `movies`.
# NOTE(review): the output recorded below already contains the `year` and
# `genre_*` columns, which are only added by later cells -- this cell was
# presumably executed after them in an earlier session; verify before relying
# on the recorded output.
head(movies)

id,title,genre,year,genre_Animation,genre_Children's,genre_Comedy,genre_Adventure,genre_Fantasy,genre_Romance,...,genre_Crime,genre_Thriller,genre_Horror,genre_Sci-Fi,genre_Documentary,genre_War,genre_Musical,genre_Mystery,genre_Film-Noir,genre_Western
1,Toy Story,Animation|Children's|Comedy,1995,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji,Adventure|Children's|Fantasy,1995,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men,Comedy|Romance,1995,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Waiting to Exhale,Comedy|Drama,1995,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II,Comedy,1995,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Heat,Action|Crime|Thriller,1995,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [361]:
# Split each raw dataset into its fields (3 for movies, 4 for ratings,
# 5 for users) and store the result as a data frame.
movies <- movies %>% format_dataset(3) %>% as.data.frame()
ratings <- ratings %>% format_dataset(4) %>% as.data.frame()
users <- users %>% format_dataset(5) %>% as.data.frame()

In [392]:
# Give the split columns meaningful names.
colnames(movies) <- c('id', 'title', 'genre')
colnames(ratings) <- c('user_id', 'movie_id', 'rating', 'timestamp')
colnames(users) <- c('id', 'gender', 'age', 'occupation', 'zip_code')

# Convert the user id to integer so it can be compared numerically.
# Going through as.character() first matters when the column is a factor
# (as.data.frame() creates factors from strings in R < 4.0): as.integer() on
# a factor returns the internal level codes, not the ids written in the file.
# For a character column the extra step is a no-op.
ratings$user_id <- as.integer(as.character(ratings$user_id))

In [363]:
# Titles end with the release year in parentheses, e.g. "Toy Story (1995)".
# Trim the title once and use the trimmed value for BOTH steps; previously
# only the year was read from the trimmed title, so trailing whitespace
# would have left part of the " (YYYY)" suffix in the title.
movies$title <- str_trim(as.character(movies$title))

# The year is the 4 characters just before the closing parenthesis.
movies$year <- as.numeric(str_sub(movies$title, start = -5, end = -2))

# Drop the last 7 characters, i.e. the " (YYYY)" suffix.
movies$title <- str_sub(movies$title, end = -8)

In [364]:
# Distinct genre names: each movie's genre field is a '|'-separated list.
genres <- movies$genre %>%
  as.character() %>%
  strsplit('\\|') %>%
  unlist() %>%
  unique()

In [365]:
# Add one 0/1 indicator column per genre, named `genre_<name>`.
# A movie whose genre field contains 'no genre' is flagged for every genre;
# that test does not depend on the loop variable, so it is computed once.
no_genre <- str_detect(movies$genre, fixed('no genre'))

for (genre in genres) {
  # fixed(): genre names are literal text. Without it the name is interpreted
  # as a regular expression, which silently mis-matches as soon as a name
  # contains a metacharacter such as '.', '+' or '('.
  has_genre <- str_detect(movies$genre, fixed(genre))
  movies[str_c('genre_', genre)] <- ifelse(has_genre | no_genre, 1, 0)
}

In [366]:
# Transactions over all ratings: one basket of movie ids per user.
user_item_matrix <- create_matrix(
  table = ratings,
  column1 = 'movie_id',
  column2 = 'user_id'
)

In [410]:
# Per-column summary of `ratings`.
# NOTE(review): the recorded output summarises `movie_id` as a number, but no
# visible cell converts that column -- presumably done in a cell not shown.
# Verify how: as.integer() applied to a factor yields level codes rather than
# the original ids.
summary(ratings)

    user_id        movie_id    rating          timestamp      
 Min.   :   1   Min.   :   1   1: 56174   975528402 :     30  
 1st Qu.:1506   1st Qu.: 675   2:107557   975440712 :     28  
 Median :3070   Median :1622   3:261197   975527781 :     28  
 Mean   :3025   Mean   :1673   4:348971   1025585635:     27  
 3rd Qu.:4476   3rd Qu.:2594   5:226310   975528243 :     27  
 Max.   :6040   Max.   :3706              975280276 :     26  
                                          (Other)   :1000043  

In [412]:
# Training split: (user, movie) pairs for users with id below 4476.
train_ratings <- ratings %>%
    filter(user_id < 4476) %>%
    select(user_id, movie_id)

In [413]:
# Test split: (user, movie) pairs for users with id above 4475, i.e. the
# exact complement of the training split, sorted by user id.
#
# arrange(user_id) replaces arrange(order(user_id)): order() returns a
# permutation of row indices, and sorting the rows by those index values does
# not sort them by user id. The filter now compares `user_id` directly, the
# same way the training split does, so the two splits cannot disagree.
test_ratings <- ratings %>%
    select(user_id, movie_id) %>%
    filter(user_id > 4475) %>%
    arrange(user_id)

In [414]:
# Transactions for rule mining: one basket of movie ids per training user.
user_movie_matrix <- create_matrix(
  table = train_ratings,
  column1 = 'movie_id',
  column2 = 'user_id'
)

In [415]:
# Mining parameters handed to apriori(): minimum support, minimum confidence
# and maximum number of items per rule.
rule_param <- list(supp = 0.001, conf = 0.7, maxlen = 2)

In [420]:
# Mine association rules from the training transactions.
rules <- apriori(data = user_movie_matrix, parameter = rule_param)

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.7    0.1    1 none FALSE            TRUE       5   0.001      1
 maxlen target   ext
      2  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 4 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3672 item(s), 4475 transaction(s)] done [0.09s].
sorting and recoding items ... [3366 item(s)] done [0.01s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2

"Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!"

 done [0.39s].
writing ... [88336 rule(s)] done [0.09s].
creating S4 object  ... done [0.02s].


In [422]:
# Flatten the mined rules into a data frame so they can be searched as text.
rules_df <- methods::as(rules, Class = 'data.frame')

In [None]:
# Attach to every test rating the movies predicted from its movie id.
#
# The rules are looked up once per distinct movie id and the result is mapped
# back onto the rows, instead of re-scanning all rules for every rating.
# The ids are taken straight from the column rather than through apply(),
# which converts the whole data frame to a single-type matrix first.
# local() keeps the helper variables out of the global environment.
test_ratings$preds <- local({
  movie_ids <- as.character(test_ratings$movie_id)
  unique_ids <- unique(movie_ids)
  preds_by_movie <- vapply(
    unique_ids, make_pred, character(1),
    rules_df = rules_df, USE.NAMES = FALSE
  )
  preds_by_movie[match(movie_ids, unique_ids)]
})

In [372]:
# extract unique predictions for each test user
userpreds = as.data.frame(aggregate(preds ~ basketID, data = testegs, paste, collapse=","))
userpreds$preds = apply(userpreds,1,function(X) uniqueitems(X["preds"]))

# extract unique items bought (or rated highly) for each test user
baskets = as.data.frame(aggregate(items ~ basketID, data = testegs, paste, collapse=","))
baskets$items = apply(baskets,1,function(X) uniqueitems(X["items"]))

#count how many unique predictions made are correct, i.e. have previously been bought (or rated highly) by the user
correctpreds = sum(apply(userpreds,1,function(X) checkpreds(X["preds"],X["basketID"])))

# count total number of unique predictions made
totalpreds = sum(apply(userpreds,1,function(X) countpreds(X["preds"][[1]]))) 

precision = correctpreds*100/totalpreds

cat("precision=", precision, "corr=",correctpreds,"total=",totalpreds)