# Recommender system

## Collaborative filtering

The data set contains information about users, their gender, their age, and which artists they have listened to on Last.FM. We will not use the entire dataset. For simplicity’s sake we only use songs in Germany, and we will transform the data into an item-frequency matrix. This means each row represents a user and each column represents an artist. For this we use R’s “reshape” package. This is largely administrative, so we will start with the transformed dataset.

### Item Based Collaborative Filtering 

In [1]:
# Load the pre-built user x artist listening matrix (path is relative to the
# working directory).
data.germany <- read.csv("data/lastfm-matrix-germany.csv")

In [2]:
# Preview the first six rows of the raw table.
head(data.germany, n = 6L)

user,a.perfect.circle,abba,ac.dc,adam.green,aerosmith,afi,air,alanis.morissette,alexisonfire,⋯,timbaland,tom.waits,tool,tori.amos,travis,trivium,u2,underoath,volbeat,yann.tiersen
1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
33,0,0,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
42,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
51,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
62,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
75,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0


In [3]:
# Keep every column except the one named exactly "user", so that only the
# artist columns remain.
data.germany.clean <- data.germany[, names(data.germany) != "user"]

In [4]:
# Cosine similarity between two equally sized numeric inputs:
#   sum(x * y) / (||x|| * ||y||)
# Returns NaN when either input has zero norm (0 / 0).
calculate_cosine_similarity <- function(x, y) {
  dot_product <- sum(x * y)
  norm_x <- sqrt(sum(x^2))
  norm_y <- sqrt(sum(y^2))
  dot_product / (norm_x * norm_y)
}

In [5]:
# Pre-allocate an NA-filled square matrix with one row and one column per
# artist column of data.germany.clean; it is filled with similarities later.
data.germany.clean.similarity <- local({
  artists <- colnames(data.germany.clean)
  matrix(
    NA,
    nrow = length(artists),
    ncol = length(artists),
    dimnames = list(artists, artists)
  )
})

In [6]:
# Preview the first six rows of the (still empty) similarity matrix.
head(data.germany.clean.similarity, n = 6L)

Unnamed: 0,a.perfect.circle,abba,ac.dc,adam.green,aerosmith,afi,air,alanis.morissette,alexisonfire,alicia.keys,⋯,timbaland,tom.waits,tool,tori.amos,travis,trivium,u2,underoath,volbeat,yann.tiersen
a.perfect.circle,,,,,,,,,,,⋯,,,,,,,,,,
abba,,,,,,,,,,,⋯,,,,,,,,,,
ac.dc,,,,,,,,,,,⋯,,,,,,,,,,
adam.green,,,,,,,,,,,⋯,,,,,,,,,,
aerosmith,,,,,,,,,,,⋯,,,,,,,,,,
afi,,,,,,,,,,,⋯,,,,,,,,,,


In [7]:
# Fill the artist x artist cosine-similarity matrix in one vectorised step.
# For every pair of columns (i, j) this computes
#   sum(x_i * x_j) / (sqrt(sum(x_i^2)) * sqrt(sum(x_j^2)))
# i.e. the same quantity calculate_cosine_similarity() returns for that pair,
# but without looping over every pair of one-column data frames.
# Assigning through `[]` keeps the row/column names of the pre-allocated matrix.
data.germany.clean.similarity[] <- local({
  listens <- as.matrix(data.germany.clean)
  # Euclidean norm of every artist column
  norms <- sqrt(colSums(listens^2))
  # crossprod() gives all pairwise dot products; outer() all norm products.
  # A column with zero norm yields NaN (0 / 0), as the pairwise formula does.
  crossprod(listens) / outer(norms, norms)
})

In [8]:
# Convert the similarity matrix to a data frame so that columns can be
# selected by artist name with single-bracket indexing.
data.germany.clean.similarity <- as.data.frame(x = data.germany.clean.similarity)

In [9]:
# Pre-allocate an NA-filled table with one named row per artist and 11
# unnamed columns.
data.germany.neighbours <- local({
  artists <- colnames(data.germany.clean.similarity)
  matrix(NA, nrow = length(artists), ncol = 11, dimnames = list(artists))
})

In [10]:
# Preview the first six similarities in the "abba" column.
head(data.germany.clean.similarity["abba"], n = 6L)

Unnamed: 0,abba
a.perfect.circle,0.0
abba,1.0
ac.dc,0.05227877
adam.green,0.02507061
aerosmith,0.06105625
afi,0.0


In [11]:
# Return the artists most similar to `name`.
#
# Arguments:
#   name   Column name of an artist in data.germany.clean.similarity.
#   n_top  Number of neighbours wanted.
#
# Returns a one-column data frame (column `name`, artists as row names) with
# the n_top + 1 highest similarities in decreasing order. The extra row is
# there because the artist itself is normally ranked first.
#
# The ranking is taken from the artist's column as a plain numeric vector.
# Calling order() on a one-row data frame (sim[name, ]) triggers
# "cannot xtfrm data frames" on current R versions and is only correct
# because the similarity table is symmetric.
find_top_similar <- function(name, n_top) {
  ranking <- order(data.germany.clean.similarity[[name]], decreasing = TRUE)
  head(data.germany.clean.similarity[ranking, name, drop = FALSE], n = n_top + 1)
}

In [12]:
# Show "abba" together with its five nearest neighbours.
find_top_similar(name = "abba", n_top = 5)

Unnamed: 0,abba
abba,1.0
madonna,0.2416561
robbie.williams,0.2053985
elvis.presley,0.1917988
michael.jackson,0.1878846
queen,0.1794268


### User Based Collaborative Filtering

In [13]:
# Similarity-weighted average of a listening history:
#   sum(history * similarities) / sum(similarities)
# Returns NaN when the similarities sum to zero (0 / 0).
getScore <- function(history, similarities) {
  weighted_hits <- sum(history * similarities)
  weighted_hits / sum(similarities)
}

In [14]:
# Pre-allocate the user x artist score table: one row per row of data.germany
# (named by user id) and one column per artist (every column but the first).
holder <- local({
  artists <- colnames(data.germany[-1])
  matrix(
    NA,
    nrow = nrow(data.germany),
    ncol = ncol(data.germany) - 1,
    dimnames = list(data.germany$user, artists)
  )
})

In [15]:
# Score every (user, artist) pair: the score is the similarity-weighted share
# of the artist's 10 nearest neighbours that the user has listened to.
# Artists the user has already listened to are marked NA so they are never
# ranked ahead of real scores. NA (rather than an empty string) keeps the
# whole table numeric; a single "" would coerce every score to character and
# make any later sort lexicographic.

# An artist's neighbourhood depends only on the similarity table, so it is
# computed once per artist here instead of once per (user, artist) pair.
topN.cache <- lapply(colnames(holder), function(product) {
  ranking <- order(data.germany.clean.similarity[, product], decreasing = TRUE)
  # Top 11 rows: the best match plus 10 neighbours
  topN <- head(data.germany.clean.similarity[ranking, product, drop = FALSE], n = 11)
  # Drop the first entry, which is expected to be the artist itself
  list(
    names = as.character(rownames(topN))[-1],
    similarities = as.numeric(topN[, 1])[-1]
  )
})

# Loop through the users (rows)
for (i in seq_len(nrow(holder))) {
  # Look the user up by id rather than by position, so the result does not
  # depend on holder and data.germany being sorted the same way
  user <- rownames(holder)[i]
  user.history <- data.germany[data.germany$user == user, ]

  # Loop through the products (columns)
  for (j in seq_len(ncol(holder))) {
    product <- colnames(holder)[j]

    if (as.integer(user.history[[product]]) == 1) {
      # Already consumed: nothing to recommend
      holder[i, j] <- NA_real_
    } else {
      # The product's 10 nearest neighbours and their similarities
      topN.names <- topN.cache[[j]]$names
      topN.similarities <- topN.cache[[j]]$similarities

      # The user's listening history for those neighbours
      topN.userPurchases <- as.numeric(user.history[topN.names])

      # Score for this product and this user
      holder[i, j] <- getScore(similarities = topN.similarities, history = topN.userPurchases)
    }
  } # end product for loop
} # end user for loop

data.germany.user.scores <- holder

In [18]:
# Rank all artists by similarity to the artist currently held in `product`
# and keep the 11 best rows of that artist's column.
topN <- head(
  data.germany.clean.similarity[
    order(data.germany.clean.similarity[, product], decreasing = TRUE),
  ][product],
  n = 11
)

# Split into names and similarities, discarding the top-ranked row in both
# (it is expected to be the artist itself), which leaves 10 neighbours.
topN.names <- as.character(rownames(topN))[-1]
topN.similarities <- as.numeric(topN[, 1])[-1]

In [19]:
# Restrict the listening table to the user id plus the neighbour artists ...
topN.purchases <- data.germany[, c("user", topN.names)]
# ... then take the row of the user currently held in `user`, without the id
# column, as a plain numeric vector.
topN.userPurchases <- as.numeric(
  topN.purchases[topN.purchases$user == user, names(topN.purchases) != "user"]
)

In [20]:
# Store the score for the (user, artist) cell addressed by the current i and j.
holder[i, j] <- getScore(history = topN.userPurchases, similarities = topN.similarities)

In [21]:
# For every user, list the names of the 100 best-scoring artists, best first.
# The result has one row per user (named by user id) and 100 columns.
data.germany.user.scores.holder <- matrix(
  NA,
  nrow = nrow(data.germany.user.scores),
  ncol = 100,
  dimnames = list(rownames(data.germany.user.scores))
)
for (i in seq_len(nrow(data.germany.user.scores))) {
  # Rank on numeric values: if the score table holds character data, ordering
  # it directly would sort lexicographically. Empty strings become NA here,
  # and order() puts NA last.
  user.scores <- as.numeric(data.germany.user.scores[i, ])
  ranking <- order(user.scores, decreasing = TRUE)
  # Indexing with seq_len(100) pads with NA when there are fewer than 100
  # artists, instead of recycling or failing on the row assignment.
  data.germany.user.scores.holder[i, ] <-
    colnames(data.germany.user.scores)[ranking[seq_len(100)]]
}

In [23]:
# Preview the recommendations for the first six users.
head(data.germany.user.scores.holder, n = 6L)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
1,flogging.molly,coldplay,aerosmith,the.beatles,moby,mando.diao,ac.dc,bob.marley...the.wailers,korpiklaani,oasis,⋯,peter.fox,the.wombats,guns.n.roses,massive.attack,jamiroquai,james.morrison,rage.against.the.machine,mgmt,pink.floyd,audioslave
33,peter.fox,gentleman,red.hot.chili.peppers,kings.of.leon,flyleaf,oasis,beatsteaks,the.killers,jason.mraz,babyshambles,⋯,bjork,nine.inch.nails,blur,beastie.boys,billy.talent,editors,good.charlotte,goldfrapp,hot.chip,radiohead
42,oomph.,lacuna.coil,rammstein,schandmaul,sonata.arctica,subway.to.sally,apocalyptica,marilyn.manson,pink,linkin.park,⋯,bob.dylan,bob.marley,bob.marley...the.wailers,breaking.benjamin,bright.eyes,bruce.springsteen,cat.power,christina.aguilera,clueso,cocorosie
51,the.subways,the.kooks,the.hives,franz.ferdinand,jack.johnson,bloc.party,foo.fighters,amy.winehouse,the.white.stripes,deichkind,⋯,amon.amarth,anti.flag,aphex.twin,apocalyptica,arch.enemy,as.i.lay.dying,atb,atreyu,audioslave,avril.lavigne
62,mando.diao,the.fratellis,jack.johnson,incubus,peter.fox,oasis,the.wombats,foo.fighters,the.offspring,timbaland,⋯,millencolin,madonna,disturbed,the.chemical.brothers,norah.jones,hoobastank,dire.straits,beyonce,johnny.cash,simple.plan
75,hoobastank,papa.roach,the.prodigy,sum.41,good.charlotte,blink.182,hans.zimmer,groove.coverage,three.days.grace,afi,⋯,amy.macdonald,amy.winehouse,aphex.twin,apocalyptica,arcade.fire,arch.enemy,arctic.monkeys,babyshambles,beck,beirut


## References

http://www.salemmarafi.com/code/collaborative-filtering-r/