# Recommendation System

In [77]:
#Libraries
import pandas as pd
import numpy as np
from random import *

In [78]:
#Load the dataset
data=pd.read_csv("facebook_combined.txt",sep=" ", header=None)

#Add column names
data.columns = ["node1", "node2"]

In [79]:
#Transform the graph to undirected
data2=pd.concat([data.node2,data.node1], axis=1)

#Rename the columns in order to merge the columns
data2.columns= ["node1", "node2"]
data=data.append(data2)

#Reset indexes
data = data.reset_index(drop=True)

In [61]:
#Create a sample graph dataset
test_data = pd.DataFrame([[5, 2], 
                       [9, 3],
                       [9, 11],
                       [3, 6],
                       [4, 6],
                       [5, 7],
                       [1, 11],
                       [6, 2],
                       [7, 9],
                       [8, 9],
                       [5, 11],
                       [6, 7],
                       [6, 11],
                       [7, 6],
                       [2, 11],
                       [11,2],
                       [2, 5],
                       [6, 2],
                       [2, 7],
                       [7, 2]],
                      columns=["node1", "node2"])

#### Recommending friends using Common neighbors (friend-of-friend (FoF) method)

In [62]:
##### Create the function for Common neighbors

def friendOfFriend(users, dataset, target):
    #Initialize
    l=list()
    friendships={}

    #Create friendships dict
    for node in users:
        #Create a list with the friends of node
        ls=dataset[dataset.node1 == node]['node2'].tolist()

        #Create a dictionary with key the node and value the list
        friendships[node]=ls

    # Initialize a dictionary with the intersections
    inter={}

    #Intersection between users
    for j in friendships:
        if (target != j) and (target not in friendships[j]) :
            intersection=(len(set(friendships.get(target)).intersection(set(friendships.get(j)))))
            
            #Keep intersection into a list
            inter[j]=intersection
   
    #Create a sorted list, in ties we take the smallest ID
    lis=sorted(inter, key=inter.get, reverse=True)

    #Final Result
    return(lis[0:10]);
    

In [63]:
##### Test the code for the Sample
users=[1,2,3,4,5,6,7,8,9,10,11]
friendOfFriend(users, test_data, 5)

[6, 1, 7, 9, 11, 3, 4, 8, 10]

In [67]:
##### Test the code for the Original Dataset
users=list(range(0,4038))
print("The suggestions of 107 are:", friendOfFriend(users, data,107))
print("The suggestions of 1126 are:", friendOfFriend(users, data,1126))
print("The suggestions of 14 are:", friendOfFriend(users, data,14))
print("The suggestions of 35 are:", friendOfFriend(users, data,35))

The suggestions of 107 are: [513, 400, 559, 373, 492, 500, 378, 436, 431, 514]
The suggestions of 1126 are: [916, 1238, 1750, 1230, 1004, 1791, 1530, 1172, 1570, 1597]
The suggestions of 14 are: [2, 17, 140, 111, 137, 162, 19, 333, 44, 243]
The suggestions of 35 are: [46, 68, 99, 131, 175, 177, 225, 227, 278, 321]


###  Recommending friends using Jaccard coefficient

In [30]:
##### Create the function for Common neighbors using Jaccard coefficient

def JaccardCoefficient(users, dataset, target):
    #Initialize
    l=list()
    friendships={}

    #Create friendships dict
    for node in users:
        #Create a list with the friends of node
        ls=dataset[dataset.node1 == node]['node2'].tolist()

        #Create a dictionary with key the node and value the list
        friendships[node]=ls

    # Initialize a dictionary with the intersections
    inter={}

    #Intersection between users
    for j in friendships:
        if (target != j) and (target not in friendships[j]) :
            
            # Create union
            union=len(set(friendships.get(target)).union(set(friendships.get(j))))
            
            # Check for No zero denominator
            if (union != 0) :
                inter[j]=len(set(friendships.get(target)).intersection(set(friendships.get(j))))/union

    #Create a sorted list, in ties we take the smallest ID
    lis=sorted(inter, key=inter.get, reverse=True)

    #Final Result
    return(lis[0:10]);
    

In [9]:
##### Test the code for the Sample
users=[1,2,3,4,5,6,7,8,9,10,11]
JaccardCoefficient(users, test_data, 5)

[6, 1, 11, 9, 7, 3, 4, 8, 10]

In [10]:
##### Test the code for the Original Dataset
users=list(range(0,4038))

print("The suggestions of 107 are:", JaccardCoefficient(users, data,107))
print("The suggestions of 1126 are:", JaccardCoefficient(users, data,1126))
print("The suggestions of 14 are:", JaccardCoefficient(users, data,14))
print("The suggestions of 35 are:", JaccardCoefficient(users, data,35))

The suggestions of 107 are: [513, 400, 559, 492, 500, 373, 436, 378, 515, 514]
The suggestions of 1126 are: [916, 1750, 1230, 1530, 1004, 1238, 1172, 1791, 1789, 1597]
The suggestions of 14 are: [2, 140, 17, 162, 111, 333, 44, 137, 19, 243]
The suggestions of 35 are: [321, 11, 12, 15, 18, 37, 43, 74, 114, 209]


### Recommending friends using Adamic and Adar function

In [31]:
def AdamicAdarFunction(users, dataset, target):
    #Initialize
    l=list()
    friendships={}

    #Create friendships dict
    for node in users:
        #Create a list with the friends of node
        ls=dataset[dataset.node1 == node]['node2'].tolist()

        #Create a dictionary with key the node and value the list
        friendships[node]=ls

    # Initialize a dictionary with the intersections
    inter={}

    #Intersection between users
    for j in friendships:
        if (target != j) and (target not in friendships[j]) :
            intersection = set(friendships.get(target)).intersection(set(friendships.get(j)))

            # Adamic and Adar score calculation
            sum = 0
            for k in intersection :
                if (k in friendships.keys()) and (friendships[k] != []) and len(friendships[k]) != 1:
                    sum = sum+1/np.log(len(friendships[k]))

            inter[j]=sum
   
    #Create a sorted list, in ties we take the smallest ID
    lis=sorted(inter, key=inter.get, reverse=True)

    #Final Result
    return(lis[0:10]);

In [12]:
##### Test the code for the Sample
users=[1,2,3,4,5,6,7,8,9,10,11]
AdamicAdarFunction(users, test_data, 5)

[6, 7, 11, 1, 3, 4, 8, 9, 10]

In [12]:
##### Test the code for the Original Dataset
users=list(range(0,4038))

print("The suggestions of 107 are:",AdamicAdarFunction(users,data,107))
print("The suggestions of 1126 are:",AdamicAdarFunction(users,data,1126))
print("The suggestions of 14 are:",AdamicAdarFunction(users,data,14))
print("The suggestions of 35 are:",AdamicAdarFunction(users,data,35))

The suggestions of 107 are: [513, 400, 559, 500, 492, 373, 378, 436, 524, 514]
The suggestions of 1126 are: [916, 1238, 1750, 1230, 1004, 1791, 1530, 1172, 1570, 1597]
The suggestions of 14 are: [2, 17, 140, 111, 162, 137, 333, 19, 44, 243]
The suggestions of 35 are: [46, 68, 99, 131, 175, 177, 225, 227, 278, 321]


### Recommending Friends with Leicht-Holme-Newman Index (bonus method)

In [32]:
##### Create the function for Leicht Holme Newman method
def LeichtHolmeNewman(users, dataset, target):
    #Initialize
    l=list()
    friendships={}

    #Create friendships dict
    for node in users:
        #Create a list with the friends of node
        ls=dataset[dataset.node1 == node]['node2'].tolist()

        #Create a dictionary with key the node and value the list
        friendships[node]=ls

    # Initialize a dictionary with the intersections
    inter={}

    #Intersection between users
    for j in friendships:
        if (target != j) and (target not in friendships[j]) :
            intersection=(len(set(friendships.get(target)).intersection(set(friendships.get(j)))))
            
            #Calculate the k for j and target
            k1=len(friendships.get(j))
            k2=len(friendships.get(target))
            
            if (k1 !=0 and k2 !=0):
                #Store the intersection in the list inter[]
                inter[j]=intersection/(k1*k2)
   
    #Create a sorted list, in ties we take the smallest ID
    lis=sorted(inter, key=inter.get, reverse=True)

    #Final Result
    return(lis[0:10]);

In [20]:
##### Test the code for the Sample
users=[1,2,3,4,5,6,7,8,9,10,11]
LeichtHolmeNewman(users, test_data, 5)

[1, 11, 6, 9, 7, 3, 4, 8]

In [14]:
##### Test the code for the Original Dataset
users=list(range(0,4038))
print("The suggestions of 107 are:",LeichtHolmeNewman(users, data,107))
print("The suggestions of 1126 are:",LeichtHolmeNewman(users, data,1126))
print("The suggestions of 14 are:",LeichtHolmeNewman(users, data,14))
print("The suggestions of 35 are:",LeichtHolmeNewman(users, data,35))

The suggestions of 107 are: [11, 12, 15, 18, 37, 43, 74, 114, 209, 210]
The suggestions of 1126 are: [911, 918, 1096, 1119, 1145, 1206, 1262, 1386, 1395, 1466]
The suggestions of 14 are: [11, 12, 15, 18, 37, 43, 74, 114, 209, 210]
The suggestions of 35 are: [11, 12, 15, 18, 37, 43, 74, 114, 209, 210]


### Evaluation of the recommendation system

In [110]:
# Create users list
users=list(range(0,4038))

#Initialization
s1=[]
s2=[]
s3=[]

for i in list(range(100,4100,100)):
    
    #Run the functions
    fofList=friendOfFriend(users, data,i)
    JaccardList=JaccardCoefficient(users, data,i)
    AdamicAdarList=AdamicAdarFunction(users, data,i)
    
    #Similarity Percentage of FoF and Jaccard
    s1.append(len(set(fofList).intersection(set(JaccardList)))*10)
    
    #Similarity Percentage of FoF and Adamic and Adar
    s2.append(len(set(fofList).intersection(set(AdamicAdarList)))*10)
    
    #Similarity Percentage of Jaccard and Adamic and Adar
    s3.append(len(set(AdamicAdarList).intersection(set(JaccardList)))*10)

#Average Similarity (%)
print("The average similarity of FoF & Jaccard is:",np.mean(s1),"%")
print("The average similarity of FoF & Adamic Adar is:",np.mean(s2),"%")
print("The average similarity of Adamic Adar & Jaccard is:",np.mean(s3),"%")

The average similarity of FoF & Jaccard is: 55.5 %
The average similarity of FoF & Adamic Adar is: 90.75 %
The average similarity of Adamic Adar & Jaccard is: 57.0 %


### Forecast Recommendations

#### Evaluation Function
In this stage we have to estimate the quality of the recommendation methods. We create a function(evaluationFunction()) which computes the strength of the connection between two nodes. In more details, we insert two already friends of our network and the function removes this connection from the dataset. After the connection drops, the algorythm searches for every method if one of the two nodes (ex. F1) exists in the list of the second node (ex. F2). We do the same process in both F1 and F2. Also, we would like to mention if a node doesn't exist in the recommendation list of the other node we exclude this relationship.

#### Score Calculation
The score for each algorythm is calculated according to the position of the list.
Also, we take the average value of the position for both F1 and F2. The higher the score is, the higher the quality of the algorythm.


In [34]:
#Choose a pair F1 AND F2

def evaluationFunction(dataset,users,F1,F2):

    ####Remove the relationship
    
    #First we find the connection F1-F2
    l1=dataset[dataset.node2 == F1].index
    l2=dataset[dataset.node1 == F2 ].index
    rm1=set(l1).intersection(set(l2))
    
    #Then we find the connection F2-F1
    l1=dataset[dataset.node2 == F2 ].index
    l2=dataset[dataset.node1 == F1].index
    rm2=set(l1).intersection(set(l2))

    #We create the union
    rm=rm1.union(rm2)
    
    #Remove the elements of the set rm
    for i in rm:
        dataset=dataset.drop(i)
    
    ###FoF (friend-of-friend)
    if ((F1 in friendOfFriend(users, dataset,F2)) and (F2 in friendOfFriend(users, dataset,F1))): 
        
        #Compute the recommendations for F1
        Friend1=10 - friendOfFriend(users, dataset,F1).index(F2)

        #Compute the resommentdations for F2
        Friend2=10 - friendOfFriend(users, dataset,F2).index(F1)


        ####Compute the score
        scoreFoF=(Friend1+Friend2)/2
        
    else:
        scoreFoF= 0
    
    ###Jaccard
    if ((F1 in JaccardCoefficient(users, dataset,F2)) and (F2 in JaccardCoefficient(users, dataset,F1))): 
        
        #Compute the recommendations for F1
        Friend1=10 - JaccardCoefficient(users, dataset,F1).index(F2)

        #Compute the resommentdations for F2
        Friend2=10 - JaccardCoefficient(users, dataset,F2).index(F1)

        ####Check if either of these does not exist

        ####Compute the score
        scoreJaccard=(Friend1+Friend2)/2
        
    else:
        scoreJaccard = 0
    
    ###AdamicAdar
    if ((F1 in AdamicAdarFunction(users, dataset,F2)) and (F2 in AdamicAdarFunction(users, dataset,F1))):
        
        #Compute the recommendations for F1
        Friend1=10 - AdamicAdarFunction(users, dataset,F1).index(F2)

        #Compute the resommentdations for F2
        Friend2=10 - AdamicAdarFunction(users, dataset,F2).index(F1)

        ####Check if either of these does not exist

        ####Compute the score
        scoreAdamicAdar=(Friend1+Friend2)/2
        
    else:
        scoreAdamicAdar = 0 
          
 
    return(scoreFoF,scoreJaccard,scoreAdamicAdar);

#### Iteration Function

In order to have more accurate results we should run the algorythm more than once. So, we create the algorythm (finalscore()) in order to recall the evaluation function many times. Speciffically, this function takes a random index from the original dataset. After we call the evaluation function in order to create a score for each relationship; the outputs of the evaluation function are stored in a list. Finally, after all that loops, we calculate the average score for each method.

In [40]:
#Function for iterations
def finalScore(dataset,users, n):
    eval_scores=[]
    for l in list(range(0,n)):
        
        #Random F1-F2 Relationship
        index=sample(range(len(dataset)),1)[0]
        friend1=dataset.iloc[index].node1
        friend2=dataset.iloc[index].node2
        
        evaluationOutput=evaluationFunction(dataset,users,friend1,friend2)
        
        if (evaluationOutput!= None):
            eval_scores.append(evaluationFunction(dataset,users,friend1,friend2))
        
    #Final Score
    if eval_scores != [] :
        scores=eval_scores      
        
    return(scores);       

In [43]:
#Create the scores for the test_data (for 100 iterations)
users=[1,2,3,4,5,6,7,8,9,10,11]

#We create a variable in order to call once the function and then print the scores
score=finalScore(test_data,users,100)

#Create a score table
score_table=pd.DataFrame(score)

#Add column names
score_table.columns = ["FoF Score", "Jaccard Score", "Adamic Adar Score"]

In [44]:
users=list(range(0,4038))

#We create a variable in order to call once the function and then print the scores
score=finalScore(data,users,100)

#Create a score table
score_table=pd.DataFrame(score)

#Add column names
score_table.columns = ["FoF_Score", "Jaccard_Score", "Adamic_Adar_Score"]

# Final Print
print(score_table)

    FoF Score  Jaccard Score  Adamic Adar Score
0         9.5            8.5                9.5
1         0.0            6.0                0.0
2         6.0            7.5                6.0
3        10.0           10.0               10.0
4        10.0            0.0               10.0
5         0.0            0.0                0.0
6         9.5            0.0               10.0
7         9.0           10.0                9.0
8         8.0            9.5                8.0
9         0.0            8.5                0.0
10       10.0           10.0               10.0
11        7.5            9.0                7.5
12       10.0           10.0               10.0
13        9.0            8.0                9.0
14        8.0            8.5                8.0
15        0.0            0.0                0.0
16        9.0            8.0                9.0
17        0.0            0.0                0.0
18        4.5            0.0                4.5
19        0.0            0.0            

#### Note
In the table we have zero when a node doesn't identify in the suggestion list so we have to exclude the zeros for each method and calculate the average score for each one

### Average score

In [127]:
#Remove the zeros
score1=score_table.FoF_Score[score_table.FoF_Score !=0].mean()
score2=score_table.Jaccard_Score[score_table.Jaccard_Score !=0].mean()
score3=score_table.Adamic_Adar_Score[score_table.Adamic_Adar_Score !=0].mean()

#Final output
print("FoF score is: ", score1,"/10")
print("Jaccard score is: ", score2,"/10")
print("Adamic Adar score is: ", score3,"/10")

FoF score is:  8.12295081967213 /10
Jaccard score is:  8.356060606060606 /10
Adamic Adar score is:  8.15079365079365 /10


#### Comment the Score outputs

As we can see the scoring of all functions is very close to each other but idependent of how many times we run the algorythm the sorting of the scores is the same. In other words, the Adamic Adar method has the higher score in all times, after Friend of Friend(FoF) method and after we have Jaccard method. In our view FoF method is a more generalized method that has decent results, if we take into consideration the calculation time. On the other hand the Adamic Adar method penalizes more the users that have a lot of friends than users that have less friends. This techique can better describes social network connections, and can have better results because is much closer on how people make a friend in a social network. The Jaccard distance is the less accurate metric for a reccomendation application.

## 2nd Evaluation System

We crete an alternative evaluation system in order to incease the accuracy.Specifficaly, we take a target node and we remove one by one all the friends of him. Hence, we calculate the number of the friends that suggested and then we divide it with the total initial number of friends. This computations are hard intensive, so we do this procedure for 2-3 different targets.

In [128]:
#Function for iterations
def finalScore2(dataset,users, n):
    
    eval_scores=[]
    
    #Create friendships dict
    friendships={}
    for node in users:
        #Create a list with the friends of node
        ls=dataset[dataset.node1 == node]['node2'].tolist()

        #Create a dictionary with key the node and value the list
        friendships[node]=ls

    for l in list(range(0,n)):
        
        target=sample(users,1)[0] #take 1 random user
        
        #Initialize the scores
        scorefof = 0
        scoreJaccard = 0
        scoreAdamicAdar = 0
        friends=[]
        
        for i in friendships.get(target):
            friends.append([target,i])
            
        for i in friends:
            evaluationOutput=evaluationFunction(dataset,users,i[0],i[1])

            if (evaluationOutput!= None):
                if evaluationOutput[0]!= 0:
                    scorefof +=1
                if evaluationOutput[1]!= 0:
                    scoreJaccard +=1
                if evaluationOutput[2]!= 0:    
                    scoreAdamicAdar +=1
        
        if len(friends) !=0 :
            eval_scores.append([scorefof/len(friends),scoreJaccard/len(friends),scoreAdamicAdar/len(friends)])
            
        #Final Print
        if eval_scores != [] :
            #Calculate the average value
            scores=np.mean(eval_scores,axis=0)
        
        
    return([round(scores[0],3),round(scores[1],3),round(scores[2],3)]);  

#### Final Score

In [132]:
#We create a variable in order to call once the function and then print the scores
score2=finalScore2(data,users,2)

print("FoF score is: ", score2[0],"/10")
print("Jaccard score is: ", score2[1],"/10")
print("Adamic Adar score is: ", score2[2],"/10")

FoF score is:  0.892 /10
Jaccard score is:  0.824 /10
Adamic Adar score is:  0.882 /10
