# App Implementation

# Importing Libraries and Credentials

In [3]:
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience
import numpy as np
import pandas as pd

from creds import user, pwd, uri, user_local, pwd_local, local_uri

import warnings 
warnings.simplefilter('ignore')

# Driver Creation

In [2]:
driver = GraphDatabase.driver( 
    uri=uri,
    auth=(user,pwd) 
)

Testing the connection

In [3]:
test = driver.execute_query('''
    match (p:Player {name:'Paulo Dybala'})-[:PLAYED_FOR]->(s)
    return s.name
''')

In [12]:
[r.data()['s.name'] for r in test[0]]

['Juventus', 'Juventus', 'Roma', 'Roma', 'Roma']

# Similarity Algorithm  

Running the Node Similarity algorithm from GDS to find how much two nodes are similar.

### Native Projection Creation  

The projection will be created with the GDS python library

In [4]:
gds = GraphDataScience( 
    endpoint=local_uri,
    auth=(user_local,pwd_local)
)

driver_local = GraphDatabase.driver( 
    uri=local_uri,
    auth=(user_local,pwd_local) 
)

Test for local driver connection

In [10]:
dybala_teams = driver_local.execute_query('''
match (p:Player {name:'Paulo Dybala'})-[:PLAYED_FOR]-(o)
return o.name
''')

In [21]:
[team.values()[0] for team in dybala_teams.records]

['Juventus', 'Juventus', 'Roma', 'Roma', 'Roma']

## Using the algorithm

### Creating the Projection

In [42]:
# Getting all node labels and relationship types

labels = driver_local.execute_query('''
match (n)
return distinct(labels(n))
''')

relationships = driver_local.execute_query('''
match ()-[r]-()
return distinct(type(r))
''')

In [49]:
# Putting values in a list
lab_values = [l.value()[0] for l in labels.records] 
rel_values = [r.value() for r in relationships.records]

In [51]:
print(f"Node labels: {lab_values}\nRelationship types: {rel_values}")

Node labels: ['Player', 'Nation', 'Position', 'Team_2020_2021', 'Team_2021_2022', 'Team_2022_2023', 'Team_2023_2024', 'Team_2024_2025']
Relationship types: ['IS_NATION', 'POSITION_PLAYED', 'PLAYED_FOR']


In [52]:
# Creating the projection
G = gds.graph.project(
    graph_name='guesstheplayergame',
    node_spec=lab_values,  
    relationship_spec=rel_values)

In [64]:
# Creating the graph object
graph = gds.graph.get('guesstheplayergame')

In [99]:
result = gds.nodeSimilarity.stream(graph, similarityCutoff=0, topK=int(1e6))

Results are provided as node ids. They will be converted into player name.

In [107]:
cr7 = gds.util.asNode(0)

In [108]:
list(cr7.values())[0]

'Cristiano Ronaldo'

Getting players names

In [113]:
player_id = list(result.node1.unique())

In [115]:
player_names = gds.util.asNodes(player_id)

In [121]:
list(player_names[0].values())[0]

['Cristiano Ronaldo', 1]

In [122]:
# Mapping
map_dict = {}

for id, name in zip(player_id, player_names):
    map_dict[id] = list(name.values())[0]

In [125]:
result.node1.map(map_dict)

0          Cristiano Ronaldo
1          Cristiano Ronaldo
2          Cristiano Ronaldo
3          Cristiano Ronaldo
4          Cristiano Ronaldo
                 ...        
1384147           Paul Pogba
1384148           Paul Pogba
1384149           Paul Pogba
1384150           Paul Pogba
1384151           Paul Pogba
Name: node1, Length: 1384152, dtype: object

In [126]:
# Adding columns to the df
result['node1_name'] = result.node1.map(map_dict) 
result['node2_name'] = result.node2.map(map_dict)

In [127]:
result

Unnamed: 0,node1,node2,similarity,node1_name,node2_name
0,0,769,0.500000,Cristiano Ronaldo,Francisco Conceição
1,0,468,0.400000,Cristiano Ronaldo,João Moutinho
2,0,585,0.400000,Cristiano Ronaldo,Rúben Vinagre
3,0,263,0.400000,Cristiano Ronaldo,Nani
4,0,200,0.333333,Cristiano Ronaldo,Dejan Kulusevski
...,...,...,...,...,...
1384147,1176,15,0.000000,Paul Pogba,Nicolas Nkoulou
1384148,1176,8,0.000000,Paul Pogba,Mateo Musacchio
1384149,1176,7,0.000000,Paul Pogba,Ashley Young
1384150,1176,5,0.000000,Paul Pogba,Achraf Hakimi


The last steps include: rounding to three decimals and computing -0.001 in order not to have similarity = 1 for two players.

In [132]:
result['rounded_similarity'] = np.round(result.similarity,3).apply(lambda x: x-.001 if x > 0 else(x))

Storing results

In [135]:
result.to_csv('similarity_scores.csv', index=None)

Sampling a random player

In [4]:
df = pd.read_csv('similarity_scores.csv')

In [12]:
df.node1_name.sample(1).values[0]

'Remo Freuler'

Getting similarity values

In [21]:
df[(df.node1_name == 'Remo Freuler') & (df.node2_name == 'Paulo Dybala')].rounded_similarity.values[0]

0.076

In [27]:
df.query("node1_name == 'Matías Viña'").rounded_similarity.max()

0.499

Many players have small values as their max value. I should use the Min Max Scaler formula to normalize them.

In [29]:
df.groupby('node1_name', as_index=False).similarity.agg(['min', 'max'])

Unnamed: 0,node1_name,min,max
0,Aaron Hickey,0.0,0.600000
1,Aaron Ramsey,0.0,0.500000
2,Aarón Martín,0.0,0.600000
3,Abdelhamid Sabiri,0.0,0.444444
4,Abdou Harroui,0.0,0.400000
...,...,...,...
1172,Ángel Di María,0.0,0.600000
1173,Éderson,0.0,0.571429
1174,Þórir Jóhann Helgason,0.0,0.571429
1175,Łukasz Skorupski,0.0,0.625000


In [38]:
ederson = df[df.node1_name == 'Éderson']

ederson['min_max_scaled_similarity'] = (ederson.rounded_similarity-ederson.rounded_similarity.min())/(ederson.rounded_similarity.max()-ederson.rounded_similarity.min())
ederson['min_max_scaled_similarity_rounded'] = ederson.min_max_scaled_similarity.apply(
    lambda x: np.round(x,3)-.001 if x>0 else(0))

In [39]:
ederson

Unnamed: 0,node1,node2,similarity,node1_name,node2_name,rounded_similarity,min_max_scaled_similarity,min_max_scaled_similarity_rounded
994896,846,857,0.571429,Éderson,Ademola Lookman,0.570,1.000000,0.999
994897,846,1012,0.500000,Éderson,Matteo Ruggeri,0.499,0.875439,0.874
994898,846,903,0.444444,Éderson,Marten de Roon,0.443,0.777193,0.776
994899,846,902,0.444444,Éderson,Mario Pašalić,0.443,0.777193,0.776
994900,846,1086,0.428571,Éderson,Giorgio Scalvini,0.428,0.750877,0.750
...,...,...,...,...,...,...,...,...
996067,846,7,0.000000,Éderson,Ashley Young,0.000,0.000000,0.000
996068,846,5,0.000000,Éderson,Achraf Hakimi,0.000,0.000000,0.000
996069,846,2,0.000000,Éderson,Sami Khedira,0.000,0.000000,0.000
996070,846,1,0.000000,Éderson,Gianluigi Donnarumma,0.000,0.000000,0.000


In [41]:
df['min_max_scaled'] = df.groupby('node1_name', as_index=False).similarity.transform(lambda x: (x-x.min())/(x.max()-x.min()))

In [43]:
df['rounded_min_max_similarity'] = df.min_max_scaled.apply(lambda x: np.round(x,3)-.001 if x>0 else(0))

In [49]:
df.groupby('node1_name', as_index=False).rounded_min_max_similarity.agg(['min', 'max'])

Unnamed: 0,node1_name,min,max
0,Aaron Hickey,0.0,0.999
1,Aaron Ramsey,0.0,0.999
2,Aarón Martín,0.0,0.999
3,Abdelhamid Sabiri,0.0,0.999
4,Abdou Harroui,0.0,0.999
...,...,...,...
1172,Ángel Di María,0.0,0.999
1173,Éderson,0.0,0.999
1174,Þórir Jóhann Helgason,0.0,0.999
1175,Łukasz Skorupski,0.0,0.999


Storing

In [57]:
df.query("node1_name == 'Tammy Abraham'").query('min_max_scaled == 1')

Unnamed: 0,node1,node2,similarity,node1_name,node2_name,rounded_similarity,min_max_scaled,rounded_min_max_similarity
866712,737,535,0.5,Tammy Abraham,Chris Smalling,0.499,1.0,0.999


In [50]:
df.to_csv('similarity_scores.csv', index=None)