# Using Neo4j graph data science package

In [1]:
from graphdatascience import GraphDataScience
import pandas as pd
import os
from pprint import pprint 

In [2]:
NEO4J_URI = "bolt://localhost:7687"
gds = GraphDataScience(NEO4J_URI, auth=("neo4j","123"))

# connect with AuraDS
# Configure the driver with AuraDS-recommended settings
# gds = GraphDataScience("neo4j+s://my-aura-ds.databases.neo4j.io:7687", auth=("neo4j", "my-password"), aura_ds=True)



In [3]:
print(gds.version())

2.7.0


In [4]:
tech_df = pd.read_excel(r"C:\\Users\25335\Downloads\neo4j_astoria\data\Technology Skills.xlsx")
skills_df = pd.read_excel(r"C:\Users\25335\Downloads\neo4j_astoria\data\Skills.xlsx")
knowledge_df = pd.read_excel(r"C:\Users\25335\Downloads\neo4j_astoria\data\Knowledge.xlsx")
related_title_df = pd.read_excel(r"C:\Users\25335\Downloads\neo4j_astoria\data\Related Occupations.xlsx")

skills_df.rename(columns = {'Element Name':'Skill'}, inplace = True)
knowledge_df.rename(columns = {'Element Name':'Knowledge'}, inplace = True)

In [5]:
tech_example_listed = tech_df.groupby(["Title"])["Example"].apply(lambda x: [item.lower() for item in list(set(x))]).reset_index("Title")
tech_HOT_listed = tech_df.groupby(["Title"])["Hot Technology"].apply(list).reset_index("Title")

data_value_threshhold = skills_df["Data Value"] > 4.5
skill_listed = skills_df[data_value_threshhold].groupby(["Title"])["Skill"].apply(lambda x: [item.lower() for item in list(set(x))]).reset_index("Title")
knowledge_listed = knowledge_df[data_value_threshhold].groupby(["Title"])["Knowledge"].apply(lambda x: [item.lower() for item in list(set(x))]).reset_index("Title")
related_listed = related_title_df.groupby(["Title"])["Related Title"].apply(lambda x: [item.lower() for item in list(set(x))]).reset_index("Title")

merged_df = tech_example_listed.merge(tech_HOT_listed, on="Title", how="left")
merged_df = merged_df.merge(skill_listed, on="Title", how="left")
merged_df = merged_df.merge(knowledge_listed, on="Title", how="left")
merged_df = merged_df.merge(related_listed, on="Title", how="left")

merged_df.fillna("NA", inplace=True)

  knowledge_listed = knowledge_df[data_value_threshhold].groupby(["Title"])["Knowledge"].apply(lambda x: [item.lower() for item in list(set(x))]).reset_index("Title")


In [6]:
# some more preprocessing
merged_df = merged_df.drop_duplicates(subset='Title', keep='first')
merged_df = merged_df.drop_duplicates(subset='Related Title', keep='first')
merged_df['Title'] = merged_df['Title'].str.lower()

merged_df.sample(5)

Unnamed: 0,Title,Example,Hot Technology,Skill,Knowledge,Related Title
186,correspondence clerks,"[data entry software, sap, microsoft word, mic...","[Y, N, N, Y, Y, Y, Y, Y, Y]",,,"[interviewers, except eligibility and loan, re..."
412,helpers--electricians,"[recordkeeping software, computer-aided drafti...","[Y, Y, Y, N, N]",,,"[boilermakers, electricians, electrical power-..."
221,dermatologists,"[ge healthcare centricity practice solution, e...","[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",[reading comprehension],[clerical],"[cardiologists, family medicine physicians, em..."
721,recreational vehicle service technicians,"[email software, microsoft word, inventory tra...","[N, N, Y, Y, N, N, N]",,,"[mobile heavy equipment mechanics, except engi..."
737,retail salespersons,[advanced retail management systems retail pro...,"[Y, Y, N, Y, Y, Y, N, N, Y, N, N, Y, N, N, N, ...",,,"[door-to-door sales workers, news and street v..."


In [7]:
for index, row in merged_df.iterrows():

        tech_list = row["Example"]
        skill_list = row["Skill"]
        knowledge_list = row["Knowledge"]
        hot_tech_list = row["Hot Technology"]
        related_list = row["Related Title"]

        if tech_list is not None:

          for i in range(len(tech_list)):

            #add tech skills

            row_level_query1 = """
            MERGE (title:Title {name: $title_name})

            MERGE (tech:TechSkill {name: $tech_name, hot: $hot_bool})

            MERGE (title)-[:HAS_TECH_SKILL]->(tech)
            """
            gds.run_cypher(row_level_query1, 
                           params={"title_name":row["Title"],
                                   "tech_name":tech_list[i],
                                   "hot_bool":hot_tech_list[i]               
            })

        #add skills and knowledge
        if skill_list != "NA" and skill_list is not None:

          for j in range(len(skill_list)):
            row_level_query2 = """
            MERGE (title:Title {name: $title_name})
            MERGE (skill:Skill {name: $skill_name})
            MERGE (title)-[:HAS_SKILL]->(skill)
            """
            gds.run_cypher(row_level_query2, 
                           params={"title_name":row["Title"],
                                  "skill_name":tech_list[j]               
            })


        # add knowledge and skill
        if knowledge_list != "NA" and knowledge_list is not None:
          for k in range(len(knowledge_list)):
            #add skills and knowledge
            row_level_query3 = """
            MERGE (title:Title {name: $title_name})
            MERGE (knowledge:Knowledge {name: $knowledge_name})
            MERGE (title)-[:HAS_KNOWLEDGE]->(knowledge)
        """
            gds.run_cypher(row_level_query3, 
                           params={"title_name":row["Title"],
                                  "knowledge_name":knowledge_list[k]              
            })

        #add related occupations
        if related_list != "NA" and related_list is not None:
          for l in range(len(related_list)):
            row_level_query4 = """
            MERGE (title:Title {name: $title_name})
            MERGE (related:Title {name: $related_name})
            MERGE (title)-[:HAS_RELATION]-(related)
            """
            gds.run_cypher(row_level_query4, 
                           params={"title_name":row["Title"],
                                  "related_name":related_list[l]              
            })

        print(f"{index} out of {len(merged_df)} DONE")

0 out of 902 DONE
1 out of 902 DONE
2 out of 902 DONE
3 out of 902 DONE
4 out of 902 DONE
5 out of 902 DONE
6 out of 902 DONE
7 out of 902 DONE
8 out of 902 DONE
9 out of 902 DONE
10 out of 902 DONE
11 out of 902 DONE
12 out of 902 DONE
13 out of 902 DONE
14 out of 902 DONE
15 out of 902 DONE
16 out of 902 DONE
17 out of 902 DONE
18 out of 902 DONE
19 out of 902 DONE
20 out of 902 DONE
21 out of 902 DONE
22 out of 902 DONE
23 out of 902 DONE
24 out of 902 DONE
25 out of 902 DONE
26 out of 902 DONE
27 out of 902 DONE
28 out of 902 DONE
29 out of 902 DONE
30 out of 902 DONE
31 out of 902 DONE
32 out of 902 DONE
33 out of 902 DONE
34 out of 902 DONE
35 out of 902 DONE
36 out of 902 DONE
37 out of 902 DONE
38 out of 902 DONE
39 out of 902 DONE
40 out of 902 DONE
41 out of 902 DONE
42 out of 902 DONE
43 out of 902 DONE
44 out of 902 DONE
45 out of 902 DONE
46 out of 902 DONE
47 out of 902 DONE
48 out of 902 DONE
49 out of 902 DONE
50 out of 902 DONE
51 out of 902 DONE
52 out of 902 DONE
53 

## Centrality measure

In [8]:
G, result = gds.graph.project("relation_test", "Title", "HAS_RELATION")

print(f"The projection took {result['projectMillis']} ms")

# We can use convenience methods on `G` to check if the projection looks correct
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")
print(f"Graph '{G.name()}' relationship count: {G.relationship_count()}")



The projection took 24 ms
Graph 'relation_test' node count: 923
Graph 'relation_test' node labels: ['Title']
Graph 'relation_test' relationship count: 13080


In [9]:
eigenvector_centrality_result = gds.eigenvector.mutate(G, maxIterations=100, mutateProperty="eigenvectorCentrality")

In [10]:
eigenvector_centrality_result

mutateMillis                                                              0
nodePropertiesWritten                                                   923
ranIterations                                                           100
didConverge                                                           False
centralityDistribution    {'min': 0.0, 'max': 0.46537971496582026, 'p90'...
postProcessingMillis                                                    142
preProcessingMillis                                                       0
computeMillis                                                           106
configuration             {'mutateProperty': 'eigenvectorCentrality', 'j...
Name: 0, dtype: object

In [11]:
eigenvector_centrality_result.centralityDistribution

{'min': 0.0,
 'max': 0.46537971496582026,
 'p90': 0.01036310195922674,
 'p999': 0.46537971496581854,
 'p99': 0.19938755035400213,
 'p50': 0.0001336019486171125,
 'p75': 0.001373641192911279,
 'p95': 0.032069683074949396,
 'mean': 0.007387174331791837}

In [12]:
gds.graph.nodeProperties.write(G, ["eigenvectorCentrality"])

writeMillis                               59
graphName                      relation_test
nodeProperties       [eigenvectorCentrality]
propertiesWritten                        923
Name: 0, dtype: object

In [13]:
pd.set_option('display.max_colwidth', None)

def display_top_20(centrality_measure):
    """
    Function to execute the Cypher query to retrieve the top 20 cities with the highest centrality measure.
    """
    query = f"""
    MATCH (n:Title)
    WHERE n.{centrality_measure} IS NOT NULL
    RETURN n.name AS name, n.{centrality_measure} AS {centrality_measure}
    ORDER BY n.{centrality_measure} DESC
    LIMIT 20
    """
    result = gds.run_cypher(query)
    return result

    # Display the result
    #for res in result:
        #pprint(res["name"])


display_top_20("eigenvectorCentrality").head(20)

Unnamed: 0,name,eigenvectorCentrality
0,"woodworking machine setters, operators, and tenders, except sawing",0.46538
1,"rolling machine setters, operators, and tenders, metal and plastic",0.288922
2,"tool grinders, filers, and sharpeners",0.282746
3,industrial machinery mechanics,0.255133
4,"welding, soldering, and brazing machine setters, operators, and tenders",0.239907
5,tool and die makers,0.232556
6,"grinding and polishing workers, hand",0.213068
7,"multiple machine tool setters, operators, and tenders, metal and plastic",0.208561
8,"paper goods machine setters, operators, and tenders",0.20776
9,machine feeders and offbearers,0.199387


## community detection

In [14]:
# view graph data in pandas
# gds.graph.list()

### weakly connected components

In [15]:

result = gds.wcc.mutate(G, mutateProperty="componentId")

print(f"Components found: {result.componentCount}")

Components found: 1


In [17]:
query = """
    CALL gds.graph.nodeProperties.stream('relation_test', 'componentId')
    YIELD nodeId, propertyValue
    WITH gds.util.asNode(nodeId).name AS node, propertyValue AS componentId
    WITH componentId, collect(node) AS group
    WITH componentId, group, size(group) AS componentSize
    RETURN componentId, componentSize, group
    ORDER BY componentSize DESC
"""

components = gds.run_cypher(query)
components

Unnamed: 0,componentId,componentSize,group
0,0,923,"[accountants and auditors, billing and posting clerks, sales representatives of services, except advertising, insurance, financial services, and travel, financial and investment analysts, statistical assistants, credit authorizers, checkers, and clerks, compensation, benefits, and job analysis specialists, credit analysts, management analysts, financial managers, tax preparers, personal financial advisors, budget analysts, payroll and timekeeping clerks, financial examiners, first-line supervisors of office and administrative support workers, brokerage clerks, tax examiners and collectors, and revenue agents, financial risk specialists, treasurers and controllers, bookkeeping, accounting, and auditing clerks, actors, writers and authors, news analysts, reporters, and journalists, agents and business managers of artists, performers, and athletes, dancers, costume attendants, models, art, drama, and music teachers, postsecondary, poets, lyricists and creative writers, special effects artists and animators, self-enrichment teachers, art directors, music directors and composers, makeup artists, theatrical and performance, producers and directors, choreographers, fine artists, including painters, sculptors, and illustrators, film and video editors, musicians and singers, broadcast announcers and radio disc jockeys, talent directors, actuaries, financial quantitative analysts, investment fund managers, insurance underwriters, compensation and benefits managers, business intelligence analysts, insurance sales agents, securities, commodities, and financial services sales agents, economists, acupuncturists, cardiologists, family medicine physicians, emergency medicine physicians, pediatricians, general, advanced practice psychiatric nurses, neurologists, orthopedic surgeons, except pediatric, chiropractors, physical medicine and rehabilitation physicians, dermatologists, massage therapists, general internal medicine physicians, psychiatrists, naturopathic physicians, recreational therapists, urologists, allergists and immunologists, pediatric surgeons, obstetricians and gynecologists, nurse practitioners, acute care nurses, paramedics, nursing assistants, respiratory therapists, registered nurses, emergency medical technicians, anesthesiologists, licensed practical and licensed vocational nurses, nurse midwives, clinical nurse specialists, physical therapists, critical care nurses, physician assistants, adapted physical education specialists, health specialties teachers, postsecondary, kindergarten teachers, except special education, special education teachers, elementary school, middle school teachers, except special and career/technical education, career/technical education teachers, postsecondary, recreation and fitness studies teachers, postsecondary, career/technical education teachers, middle school, adult basic education, adult secondary education, and english as a second language instructors, special education teachers, secondary school, career/technical education teachers, secondary school, special education teachers, preschool, school psychologists, teaching assistants, special education, secondary school teachers, except special and career/technical education, ...]"


In [18]:
largest_component = components["componentId"][0]

print(f"The largest component has the id {largest_component} with {components['componentSize'][0]} techs.")

The largest component has the id 0 with 923 techs.


In [19]:

largest_component_graph, _ = gds.beta.graph.project.subgraph(
    "largest_connected_components", G, f"n.componentId={largest_component}", "*"
)
largest_component_graph



Graph({'graphName': 'largest_connected_components', 'nodeCount': 923, 'relationshipCount': 13080, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'jobId': '553d9758-a18e-4035-aea3-b117cc58ec44', 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'sudo': False, 'relationshipProjection': {'HAS_RELATION': {'aggregation': 'DEFAULT', 'orientation': 'NATURAL', 'indexInverse': False, 'properties': {}, 'type': 'HAS_RELATION'}}, 'readConcurrency': 4, 'nodeProperties': {}, 'nodeProjection': {'Title': {'label': 'Title', 'properties': {}}}, 'logProgress': True, 'concurrency': 4, 'creationTime': neo4j.time.DateTime(2024, 7, 7, 2, 20, 52, 295717700, tzinfo=<UTC>), 'relationshipFilter': '*', 'parameters': {}}, 'schema': {'graphProperties': {}, 'nodes': {'Title': {'eigenvectorCentrality': 'Float (DefaultValue(NaN), TRANSIENT)', 'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}, 'relationships': {'HAS_RELATION': {}}}, 'memoryUsage': '437 KiB'}

### louvain

In [20]:
gds.louvain.mutate(largest_component_graph, mutateProperty="louvainCommunityId")

mutateMillis                                                                                                                                                                                                                                                                                                                                                               0
nodePropertiesWritten                                                                                                                                                                                                                                                                                                                                                    923
modularity                                                                                                                                                                                                                                                                    

In [21]:
gds.graph.nodeProperties.write(largest_component_graph, ["louvainCommunityId"])

writeMillis                                    82
graphName            largest_connected_components
nodeProperties               [louvainCommunityId]
propertiesWritten                             923
Name: 0, dtype: object

In [22]:
# check names in certain communities
gds.run_cypher(
    """
    MATCH (n) WHERE 'louvainCommunityId' IN keys(n) 
    RETURN n.name, n.louvainCommunityId LIMIT 10
    """
)

Unnamed: 0,n.name,n.louvainCommunityId
0,accountants and auditors,689
1,billing and posting clerks,689
2,"sales representatives of services, except advertising, insurance, financial services, and travel",689
3,financial and investment analysts,689
4,statistical assistants,798
5,"credit authorizers, checkers, and clerks",689
6,"compensation, benefits, and job analysis specialists",689
7,credit analysts,689
8,management analysts,798
9,financial managers,689


In [23]:
query = """
    CALL gds.graph.nodeProperties.stream('largest_connected_components', 'louvainCommunityId')
    YIELD nodeId, propertyValue
    WITH gds.util.asNode(nodeId).name AS node, propertyValue AS communityId
    WITH communityId, collect(node) AS title_community
    WITH communityId, title_community, size(title_community) AS communitySize
    RETURN communityId, communitySize, title_community
    ORDER BY communitySize DESC
"""

communities = gds.run_cypher(query)
communities

Unnamed: 0,communityId,communitySize,title_community
0,301,264,"[costume attendants, makeup artists, theatrical and performance, fine artists, including painters, sculptors, and illustrators, adhesive bonding machine operators and tenders, industrial machinery mechanics, textile cutting machine setters, operators, and tenders, shoe machine operators and tenders, woodworking machine setters, operators, and tenders, except sawing, cutting and slicing machine setters, operators, and tenders, milling and planing machine setters, operators, and tenders, metal and plastic, cutting, punching, and press machine setters, operators, and tenders, metal and plastic, crushing, grinding, and polishing machine setters, operators, and tenders, packaging and filling machine operators and tenders, machine feeders and offbearers, rolling machine setters, operators, and tenders, metal and plastic, multiple machine tool setters, operators, and tenders, metal and plastic, extruding and forming machine setters, operators, and tenders, synthetic and glass fibers, extruding and drawing machine setters, operators, and tenders, metal and plastic, grinding and polishing workers, hand, molders, shapers, and casters, except metal and plastic, molding, coremaking, and casting machine setters, operators, and tenders, metal and plastic, paper goods machine setters, operators, and tenders, grinding, lapping, polishing, and buffing machine tool setters, operators, and tenders, metal and plastic, extruding, forming, pressing, and compacting machine setters, operators, and tenders, aircraft structure, surfaces, rigging, and systems assemblers, aircraft mechanics and service technicians, mechanical engineers, farm equipment mechanics and service technicians, agricultural equipment operators, separating, filtering, clarifying, precipitating, and still machine setters, operators, and tenders, operating engineers and other construction equipment operators, industrial truck and tractor operators, farmworkers and laborers, crop, nursery, and greenhouse, pesticide handlers, sprayers, and applicators, vegetation, excavating and loading machine and dragline operators, surface mining, conveyor operators and tenders, maintenance workers, machinery, paving, surfacing, and tamping equipment operators, laborers and freight, stock, and material movers, hand, graders and sorters, agricultural products, food science technicians, weighers, measurers, checkers, and samplers, recordkeeping, inspectors, testers, sorters, samplers, and weighers, biofuels processing technicians, locomotive engineers, aircraft service attendants, railroad brake, signal, and switch operators and locomotive firers, tank car, truck, and ship loaders, stationary engineers and boiler operators, mobile heavy equipment mechanics, except engines, bus and truck mechanics and diesel engine specialists, electric motor, power tool, and related repairers, engine and other machine assemblers, control and valve installers and repairers, except mechanical door, millwrights, rail car repairers, automotive service technicians and mechanics, ship engineers, motorboat mechanics and service technicians, boilermakers, maintenance and repair workers, general, sheet metal workers, machinists, welding, soldering, and brazing machine setters, operators, and tenders, welders, cutters, solderers, and brazers, model makers, metal and plastic, electrical and electronic equipment assemblers, layout workers, metal and plastic, structural metal fabricators and fitters, electromechanical equipment assemblers, tool and die makers, sailors and marine oilers, motorboat operators, captains, mates, and pilots of water vessels, firefighters, hoist and winch operators, coin, vending, and amusement machine servicers and repairers, fishing and hunting workers, slaughterers and meat packers, meat, poultry, and fish cutters and trimmers, farmworkers, farm, ranch, and aquacultural animals, interior designers, commercial and industrial designers, electrical and electronics installers and repairers, transportation equipment, telecommunications line installers and repairers, automotive body and related repairers, upholsterers, coating, painting, and spraying machine setters, operators, and tenders, motorcycle mechanics, tire builders, automotive glass installers and repairers, tire repairers and changers, furniture finishers, glaziers, insulation workers, mechanical, insulation workers, floor, ceiling, and wall, painting, coating, and decorating workers, glass blowers, molders, benders, and finishers, home appliance repairers, cleaners of vehicles and equipment, ...]"
1,689,197,"[accountants and auditors, billing and posting clerks, sales representatives of services, except advertising, insurance, financial services, and travel, financial and investment analysts, credit authorizers, checkers, and clerks, compensation, benefits, and job analysis specialists, credit analysts, financial managers, tax preparers, personal financial advisors, budget analysts, payroll and timekeeping clerks, financial examiners, first-line supervisors of office and administrative support workers, brokerage clerks, tax examiners and collectors, and revenue agents, financial risk specialists, treasurers and controllers, bookkeeping, accounting, and auditing clerks, writers and authors, agents and business managers of artists, performers, and athletes, models, actuaries, investment fund managers, insurance underwriters, compensation and benefits managers, insurance sales agents, securities, commodities, and financial services sales agents, economists, administrative law judges, adjudicators, and hearing officers, claims adjusters, examiners, and investigators, labor relations specialists, police and sheriff's patrol officers, coroners, legal secretaries and administrative assistants, court reporters and simultaneous captioners, equal opportunity representatives and officers, detectives and criminal investigators, eligibility interviewers, government programs, chief executives, judges, magistrate judges, and magistrates, arbitrators, mediators, and conciliators, paralegals and legal assistants, lawyers, court, municipal, and license clerks, judicial law clerks, title examiners, abstractors, and searchers, compliance officers, administrative services managers, first-line supervisors of non-retail sales workers, executive secretaries and executive administrative assistants, project management specialists, secretaries and administrative assistants, except legal, medical, and executive, general and operations managers, office clerks, general, first-line supervisors of housekeeping and janitorial workers, human resources assistants, except payroll and timekeeping, facilities managers, advertising sales agents, door-to-door sales workers, news and street vendors, and related workers, sales managers, wholesale and retail buyers, except farm products, telemarketers, sales representatives, wholesale and manufacturing, except technical and scientific products, advertising and promotions managers, market research analysts and marketing specialists, online merchants, demonstrators and product promoters, counter and rental clerks, merchandise displayers and window trimmers, sales representatives, wholesale and manufacturing, technical and scientific products, marketing managers, retail salespersons, public relations specialists, real estate sales agents, fundraisers, fundraising managers, purchasing managers, meeting, convention, and event planners, media programming directors, industrial production managers, transportation inspectors, buyers and purchasing agents, farm products, first-line supervisors of farming, fishing, and forestry workers, air traffic controllers, airfield operations specialists, public safety telecommunicators, aircraft cargo handling supervisors, transportation, storage, and distribution managers, bus drivers, transit and intercity, flight attendants, airline pilots, copilots, and flight engineers, railroad conductors and yardmasters, subway and streetcar operators, dispatchers, except police, fire, and ambulance, first-line supervisors of material-moving machine and vehicle operators, commercial pilots, first-line supervisors of passenger attendants, cargo and freight agents, first-line supervisors of mechanics, installers, and repairers, ...]"
2,469,155,"[producers and directors, aerospace engineering and operations technologists and technicians, avionics technicians, robotics technicians, industrial engineering technologists and technicians, electro-mechanical and mechatronics technologists and technicians, mechatronics engineers, electrical and electronic engineering technologists and technicians, aerospace engineers, electronics engineers, except computer, medical equipment repairers, calibration technologists and technicians, electrical and electronics repairers, commercial and industrial equipment, robotics engineers, validation engineers, automotive engineering technicians, mechanical engineering technologists and technicians, photonics technicians, mechanical drafters, electrical engineers, automotive engineers, industrial engineers, marine engineers and naval architects, agricultural engineers, biofuels/biodiesel technology and product development managers, precision agriculture technicians, conservation scientists, geothermal production managers, industrial ecologists, civil engineers, biofuels production managers, agricultural technicians, forest and conservation technicians, environmental scientists and specialists, including health, environmental engineering technologists and technicians, water/wastewater engineers, farmers, ranchers, and other agricultural managers, water resource specialists, soil and plant scientists, environmental engineers, agricultural inspectors, environmental compliance inspectors, food scientists and technologists, environmental science and protection technicians, including health, animal scientists, construction and building inspectors, government property inspectors and investigators, transportation vehicle, equipment and systems inspectors, except aviation, health and safety engineers, except mining safety engineers and inspectors, aviation inspectors, natural sciences managers, chemists, chemical technicians, biological technicians, traffic technicians, power distributors and dispatchers, animal breeders, zoologists and wildlife biologists, fish and game wardens, biologists, geographers, geoscientists, except hydrologists and geographers, cost estimators, surveyors, architects, except landscape and naval, wind energy development managers, construction managers, landscape architects, architectural and civil drafters, solar energy installation managers, electrical and electronics drafters, sustainability specialists, solar energy systems engineers, architectural and engineering managers, civil engineering technologists and technicians, surveying and mapping technicians, computer numerically controlled tool programmers, manufacturing engineers, sales engineers, engineering teachers, postsecondary, logistics engineers, museum technicians and conservators, digital forensics analysts, photographers, set and exhibit designers, astronomers, hydrologists, physicists, nanotechnology engineering technologists and technicians, nanosystems engineers, atmospheric and space scientists, remote sensing technicians, geological technicians, except hydrologic technicians, climate change policy analysts, geodetic surveyors, hydrologic technicians, remote sensing scientists and technologists, audio and video technicians, camera and photographic equipment repairers, broadcast technicians, ...]"
3,914,152,"[acupuncturists, cardiologists, family medicine physicians, emergency medicine physicians, pediatricians, general, advanced practice psychiatric nurses, neurologists, orthopedic surgeons, except pediatric, chiropractors, physical medicine and rehabilitation physicians, dermatologists, massage therapists, general internal medicine physicians, psychiatrists, naturopathic physicians, recreational therapists, urologists, allergists and immunologists, pediatric surgeons, obstetricians and gynecologists, nurse practitioners, acute care nurses, paramedics, nursing assistants, respiratory therapists, registered nurses, emergency medical technicians, anesthesiologists, licensed practical and licensed vocational nurses, nurse midwives, clinical nurse specialists, physical therapists, critical care nurses, physician assistants, probation officers and correctional treatment specialists, medical and health services managers, psychiatric technicians, mental health counselors, occupational therapists, clinical neuropsychologists, psychiatric aides, neuropsychologists, occupational health and safety technicians, medical and clinical laboratory technicians, medical and clinical laboratory technologists, microbiologists, preventive medicine physicians, ophthalmologists, except pediatric, ambulance drivers and attendants, except emergency medical technicians, medical assistants, medical equipment preparers, home health aides, lifeguards, ski patrol, and other recreational protective service workers, surgical technologists, physical therapist aides, orderlies, personal care aides, surgical assistants, anesthesiologist assistants, cardiovascular technologists and technicians, nurse anesthetists, veterinarians, animal trainers, animal caretakers, veterinary technologists and technicians, veterinary assistants and laboratory animal caretakers, phlebotomists, child, family, and school social workers, first-line supervisors of personal service workers, biochemists and biophysicists, molecular and cellular biologists, geneticists, athletic trainers, exercise physiologists, exercise trainers and group fitness instructors, epidemiologists, art therapists, occupational therapy aides, physical therapist assistants, occupational therapy assistants, clinical and counseling psychologists, low vision therapists, orientation and mobility specialists, and vision rehabilitation therapists, rehabilitation counselors, music therapists, mental health and substance abuse social workers, marriage and family therapists, fitness and wellness coordinators, sports medicine physicians, recreation workers, dietitians and nutritionists, audiologists, speech-language pathologists, optometrists, hearing aid specialists, shampooers, skincare specialists, medical records specialists, histotechnologists, cytogenetic technologists, histology technicians, ...]"
4,675,83,"[dancers, art, drama, and music teachers, postsecondary, self-enrichment teachers, choreographers, musicians and singers, talent directors, adapted physical education specialists, health specialties teachers, postsecondary, kindergarten teachers, except special education, special education teachers, elementary school, middle school teachers, except special and career/technical education, career/technical education teachers, postsecondary, recreation and fitness studies teachers, postsecondary, career/technical education teachers, middle school, adult basic education, adult secondary education, and english as a second language instructors, special education teachers, secondary school, career/technical education teachers, secondary school, special education teachers, preschool, school psychologists, teaching assistants, special education, secondary school teachers, except special and career/technical education, elementary school teachers, except special education, special education teachers, middle school, tutors, special education teachers, kindergarten, instructional coordinators, human resources managers, human resources specialists, education teachers, postsecondary, teaching assistants, preschool, elementary, middle, and secondary school, except special education, english language and literature teachers, postsecondary, family and consumer sciences teachers, postsecondary, teaching assistants, postsecondary, preschool teachers, except special education, agricultural sciences teachers, postsecondary, farm and home management educators, environmental science teachers, postsecondary, chemistry teachers, postsecondary, economics teachers, postsecondary, biological science teachers, postsecondary, atmospheric, earth, marine, and space sciences teachers, postsecondary, forestry and conservation science teachers, postsecondary, athletes and sports competitors, childcare workers, coaches and scouts, training and development managers, training and development specialists, anthropologists and archeologists, political science teachers, postsecondary, sociology teachers, postsecondary, curators, social science research assistants, political scientists, anthropology and archeology teachers, postsecondary, geography teachers, postsecondary, history teachers, postsecondary, area, ethnic, and cultural studies teachers, postsecondary, historians, sociologists, park naturalists, philosophy and religion teachers, postsecondary, communications teachers, postsecondary, psychology teachers, postsecondary, social work teachers, postsecondary, architecture teachers, postsecondary, library science teachers, postsecondary, law teachers, postsecondary, foreign language and literature teachers, postsecondary, physics teachers, postsecondary, mathematical science teachers, postsecondary, disc jockeys, except radio, education and childcare administrators, preschool and daycare, education administrators, kindergarten through secondary, business teachers, postsecondary, education administrators, postsecondary, educational, guidance, and career counselors and advisors, directors, religious activities and education, criminal justice and law enforcement teachers, postsecondary, interpreters and translators, industrial-organizational psychologists, tour guides and escorts, entertainment and recreation managers, except gambling, substitute teachers, short-term]"
5,798,72,"[statistical assistants, management analysts, actors, news analysts, reporters, and journalists, poets, lyricists and creative writers, special effects artists and animators, art directors, music directors and composers, film and video editors, broadcast announcers and radio disc jockeys, financial quantitative analysts, business intelligence analysts, fraud examiners, investigators and analysts, information technology project managers, health information technologists and medical registrars, document management specialists, compliance managers, search marketing strategists, public relations managers, web administrators, software developers, occupational health and safety specialists, emergency management directors, archivists, data scientists, computer science teachers, postsecondary, library assistants, clerical, librarians and media collections specialists, editors, proofreaders and copy markers, library technicians, media technical directors/managers, graphic designers, computer and information research scientists, bioinformatics scientists, mathematicians, statisticians, bioinformatics technicians, geographic information systems technologists and technicians, clinical data managers, database administrators, database architects, computer systems analysts, operations research analysts, computer systems engineers/architects, computer network architects, computer network support specialists, business continuity planners, security management specialists, information security engineers, information security analysts, security managers, computer and information systems managers, regulatory affairs specialists, regulatory affairs managers, penetration testers, computer user support specialists, blockchain engineers, network and computer systems administrators, computer programmers, web and digital interface designers, web developers, data warehousing specialists, interviewers, except eligibility and loan, intelligence analysts, desktop publishers, technical writers, video game designers, retail loss prevention specialists, loss prevention managers, gambling surveillance officers and gambling investigators, survey researchers]"


In [24]:
G.drop()
gds.graph.drop('relation_test')



## similarity: FastRP(node embedding) + KNN

In [25]:
# We define how we want to project our database into GDS
node_projection = ["Title", "TechSkill"]
relationship_projection = {"HAS_TECH_SKILL": {"orientation": "UNDIRECTED"}}

# Before actually going through with the projection, let's check how much memory is required
result = gds.graph.project.estimate(node_projection, relationship_projection)

print(f"Required memory for native loading: {result['requiredMemory']}")

Required memory for native loading: 1462 KiB


In [26]:
G.drop()
gds.graph.drop('TechSkill')



In [27]:
G, result = gds.graph.project("TechSkill", node_projection, relationship_projection)

print(f"The projection took {result['projectMillis']} ms")

# We can use convenience methods on `G` to check if the projection looks correct
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")



The projection took 39 ms
Graph 'TechSkill' node count: 11032
Graph 'TechSkill' node labels: ['TechSkill', 'Title']


In [28]:
# We can also estimate memory of running algorithms like FastRP, so let's do that first
result = gds.fastRP.mutate.estimate(
    G,
    mutateProperty="embedding",
    randomSeed=42,
    embeddingDimension=20,
    iterationWeights=[0.8, 1, 1, 1],
)

print(f"Required memory for running FastRP: {result['requiredMemory']}")

Required memory for running FastRP: 3620 KiB


In [29]:
# Now let's run FastRP and mutate our projected graph 'purchases' with the results
result = gds.fastRP.mutate(
    G,
    mutateProperty="embedding",
    randomSeed=50,
    embeddingDimension=75,
    iterationWeights=[0.8, 1, 1, 1],
)

# Let's make sure we got an embedding for each node
print(f"Number of embedding vectors produced: {result['nodePropertiesWritten']}")

Number of embedding vectors produced: 11032


In [30]:
# Run kNN and write back to db (we skip memory estimation this time...)
result = gds.knn.write(
    G,
    topK=2,
    nodeProperties=["embedding"],
    randomSeed=42,
    concurrency=1,
    sampleRate=1.0,
    deltaThreshold=0.0,
    writeRelationshipType="SIMILAR",
    writeProperty="score",
)

print(f"Relationships produced: {result['relationshipsWritten']}")
print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")

Relationships produced: 22064
Nodes compared: 11032
Mean similarity: 0.9576618761486718


In [31]:
gds.run_cypher(
    """
        MATCH (p1:Title)-[r:SIMILAR]->(p2:Title)
        RETURN p1.name AS T1, p2.name AS T2, r.score AS similarity
        ORDER BY similarity DESCENDING, T1, T2
        LIMIT 100
    """
)

Unnamed: 0,T1,T2,similarity
0,"cleaning, washing, and metal pickling equipment operators and tenders","grinding and polishing workers, hand",1.000000
1,"cleaning, washing, and metal pickling equipment operators and tenders",hoist and winch operators,1.000000
2,"cutting and slicing machine setters, operators, and tenders","derrick operators, oil and gas",1.000000
3,"derrick operators, oil and gas","cutting and slicing machine setters, operators, and tenders",1.000000
4,"first-line supervisors of entertainment and recreation workers, except gambling services",first-line supervisors of personal service workers,1.000000
...,...,...,...
95,"child, family, and school social workers",recycling coordinators,0.964877
96,recycling coordinators,"child, family, and school social workers",0.964877
97,"forestry and conservation science teachers, postsecondary","social work teachers, postsecondary",0.964856
98,butchers and meat cutters,"cutters and trimmers, hand",0.964854


#### **If T1 is similar to T2, then recommended items on T1 is also for T2 (collaborative filtering)**

In [32]:
%%script true

gds.run_cypher(
    """
        MATCH (:Person {name: "Annie"})-[:BUYS]->(p1:Product)
        WITH collect(p1) as products
        MATCH (:Person {name: "Matt"})-[:BUYS]->(p2:Product)
        WHERE not p2 in products
        RETURN p2.name as recommendation
    """
)

Couldn't find program: 'true'


In [33]:
#G.drop()
#gds.graph.drop('TechSkill')

In [34]:
#gds.run_cypher("MATCH (n) DETACH DELETE n")
#gds.graph.drop("full_graph")

In [35]:
gds.close()