### Examples in python ---> Node2Vec 

Use node2vec algorithm to generate low dimensional representation of users to discover interesting user groups / clusters (e.g. popular professionals, satisfied students etc.) using only the available network structure.

In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from collections import Counter

In [2]:
files = os.listdir("../social")
files

['groups.csv',
 'professionals.csv',
 'tag_questions.csv',
 'emails.csv',
 '.DS_Store',
 'answers.csv',
 'group_memberships.csv',
 'tag_users.csv',
 'matches.csv',
 'answer_scores.csv',
 'tags.csv',
 'comments.csv',
 'questions.csv',
 'school_memberships.csv',
 'question_scores.csv',
 'students.csv']

In [7]:
data_parts = {}
for file_name in files:
    file_id = file_name.split(".")[0]
    data_parts[file_id] = pd.read_csv("data/social/" + file_name)
    print(file_id)
    print(data_parts[file_id].shape)

groups
(49, 2)
professionals
(28152, 5)
tag_questions
(76553, 2)
emails
(1850101, 4)

(0, 1)
answers
(51123, 5)
group_memberships
(1038, 2)
tag_users
(136663, 2)
matches
(4316275, 2)
answer_scores
(51138, 2)
tags
(16269, 2)
comments
(14966, 5)
questions
(23931, 5)
school_memberships
(5638, 2)
question_scores
(23928, 2)
students
(30971, 3)


#### Build knowledge graph

In [8]:
def add_nodes(G, df, col, type_name):
    """Add entities to G from the 'col' column of the 'df' DataFrame. The new nodes are annotated with 'type_name' label."""
    nodes = list(df[~df[col].isnull()][col].unique())
    G.add_nodes_from([(n,dict(type=type_name)) for n in nodes])
    print("Nodes (%s,%s) were added" % (col, type_name))
    
def add_links(G, df, col1, col2, type_name):
    """Add links to G from the 'df' DataFrame. The new edges are annotated with 'type_name' label."""
    df_tmp = df[(~df[col1].isnull()) & (~df[col2].isnull())]
    links = list(zip(df_tmp[col1],df_tmp[col2]))
    G.add_edges_from([(src, trg, dict(type=type_name)) for src, trg in links])
    print("Edges (%s->%s,%s) were added" % (col1, col2, type_name))

In [9]:
G = nx.DiGraph()

#### Nodes
The vertices of the knowledge graph consists of the following entities:

* answers
* questions
* comments
* students
* professionals
* industries
* schools
* tags
* user groups
* group types

In [11]:
add_nodes(G, data_parts["answers"], "answers_id", "answer")
add_nodes(G, data_parts["comments"], "comments_id", "comment")
add_nodes(G, data_parts["groups"], "groups_id", "group")
add_nodes(G, data_parts["groups"], "groups_group_type", "group_type")
add_nodes(G, data_parts["professionals"], "professionals_id", "professional")
add_nodes(G, data_parts["professionals"], "professionals_industry", "industry")
add_nodes(G, data_parts["questions"], "questions_id", "question")
add_nodes(G, data_parts["school_memberships"], "school_memberships_school_id", "school")
add_nodes(G, data_parts["students"], "students_id", "student")
add_nodes(G, data_parts["tags"], "tags_tag_id", "tag")


Nodes (answers_id,answer) were added
Nodes (comments_id,comment) were added
Nodes (groups_id,group) were added
Nodes (groups_group_type,group_type) were added
Nodes (professionals_id,professional) were added
Nodes (professionals_industry,industry) were added
Nodes (questions_id,question) were added
Nodes (school_memberships_school_id,school) were added
Nodes (students_id,student) were added
Nodes (tags_tag_id,tag) were added


In [13]:
add_links(G, data_parts["answers"], "answers_id", "answers_question_id", "question")
add_links(G, data_parts["answers"], "answers_id", "answers_author_id", "author")
add_links(G, data_parts["comments"], "comments_id", "comments_parent_content_id", "parent_content")
add_links(G, data_parts["comments"], "comments_id", "comments_author_id", "author")
add_links(G, data_parts["group_memberships"], "group_memberships_user_id", "group_memberships_group_id", "member")
add_links(G, data_parts["groups"], "groups_id", "groups_group_type", "type")
add_links(G, data_parts["professionals"], "professionals_id", "professionals_industry", "type")
add_links(G, data_parts["questions"], "questions_id", "questions_author_id", "author")
add_links(G, data_parts["school_memberships"], "school_memberships_user_id", "school_memberships_school_id", "member")
add_links(G, data_parts["tag_questions"], "tag_questions_question_id", "tag_questions_tag_id", "tag")
add_links(G, data_parts["tag_users"], "tag_users_user_id", "tag_users_tag_id", "follow")

Edges (answers_id->answers_question_id,question) were added
Edges (answers_id->answers_author_id,author) were added
Edges (comments_id->comments_parent_content_id,parent_content) were added
Edges (comments_id->comments_author_id,author) were added
Edges (group_memberships_user_id->group_memberships_group_id,member) were added
Edges (groups_id->groups_group_type,type) were added
Edges (professionals_id->professionals_industry,type) were added
Edges (questions_id->questions_author_id,author) were added
Edges (school_memberships_user_id->school_memberships_school_id,member) were added
Edges (tag_questions_question_id->tag_questions_tag_id,tag) were added
Edges (tag_users_user_id->tag_users_tag_id,follow) were added


### Location information
Location information of users and professionals are preprocessed before I add it to the knowledge graph. I tried to extract city / state / country hierarchy from locations were it was provided. In this case I created different levels for locations: cities, states/regions and countries.

In [14]:
students = data_parts["students"]
profs = data_parts["professionals"]
students = students[~students["students_location"].isnull()]
profs = profs[~profs["professionals_location"].isnull()]

In [16]:
locs1 = list(students["students_location"])
locs2 = list(profs["professionals_location"])
locs = [loc.lower() for loc in locs1+locs2]
locs_unique = list(set(locs))

#### Most common locations:

In [17]:
cnt = Counter(locs)
cnt.most_common()[:10]

[('new york, new york', 2650),
 ('bengaluru, karnataka, india', 1284),
 ('los angeles, california', 1280),
 ('boston, massachusetts', 1271),
 ('houston, texas', 1032),
 ('san francisco, california', 975),
 ('chicago, illinois', 920),
 ('california, california', 894),
 ('greater new york city area', 745),
 ('atlanta, georgia', 738)]

In [18]:
new_edges = []
new_nodes = []
for loc in locs_unique:
    loc_hierarchy = loc.split(", ")
    loc_nodes = [] # due to city name duplicates in the world
    k = len(loc_hierarchy)
    for i in range(k):
        loc_nodes.append('_'.join(loc_hierarchy[i:]))
    new_nodes += loc_nodes
    loc_links = [(loc_nodes[i],loc_nodes[i+1], dict(type="location"))  for i in range(k-1)]
    new_edges += loc_links
new_nodes = list(set(new_nodes))
new_nodes = [(n, dict(type="location")) for n in new_nodes]

Add location nodes to the graph
* the 3 level of nodes are added
* connections between cities -> regions, regions -> contires are added

In [20]:
G.add_nodes_from(new_nodes)
G.add_edges_from(new_edges)
print(len(new_edges), len(new_nodes))

7253 7160


### Examples:
Locations that are immediate in-neighbors of entity United Kingdom (e.g.: England, Scotland etc.).

In [22]:
list(G.in_edges("united kingdom"))[:5]

[('england_united kingdom', 'united kingdom'),
 ('leeds_united kingdom', 'united kingdom'),
 ('harrow_united kingdom', 'united kingdom'),
 ('edinburgh_united kingdom', 'united kingdom'),
 ('scotland_united kingdom', 'united kingdom')]

* Locations that are in-neighbors of entity England

In [24]:
list(G.in_edges("england_united kingdom"))[:5]

[('windsor_england_united kingdom', 'england_united kingdom'),
 ('swindon_england_united kingdom', 'england_united kingdom'),
 ('kingston upon thames_england_united kingdom', 'england_united kingdom'),
 ('oxford_england_united kingdom', 'england_united kingdom'),
 ('luton_england_united kingdom', 'england_united kingdom')]

* Link users to the first level of locations:

In [26]:
students["students_location"] = students["students_location"].apply(lambda x: "_".join(x.lower().split(", ")))
profs["professionals_location"] = profs["professionals_location"].apply(lambda x: "_".join(x.lower().split(", ")))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
add_links(G, students, "students_id", "students_location", "location")
add_links(G, profs, "professionals_id", "professionals_location", "location")

Edges (students_id->students_location,location) were added
Edges (professionals_id->professionals_location,location) were added


### Clean and encode knowledge graph

In [28]:
def encode_graph(G):
    """Encode the nodes of the network into integers"""
    nodes = [(n,d.get("type",None)) for n, d in G.nodes(data=True)]
    nodes_df = pd.DataFrame(nodes, columns=["id","type"]).reset_index()
    node2idx = dict(zip(nodes_df["id"],nodes_df["index"]))
    edges = [(node2idx[src], node2idx[trg], d.get("type",None)) for src, trg, d in G.edges(data=True)]
    edges_df = pd.DataFrame(edges, columns=["src","trg","type"])
    return nodes_df, edges_df

#### Remove isolated nodes

In [29]:
print(G.number_of_nodes(), G.number_of_edges())
G.remove_nodes_from(list(nx.isolates(G)))
print(G.number_of_nodes(), G.number_of_edges())

177890 461369
174181 461369


### Encode the nodes to have integer identifiers

In [30]:
nodes_df, edges_df = encode_graph(G)
len(nodes_df), len(edges_df)

(174181, 461369)

#### Node information summary

In [31]:
print(nodes_df.head())
print(nodes_df["type"].value_counts())
nodes_df.to_csv("knowledge_graph_nodes.csv", index=False)

   index                                id    type
0      0  4e5f01128cae4f6d8fd697cec5dca60c  answer
1      1  ada720538c014e9b8a6dceed09385ee3  answer
2      2  eaa66ef919bc408ab5296237440e323f  answer
3      3  1a6b3749d391486c9e371fbd1e605014  answer
4      4  5229c514000446d582050f89ebd4e184  answer
answer          51123
student         29460
professional    27819
question        23931
comment         14966
tag             14404
location         7160
school           2706
industry         2470
group              49
group_type          7
Name: type, dtype: int64


#### Edge information summary

In [32]:
print(edges_df.head())
print(edges_df["type"].value_counts())
edges_df[["src","trg"]].to_csv("knowledge_graph_edges.csv", index=False, header=False, sep=" ")

   src    trg      type
0    0  96434  question
1    0  68352    author
2    1  96435  question
3    1  83417    author
4    2  96435  question
follow            135907
author             90020
tag                76553
location           60724
question           51123
type               25625
parent_content     14966
member              6451
Name: type, dtype: int64


#### Node2Vec

In [33]:
edge_list = list(zip(edges_df["src"],edges_df["trg"]))
edge_list[:5]

[(0, 96434), (0, 68352), (1, 96435), (1, 83417), (2, 96435)]

In [34]:
KG = nx.Graph(edge_list)
KG.number_of_nodes(), KG.number_of_edges()

(174181, 461369)

**NOTE:** We will analyse only the greatest (weakly) connected component of our knowledge graph

In [36]:
largest_cc = max(nx.connected_components(KG), key=len)
KG = nx.subgraph(KG, largest_cc)
KG.number_of_nodes(), KG.number_of_edges()

(173933, 461225)