In [1]:
import re
import numpy as np
import pandas as pd
import numbers
import pickle
import snap
from os import path

In [53]:
# Directory that holds every file this notebook reads or writes.
BASE_PATH = "../data/academia.stackexchange.com"


def _data_file(file_name):
    """Return the location of file_name inside BASE_PATH."""
    return path.join(BASE_PATH, file_name)


# Inputs: n-gram scores and the posts table.
NGRAM_PATH = _data_file("Posts.most_important.txt")
POST_PATH = _data_file("Posts.csv")

# Bipartite edge lists (one edge per line, tab separated).
USERID_NGRAM_TSV_PATH = _data_file("Userid_Ngram_Bipartite_Graph.tsv")
POSTID_NGRAM_TSV_PATH = _data_file("Postid_Ngram_Bipartite_Graph.tsv")

# Pickled lookup structures produced while building the edge lists.
NGRAMID_DICT_PICKLE_PATH = _data_file("Ngramid_Dict.pickle")
USERID_SET_PICKLE_PATH = _data_file("Userid_set.pickle")
POSTID_SET_PICKLE_PATH = _data_file("Postid_set.pickle")

# Folded graphs saved through SNAP.
USERID_FOLDED_GRAPH_PATH = _data_file("Userid_Folded_Graph.graph")
POSTID_FOLDED_GRAPH_PATH = _data_file("Postid_Folded_Graph.graph")
USERID_NGRAM_FOLDED_GRAPH_PATH = _data_file("Userid_Ngram_Folded_Graph.graph")
POSTID_NGRAM_FOLDED_GRAPH_PATH = _data_file("Postid_Ngram_Folded_Graph.graph")

### Load N-grams and "TF-IDF" Scores

In [7]:
def clean_ngram_entry(entry):
    """
    Parse one line of the n-gram score file.

    Args:
        entry: A line of the form "('ngram', score)" followed by a newline. Both str and
            (on Python 3) bytes are accepted; bytes are decoded as UTF-8.

    Returns:
        A tuple (ngram, score): ngram is the text with its surrounding quotes removed and
        score is converted to an integer.

    Raises:
        IndexError: If the entry contains no comma separating the n-gram from the score.
        ValueError: If the score part is not an integer literal.
    """
    # Lines read from a file opened in binary mode are bytes on Python 3, which the str
    # regex below would reject. On Python 2 bytes is str, so nothing is decoded there.
    if isinstance(entry, bytes) and not isinstance(entry, str):
        entry = entry.decode("utf-8")

    # Drop the tuple parentheses and the trailing newline.
    entry = re.sub('[\n()]', '', entry)

    # Split on the LAST comma only: the score never contains a comma, but the n-gram may.
    split_entry = entry.rsplit(",", 1)
    raw_ngram = split_entry[0].strip()
    score = int(split_entry[1].strip())

    # Remove exactly one pair of matching quotes. The n-gram is wrapped in double quotes
    # when it contains an apostrophe, so both quote characters must be recognised.
    is_quoted = (
        len(raw_ngram) >= 2
        and raw_ngram[0] == raw_ngram[-1]
        and raw_ngram[0] in "'\""
    )
    if is_quoted:
        ngram = raw_ngram[1:-1]
    else:
        ngram = raw_ngram.strip('\'')

    return ngram, score

In [8]:
def import_ngrams_scores(file_path):
    """
    Load the n-gram score file into a dataframe.

    Args:
        file_path: Path to a text file with one entry of the form "('ngram', score)" per line.
            Blank lines are ignored.

    Returns:
        A Pandas dataframe with an "ngram" column and a "score" column. Rows are sorted by
        descending score (ties keep their order from the file) and the index is renumbered
        from 0.
    """
    # Extract entries from text file.
    all_ngrams = []
    all_scores = []
    with open(file_path, "rb") as file:
        for entry in file:
            # The file is read in binary mode; decode explicitly so that parsing works on
            # Python 3 (on Python 2 bytes is str and the line is left untouched).
            if isinstance(entry, bytes) and not isinstance(entry, str):
                entry = entry.decode("utf-8")
            if not entry.strip():
                continue  # Skip blank lines (e.g. a trailing newline at end of file).
            ngram, score = clean_ngram_entry(entry)
            all_ngrams.append(ngram)
            all_scores.append(score)

    # Create dataframe containing entries.
    ngram_df = pd.DataFrame({"ngram": all_ngrams, "score": all_scores})

    # sort_values returns a new frame, so the result has to be kept; a stable sort makes
    # the order of equal scores deterministic.
    ngram_df = ngram_df.sort_values(by=["score"], ascending=False, kind="mergesort")

    return ngram_df.reset_index(drop=True)

In [9]:
# Parse the n-gram score file at NGRAM_PATH into a dataframe with "ngram" and "score" columns.
ngram_df = import_ngrams_scores(NGRAM_PATH)

### Load Post content into dataframe.
Post id, body, user id.

In [10]:
def clean_post_body(body):
    """
    Strip HTML tags from a post body and lower-case the remaining text.

    Returns None for anything that is not exactly a str (e.g. None or a float); otherwise
    returns the body with every "<...>" tag removed, surrounding whitespace trimmed and all
    characters lower-cased.
    """
    if type(body) is str:
        trimmed = body.strip()
        without_tags = re.sub(r'<[^>]+>', '', trimmed)
        return without_tags.strip().lower()
    return None

In [11]:
def load_posts_df(csv_file_path):
    """
    Given path to CSV file containing posts information, loads it into a Pandas dataframe.

    Only the "Id", "Body" and "OwnerUserId" columns are read; rows with a missing value in
    any of them are dropped. The columns are renamed to "post_id", "body" and "user_id",
    the body is passed through clean_post_body, and both id columns are cast to int64.
    """
    renamed_columns = {"Id": "post_id", "Body": "body", "OwnerUserId": "user_id"}
    posts_df = (
        pd.read_csv(csv_file_path, usecols=["Id", "Body", "OwnerUserId"])
        .dropna()
        .rename(columns=renamed_columns)
    )

    posts_df["body"] = posts_df["body"].apply(clean_post_body)
    for id_column in ("user_id", "post_id"):
        posts_df[id_column] = posts_df[id_column].astype(np.int64)

    return posts_df

In [12]:
# Read the posts CSV at POST_PATH into a dataframe with post_id, body and user_id columns.
posts_df = load_posts_df(POST_PATH)

### Construct User-Id to Ngram and Post-Id to Ngram Graphs.

In [13]:
def create_ngram_id_dict(ngram_df, min_score, init_index):
    """
    Map each sufficiently high-scoring n-gram to a consecutive integer id.

    N-grams whose score is at least min_score are numbered in dataframe order, starting at
    init_index and increasing by one per row. Returns the resulting {ngram: id} dictionary.
    """
    # Keep only the n-grams that reach the score threshold.
    passes_threshold = ngram_df["score"] >= min_score
    kept_ngrams = ngram_df.loc[passes_threshold, "ngram"].values

    # Number the surviving n-grams consecutively from init_index.
    return {ngram: init_index + offset for offset, ngram in enumerate(kept_ngrams)}

In [14]:
def create_graph_dfs(ngram_df, posts_df, min_score):
    """
    Create user-id to n-gram and post-id to n-gram graphs. The graphs are stored as Pandas dataframes.
        Each row of a dataframe contains one edge of the graph. The dataframes can be written out as tsv
        files to be read in as Snap graphs.

    Args:
        ngram_df: Dataframe with "ngram" and "score" columns.
        posts_df: Dataframe with "post_id", "user_id" and "body" columns.
        min_score: Only n-grams with a score of at least min_score become graph nodes.

    Returns:
        Tuple (userid_ngram_df, postid_ngram_df, ngram_id_dict, user_id_set, post_id_set). ngram_id_dict is
        a dictionary mapping n-gram to its assigned id value. user_id_set is a set containing the user id
        nodes that are included in the user-id graph. post_id_set is a set containing the post id nodes
        that are included in the post-id graph.
    """
    # N-gram ids start just past the largest post/user id, so an n-gram id never equals a
    # post id or a user id. The value is cast to a plain int so that numpy integer types do
    # not leak into the dictionary (and later into pickles and SNAP calls).
    if len(posts_df) > 0:
        largest_id = max(posts_df["post_id"].max(), posts_df["user_id"].max())
        first_ngram_id = int(largest_id) + 1
    else:
        first_ngram_id = 0  # No posts: nothing to collide with.
    ngram_id_dict = create_ngram_id_dict(ngram_df, min_score, first_ngram_id)

    # Create dataframes storing the edges in the graphs.
    user_id_nodes = []
    post_id_nodes = []
    ngram_id_nodes = []
    rows = zip(
        posts_df["user_id"].tolist(),
        posts_df["post_id"].tolist(),
        posts_df["body"].tolist(),
    )
    for user_id, post_id, body in rows:
        if not isinstance(body, str):
            continue
        if user_id < 0 or post_id < 0:
            continue

        # A set gives constant-time membership tests; the matches are the same as with the
        # token list.
        # NOTE(review): tokens come from a whitespace split, so an n-gram that itself
        # contains whitespace can never match here -- confirm that only single-word
        # n-grams are expected.
        tokens = set(body.split())
        for ngram, ngram_id in ngram_id_dict.items():
            if ngram in tokens:
                ngram_id_nodes.append(ngram_id)
                user_id_nodes.append(user_id)
                post_id_nodes.append(post_id)

    userid_ngram_df = pd.DataFrame({"user_id": user_id_nodes, "ngram_id": ngram_id_nodes})
    postid_ngram_df = pd.DataFrame({"post_id": post_id_nodes, "ngram_id": ngram_id_nodes})
    return userid_ngram_df, postid_ngram_df, ngram_id_dict, set(user_id_nodes), set(post_id_nodes)

In [15]:
# Construct graphs stored in dataframes.
# The last argument is min_score: only n-grams whose score is at least 5 become graph nodes.
userid_ngram_df, postid_ngram_df, ngramid_dict, userid_set, postid_set = create_graph_dfs(ngram_df, posts_df, 5)

In [17]:
# Dump both edge lists as headerless, index-free, tab-separated files so that they can be
# loaded as Snap graphs.
for edge_df, tsv_path in (
    (userid_ngram_df, USERID_NGRAM_TSV_PATH),
    (postid_ngram_df, POSTID_NGRAM_TSV_PATH),
):
    edge_df.to_csv(tsv_path, sep="\t", header=False, index=False)

In [22]:
# Pickle ngramid_dict, userid_set and postid_set so they can be reloaded later.
# Each file is opened in a `with` block so the handle is closed even if pickle.dump raises.
with open(NGRAMID_DICT_PICKLE_PATH, "wb") as pickle_out:
    pickle.dump(ngramid_dict, pickle_out)

with open(USERID_SET_PICKLE_PATH, "wb") as pickle_out:
    pickle.dump(userid_set, pickle_out)

with open(POSTID_SET_PICKLE_PATH, "wb") as pickle_out:
    pickle.dump(postid_set, pickle_out)

### Fold Bipartite Graphs to create User-id and Post-id Graphs

In [25]:
# Load the bipartite edge lists from their TSV files as SNAP graphs of type snap.PUNGraph.
# The trailing 0 and 1 are presumably the source and destination column indices of the
# TSV files -- confirm against the snap.LoadEdgeList documentation.
userid_ngram_bipartite_graph = snap.LoadEdgeList(snap.PUNGraph, USERID_NGRAM_TSV_PATH, 0, 1)
postid_ngram_bipartite_graph = snap.LoadEdgeList(snap.PUNGraph, POSTID_NGRAM_TSV_PATH, 0, 1)

In [58]:
def U_fold_graph(G, U_set):
    """
    Fold a bipartite graph onto one of its partitions.

    G: Bipartite graph that we want to fold.
    U_set: Collection containing all node ids in the left partition of G. These will be the
        nodes that are kept in the fold. Any iterable of integer ids is accepted (a set, a
        list, dict values, ...).

    Returns:
        A new snap.TUNGraph whose nodes are the ids in U_set and in which two distinct nodes
        are connected when they have at least one common neighbor in G.
    """
    # Normalise to a set of plain ints: this gives constant-time membership tests, removes
    # duplicate ids (so each node is added once) and avoids passing numpy integer types
    # to SNAP.
    kept_ids = set(int(nid) for nid in U_set)

    folded_G = snap.TUNGraph.New()

    # Add all nodes in U_set to the folded graph.
    for nid in kept_ids:
        folded_G.AddNode(nid)

    # Two kept nodes share a neighbor exactly when some node of G is adjacent to both of
    # them. So instead of testing every pair of nodes for common neighbors, visit each
    # node of G once and connect all kept nodes found among its neighbors.
    # Because the graph is bipartite, such a shared neighbor must be in V.
    for shared in G.Nodes():
        kept_nbrs = []
        for nbr_index in range(shared.GetDeg()):
            nbr_id = shared.GetNbrNId(nbr_index)
            if nbr_id in kept_ids:
                kept_nbrs.append(nbr_id)

        for first in range(len(kept_nbrs)):
            for second in range(first + 1, len(kept_nbrs)):
                if kept_nbrs[first] == kept_nbrs[second]:
                    continue  # No self-loops.
                # Adding an edge that already exists leaves the undirected graph unchanged.
                folded_G.AddEdge(kept_nbrs[first], kept_nbrs[second])
    return folded_G

Graph containing user nodes.

In [29]:
# Fold to create user graph: the nodes are the ids in userid_set, and two users are linked
# when they share at least one neighbor in the user/n-gram bipartite graph.
userid_graph = U_fold_graph(userid_ngram_bipartite_graph, userid_set)

In [46]:
# Save created user graph to USERID_FOLDED_GRAPH_PATH.
FOut = snap.TFOut(USERID_FOLDED_GRAPH_PATH)  # SNAP output stream for the target file.
userid_graph.Save(FOut)
FOut.Flush()  # Presumably pushes any buffered data out to the file -- confirm in SNAP docs.

Graph containing post nodes.

In [47]:
# Fold to create post graph: the nodes are the ids in postid_set, and two posts are linked
# when they share at least one neighbor in the post/n-gram bipartite graph.
postid_graph = U_fold_graph(postid_ngram_bipartite_graph, postid_set)

In [49]:
# Save created post graph to POSTID_FOLDED_GRAPH_PATH.
FOut = snap.TFOut(POSTID_FOLDED_GRAPH_PATH)  # SNAP output stream for the target file.
postid_graph.Save(FOut)
FOut.Flush()  # Presumably pushes any buffered data out to the file -- confirm in SNAP docs.

Graph containing n-gram nodes folded from user graph.

In [None]:
# Fold to n-gram graph from the user graph: the kept nodes are the n-gram ids, linked when
# they share at least one neighbor in the user/n-gram bipartite graph.
# The ids are passed as a set, which is what U_fold_graph documents for U_set and what
# keeps its membership tests fast.
userid_ngram_graph = U_fold_graph(userid_ngram_bipartite_graph, set(ngramid_dict.values()))

In [None]:
# Save the n-gram graph folded from the user graph. The graph written here must be
# userid_ngram_graph, not userid_graph, since the target is USERID_NGRAM_FOLDED_GRAPH_PATH.
FOut = snap.TFOut(USERID_NGRAM_FOLDED_GRAPH_PATH)
userid_ngram_graph.Save(FOut)
FOut.Flush()

Graph containing n-gram nodes folded from the post graph.


In [None]:
# Fold to n-gram graph from the post graph: the kept nodes are the n-gram ids, linked when
# they share at least one neighbor in the post/n-gram bipartite graph.
# The ids are passed as a set, which is what U_fold_graph documents for U_set and what
# keeps its membership tests fast.
# NOTE(review): the result is stored in userid_ngram_graph even though it is built from the
# post graph, replacing the user-based n-gram graph held in that variable. Consider renaming
# it (together with the cell that saves it) to something like postid_ngram_graph.
userid_ngram_graph = U_fold_graph(postid_ngram_bipartite_graph, set(ngramid_dict.values()))

In [None]:
# Save the n-gram graph folded from the post graph. The preceding cell stores that fold in
# userid_ngram_graph (despite the name), so that is the graph to write here -- not
# userid_graph, which is the folded user graph.
FOut = snap.TFOut(POSTID_NGRAM_FOLDED_GRAPH_PATH)
userid_ngram_graph.Save(FOut)
FOut.Flush()

### Page Rank

In [52]:
# Run PageRank on the folded user graph. userid_pagerank_H is passed in as an argument and
# the call's return value is not used, so the scores are presumably written into that
# table -- confirm against the snap.GetPageRank documentation.
# NOTE(review): the recorded run of this cell ended with a KeyboardInterrupt, so no result
# from this step is shown in the notebook.
userid_pagerank_H = snap.TIntFltH()
snap.GetPageRank(userid_graph, userid_pagerank_H)

KeyboardInterrupt: 