In [2]:
import re
import numpy as np
import pandas as pd
import numbers
import pickle
import snap
from os import path
from datetime import datetime

In [20]:
# Directory holding the academia.stackexchange.com data files used by this notebook.
BASE_PATH = "../data/academia.stackexchange.com"
# Input: TSV of posts with their top words (read by load_top_ngram_df below).
POST_TOP_NGRAM_PATH = path.join(BASE_PATH, "Posts-top_words_more.tsv")
# Outputs: TSV edge lists for the user-id/n-gram and post-id/n-gram graphs.
USERID_NGRAM_TSV_PATH_POSTSEPT17 = path.join(BASE_PATH, "Userid_Ngram_Bipartite_Graph_PostSept17.tsv")
POSTID_NGRAM_TSV_PATH_POSTSEPT17 = path.join(BASE_PATH, "Postid_Ngram_Bipartite_Graph_PostSept17.tsv")
# Outputs: pickles of the n-gram id dictionary and of the user-id / post-id node sets.
NGRAMID_DICT_PICKLE_PATH_POSTSEPT17 = path.join(BASE_PATH, "Ngramid_Dict_PostSept17.pickle")
USERID_SET_PICKLE_PATH_POSTSEPT17 = path.join(BASE_PATH, "Userid_setPostSept17.pickle")
POSTID_SET_PICKLE_PATH_POSTSEPT17 = path.join(BASE_PATH, "Postid_setPostSept17.pickle")

### Load top words from posts.

In [4]:
# Number of "TopWord<i>" columns used per post (drives top_word_columns below).
num_top_words = 5

In [5]:
# Column names "TopWord1" .. "TopWord<num_top_words>". range() is used instead of
# xrange() so the cell runs on both Python 2 and Python 3.
top_word_columns = ["TopWord%d" % i for i in range(1, num_top_words + 1)]

In [6]:
# Scratch check: drop the time-of-day part of a timestamp string, parse the date, show its year.
datetime.strptime("2012-02-14T20:23:40.127".split("T")[0], "%Y-%m-%d").year

2012

In [7]:
def get_date(date_str):
    """
    Parse the calendar date out of a "YYYY-MM-DDTHH:MM:SS..." timestamp string.

    Everything from the first "T" onward (the time of day) is ignored, so the
    returned datetime always has a time of 00:00:00.
    """
    day_part, _, _ = date_str.partition("T")
    return datetime.strptime(day_part, "%Y-%m-%d")

In [8]:
def load_top_ngram_df(topwords_path):
    """
    Load the tab-separated top-words file and return a cleaned posts dataframe.

    Args:
        topwords_path: Path to a TSV file with the columns "Id", "OwnerUserId",
            "PostTypeId", "Date" and every column named in top_word_columns.
    Returns:
        Dataframe with one row per post and the columns post_id, user_id, Type,
        date (parsed by get_date), year and the top-word columns. Rows with any
        missing value among the loaded columns, or with a non-positive user or
        post id, are removed.
    """
    # Take the top-word column names from the notebook-level list so that this
    # loader stays in sync with get_top_ngram_set / create_graph_dfs.
    wanted_columns = ["Id", "OwnerUserId"] + list(top_word_columns) + ["PostTypeId", "Date"]
    posts_df = pd.read_csv(topwords_path, sep="\t", usecols=wanted_columns)

    # Clean dataframe.
    posts_df = posts_df.dropna()
    posts_df = posts_df.rename(columns={
            "Id": "post_id", "OwnerUserId": "user_id", "PostTypeId": "Type",
            "Date": "date"})
    posts_df["user_id"] = posts_df["user_id"].astype(np.int64)
    posts_df["post_id"] = posts_df["post_id"].astype(np.int64)

    # Keep only rows with positive ids. The explicit copy makes the result an
    # independent frame, so the column assignments below do not raise pandas'
    # SettingWithCopyWarning.
    has_valid_ids = (posts_df["user_id"] > 0) & (posts_df["post_id"] > 0)
    posts_df = posts_df.loc[has_valid_ids].copy()

    # Convert date from string to datetime.
    posts_df["date"] = posts_df["date"].apply(get_date)

    # Create a year column.
    posts_df["year"] = posts_df["date"].apply(lambda x: x.year)

    return posts_df

In [9]:
def get_top_ngram_set(posts_df):
    """
    Return the set of distinct values found in the top-word columns of posts_df.

    The columns inspected are the ones listed in top_word_columns.
    """
    per_column_values = [posts_df[name].values for name in top_word_columns]
    return set().union(*per_column_values)

In [10]:
# Load and clean the top-word table for all posts.
posts_df = load_top_ngram_df(POST_TOP_NGRAM_PATH)

### Limit posts to only the ones occurring on or after limit_date.
2017-09-02


In [11]:
# Cut-off date: only posts dated on or after this day are kept below.
limit_date = datetime(year=2017, month=9, day=2)

In [13]:
# Keep only the posts dated on or after limit_date.
posts_df = posts_df.loc[posts_df["date"] >= limit_date]

### Construct top_ngram_set.

In [15]:
# Distinct n-grams appearing in the top-word columns of the remaining posts.
top_ngram_set = get_top_ngram_set(posts_df)

In [16]:
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("Number of n-grams: %d" % len(top_ngram_set))

Number of n-grams: 17907


### Construct User-Id to Ngram and Post-Id to Ngram Graphs.

In [17]:
def create_ngram_id_dict(top_ngram_set, init_index):
    """
    Assign consecutive integer ids to n-grams, starting at init_index.

    The n-grams are numbered in the iteration order of top_ngram_set: the first
    one gets init_index, the next one init_index + 1, and so on.
    Returns:
        Dictionary mapping each n-gram to its id.
    """
    return {
        ngram: init_index + offset
        for offset, ngram in enumerate(top_ngram_set)
    }

In [18]:
def create_graph_dfs(top_ngram_set, posts_df, ngram_columns=None):
    """
    Create user-id to n-gram and post-id to n-gram graphs. The graphs are stored as Pandas dataframes.
        Each row of a dataframe contains one edge of the graph. The dataframes can be written out as tsv
        files to be read in as Snap graphs.
    Args:
        top_ngram_set: Collection of n-grams that become the n-gram nodes. N-grams found in a post but
            missing from this collection produce no edge.
        posts_df: Dataframe with "user_id" and "post_id" columns plus the n-gram columns.
        ngram_columns: Names of the columns of posts_df holding each post's n-grams. Defaults to the
            notebook-level top_word_columns list.
    Returns:
        Tuple (userid_ngram_df, postid_ngram_df, ngram_id_dict, user_id_set, post_id_set). ngram_id_dict
        is a dictionary mapping each n-gram to its assigned id value. user_id_set is a set containing the
        user id nodes that are included in the user-id graph. post_id_set is a set containing the post id
        nodes that are included in the post-id graph.
    """
    if ngram_columns is None:
        ngram_columns = top_word_columns
    ngram_columns = list(ngram_columns)

    # Create n-gram dict. N-gram ids start right above the largest user/post id so that the id
    # ranges never overlap. With no posts there is nothing to collide with, so start at 0
    # instead of failing on max() of an empty sequence.
    if len(posts_df) > 0:
        max_post_id = posts_df["post_id"].max()
        max_user_id = posts_df["user_id"].max()
        first_ngram_id = max(max_post_id, max_user_id) + 1
    else:
        first_ngram_id = 0
    ngram_id_dict = create_ngram_id_dict(top_ngram_set, first_ngram_id)

    # Create the lists storing the edges in the graphs.
    user_id_nodes = []
    post_id_nodes = []
    ngram_id_nodes = []
    user_ids = posts_df["user_id"].tolist()
    post_ids = posts_df["post_id"].tolist()
    ngram_rows = posts_df[ngram_columns].values.tolist()
    for user_id, post_id, row_ngrams in zip(user_ids, post_ids, ngram_rows):
        if user_id < 0 or post_id < 0:
            continue
        # Look up each of the post's n-grams directly instead of scanning the whole vocabulary.
        # The set collapses an n-gram repeated across columns into a single edge.
        row_ngram_ids = set()
        for ngram in row_ngrams:
            ngram_id = ngram_id_dict.get(ngram)
            if ngram_id is not None:
                row_ngram_ids.add(ngram_id)
        # Emit a post's edges in ascending n-gram id order so the output is deterministic.
        for ngram_id in sorted(row_ngram_ids):
            ngram_id_nodes.append(ngram_id)
            user_id_nodes.append(user_id)
            post_id_nodes.append(post_id)

    userid_ngram_df = pd.DataFrame({"user_id": user_id_nodes, "ngram_id": ngram_id_nodes})
    postid_ngram_df = pd.DataFrame({"post_id": post_id_nodes, "ngram_id": ngram_id_nodes})
    return userid_ngram_df, postid_ngram_df, ngram_id_dict, set(user_id_nodes), set(post_id_nodes)

In [None]:
# Build the user-id/n-gram and post-id/n-gram edge dataframes, together with the
# n-gram id dictionary and the sets of user and post ids that received edges.
(userid_ngram_df,
 postid_ngram_df,
 ngramid_dict,
 userid_set,
 postid_set) = create_graph_dfs(top_ngram_set, posts_df)

In [None]:
# Save each edge dataframe as a headerless, index-free TSV edge list that can be
# read as a Snap graph.
for edge_df, tsv_path in [(userid_ngram_df, USERID_NGRAM_TSV_PATH_POSTSEPT17),
                          (postid_ngram_df, POSTID_NGRAM_TSV_PATH_POSTSEPT17)]:
    edge_df.to_csv(tsv_path, sep="\t", header=False, index=False)

In [None]:
# Pickle to store ngramid_dict, userid_set, postid_set.
# Each file is opened in a with-block so it is closed even if pickle.dump raises.
with open(NGRAMID_DICT_PICKLE_PATH_POSTSEPT17, "wb") as pickle_out:
    pickle.dump(ngramid_dict, pickle_out)

with open(USERID_SET_PICKLE_PATH_POSTSEPT17, "wb") as pickle_out:
    pickle.dump(userid_set, pickle_out)

with open(POSTID_SET_PICKLE_PATH_POSTSEPT17, "wb") as pickle_out:
    pickle.dump(postid_set, pickle_out)