In [6]:
import snap
from os import path
import pickle
import collections
import numpy as np
from tqdm import tqdm
import pandas as pd

In [103]:
BASE_PATH = "../data/stats.stackexchange.com/Mixed"
FOLDED_NGRAM_GRAPH_PATH = path.join(BASE_PATH, "Userid_Ngram_Folded_Graph.graph")
FOLDED_POSTID_GRAPH_PATH = path.join(BASE_PATH, "Postid_Folded_Graph.graph")
NGRAM_DICT_PICKLE = path.join(BASE_PATH, "Bigramid_Dict")
POSTID_PICKLE = path.join(BASE_PATH, "STATS_20k-Posts_11-top_uni&bigrams_nostem.pickle")
POST_TOP_NGRAM_PATH = path.join(BASE_PATH, "STATS_20k-Posts_11-top_uni&bigrams_nostem.tsv")
COMMUNITIES_PATH = path.join(BASE_PATH, 'postid-communities-with-postbodies.txt')
COMMUNITIES_VEC_PATH = path.join(BASE_PATH, 'postid-communities.vector')
EDGELIST_PATH = path.join(BASE_PATH, 'postid_edges.txt')
COMMUNITIES_VIZ_PATH = path.join(BASE_PATH, 'postid-communities-viz.csv')
COMMUNITIES_VIZ_PATH2 = path.join(BASE_PATH, 'postid-spectral-communities-viz.csv')
SPEC_COMMUNITIES_PICKLE = path.join(BASE_PATH, 'Spectral_Node_to_Community_dict.pickle')

In [14]:
def get_modularity(G, community_dict):
    two_M = G.GetEdges() * 2.0
    mod_sum = 0.0
    for NI in G.Nodes():
        NI_id = NI.GetId()
        for NJ in G.Nodes():
            NJ_id = NJ.GetId()
            if (community_dict[NI_id] == community_dict[NJ_id]):
                mod_sum += G.IsEdge(NI_id, NJ_id) - ((NI.GetDeg() * NJ.GetDeg()) / two_M)
    modularity = mod_sum / two_M
    return modularity

def load_top_ngram_df(topwords_path):
    # Load csv containing the top words.
    posts_df = pd.read_csv(topwords_path, sep = "\t", usecols =
                           ["Id", "OwnerUserId", "TopWord1", "TopWord2", "TopWord3", "TopWord4", "TopWord5"])

    # Clean dataframe.
    posts_df = posts_df.dropna()
    posts_df = posts_df.rename(columns={
            "Id": "post_id", "OwnerUserId": "user_id"})
    posts_df["user_id"] = posts_df["user_id"].astype(np.int64)
    posts_df["post_id"] = posts_df["post_id"].astype(np.int64)
    posts_df = posts_df[posts_df["user_id"] > 0]
    posts_df = posts_df[posts_df["post_id"] > 0]

    return posts_df

def add_communities_post_df(post_df, best_comm_map, postid_dict):
    # Iterate over rows and add each community to each row.
    community_array = []
    for index, row in post_df.iterrows():
        user_id = row["post_id"]
        A_id = postid_dict.get(user_id, -1)
        if (A_id < 0):
            community_array.append(-1)
        else:
            community_array.append(best_comm_map[A_id])
    post_df.loc[:,'Community'] = community_array

    return post_df

In [15]:
f_in = snap.TFIn(FOLDED_POSTID_GRAPH_PATH)
post_graph = snap.TUNGraph.Load(f_in)
print "original nodes", post_graph.GetNodes()
print "edges", post_graph.GetEdges()
assert snap.CntSelfEdges(post_graph) == 0
snap.DelZeroDegNodes(post_graph)
print "new nodes (no degree-0)", post_graph.GetNodes()

original nodes 19725
edges 567729
new nodes (no degree-0) 17139


In [16]:
comm_vec = snap.TCnComV()
modularity = snap.CommunityCNM(post_graph, comm_vec)

f_out = snap.TFOut(COMMUNITIES_VEC_PATH)
comm_vec.Save(f_out)
f_out.Flush()


1335it [02:54, 27.28it/s][A

In [17]:
f_in = snap.TFIn(COMMUNITIES_VEC_PATH)
comm_vec = snap.TCnComV()
comm_vec.Load(f_in)

print "communities", len(comm_vec)

communities 66


In [18]:
pickle_file = open(POSTID_PICKLE, 'rb')
postid_dict = pickle.load(pickle_file)

community_dict = collections.defaultdict(int)

with open(COMMUNITIES_PATH, 'w') as f:
    for i, comm in enumerate(comm_vec):
        f.write("#####Community {}#####\n".format(i))
        community = snap.TIntV()
        for node in comm:
            community.Add(node)
#             f.write("Node {}: {}\n".format(node, postid_dict[node]))
            community_dict[node] = i
        f.write('Community {}, nodes: {} modularity: {}\n'.format(i, len(comm), snap.GetModularity(post_graph, community, post_graph.GetEdges())))
    f.write("The modularity of the network is {}\n".format(modularity))
    alt_modularity = get_modularity(post_graph, community_dict)
    f.write("Alternate modularity of the network (sanity check) is {}".format(alt_modularity))

In [94]:
postid_dict2 = collections.defaultdict(int)
for node in community_dict:
    postid_dict2[node] = node

post_df = load_top_ngram_df(POST_TOP_NGRAM_PATH)
post_df_w_comm = add_communities_post_df(post_df, community_dict, postid_dict2)

In [95]:
communities = set(post_df_w_comm["Community"])
for comm in communities:
    print "Community:", comm, "Size:", len(post_df[post_df_w_comm["Community"] == comm])

Community: 0 Size: 4030
Community: 1 Size: 2303
Community: 2 Size: 24
Community: 3 Size: 4711
Community: 4 Size: 5343
Community: 5 Size: 97
Community: 6 Size: 4
Community: 7 Size: 5
Community: 8 Size: 159
Community: 9 Size: 127
Community: 10 Size: 2
Community: 11 Size: 4
Community: 12 Size: 9
Community: 13 Size: 36
Community: 14 Size: 4
Community: 15 Size: 9
Community: 16 Size: 2
Community: 17 Size: 46
Community: 18 Size: 3
Community: 19 Size: 8
Community: 20 Size: 2
Community: 21 Size: 7
Community: 22 Size: 55
Community: 23 Size: 4
Community: 24 Size: 4
Community: 25 Size: 3
Community: 26 Size: 4
Community: 27 Size: 5
Community: 28 Size: 10
Community: 29 Size: 17
Community: 30 Size: 4
Community: 31 Size: 8
Community: 32 Size: 12
Community: 33 Size: 2
Community: 34 Size: 2
Community: 35 Size: 2
Community: 36 Size: 4
Community: 37 Size: 4
Community: 38 Size: 2
Community: 39 Size: 5
Community: 40 Size: 2
Community: 41 Size: 2
Community: 42 Size: 3
Community: 43 Size: 2
Community: 44 Size

In [24]:
post_df[post_df_w_comm["Community"] == 13]

Unnamed: 0,post_id,user_id,TopWord1,TopWord2,TopWord3,TopWord4,TopWord5,Community
363,318158,175125,risky,labelled,deemed risky,risky risky,labelled data,13
528,318379,164061,dice,roll,limit summation,rolls dice,roll dice,13
1050,319047,8013,cut,sensitivity,harms,specificity,sensitivity specificity,13
1577,319730,121522,existing model,existing,risky,deems risky,deems,13
2546,320994,142829,wire,probability wire,wire cut,wire length,cut,13
2696,321181,190212,existing cohort,cohort,power study,prospective,existing,13
4142,323039,23732,vis,second order,second,better fits,check complex,13
4883,323970,60065,corr,corr corr,cutoff,add edge,add edges,13
5035,324172,115061,hand cut,cut point,cut,age understand,coefficient lower,13
6764,326316,28740,tpr,fpr,assessing probabilistic,specificity,sensitivity,13


Unnamed: 0,post_id,user_id,TopWord1,TopWord2,TopWord3,TopWord4,TopWord5,Community
0,317697,134975,variables statistically,mixed effect,mixed,variables,statistically significant,1
1,317699,45374,accuracy,validation accuracy,training accuracy,shallow,validation,3
2,317701,171235,freedom,mean,degrees freedom,degrees,data degree,0
3,317703,187797,patients,treating,excel,treat,bias treating,4
4,317704,61496,uncertainty,error ways,estimate realistic,forecast method,predicts time,8
5,317705,134369,bonferroni correction,bonferroni,correction,outliers,don understand,4
6,317707,187792,environmental variables,traits,environmental,plant traits,plant,1
7,317708,82816,solve differential,equation distribution,differential equation,need solve,distribution pdf,0
8,317710,61092,monotonically increasing,monotonically,differentiable,increasing,assuming differentiable,0
9,317711,179343,affect specified,shock affect,understand shock,prof,shock,4


In [23]:
snap.SaveEdgeList(post_graph, EDGELIST_PATH)

In [60]:
# word frequency in communities
word_counts = collections.defaultdict(dict)
word_freqs = collections.defaultdict(dict)
for comm in communities:
    total_words = 0.0
    word_counts[comm] = collections.defaultdict(int)
    for word in post_df_w_comm[post_df_w_comm["Community"] == comm]['TopWord1']:
        word_counts[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm[post_df_w_comm["Community"] == comm]['TopWord2']:
        word_counts[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm[post_df_w_comm["Community"] == comm]['TopWord3']:
        word_counts[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm[post_df_w_comm["Community"] == comm]['TopWord4']:
        word_counts[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm[post_df_w_comm["Community"] == comm]['TopWord5']:
        word_counts[comm][word] += 1
        total_words += 1
    for word in word_counts[comm]:
        word_freqs[comm][word] = word_counts[comm][word] / total_words

In [75]:
sorted_word_freqs = collections.defaultdict(dict)
for comm in word_freqs:
    sorted_word_freqs[comm] = sorted(word_freqs[comm].items(), key=lambda x: x[1], reverse=True)

community_labels = collections.defaultdict(str) 

for ind, comm in enumerate(sorted_word_freqs):
    community_labels[comm] = "{}|{}|{}".format(sorted_word_freqs[comm][0][0], sorted_word_freqs[comm][1][0], sorted_word_freqs[comm][2][0])

for comm in communities:
    print "Community: {:3}  Size: {:5}  Label: {}".format(comm, len(post_df[post_df_w_comm["Community"] == comm]), community_labels[comm])

Community:   0  Size:  4030  Label: distribution|sample|probability
Community:   1  Size:  2303  Label: matrix|model|variables
Community:   2  Size:    24  Label: pattern|sklearn|truck
Community:   3  Size:  4711  Label: test|training|validation
Community:   4  Size:  5343  Label: series|time|group
Community:   5  Size:    97  Label: outliers|bias|percentile
Community:   6  Size:     4  Label: et|al|et al
Community:   7  Size:     5  Label: nest|materials|common materials
Community:   8  Size:   159  Label: size|power|uncertainty
Community:   9  Size:   127  Label: learning|rate|minutes
Community:  10  Size:     2  Label: rmsep|lowest rmsep|command validation
Community:  11  Size:     4  Label: cars|amounts cars|meet conditions
Community:  12  Size:     9  Label: count data|count|data time
Community:  13  Size:    36  Label: sensitivity|specificity|cut
Community:  14  Size:     4  Label: hypertension|calcium|assay
Community:  15  Size:     9  Label: concentration|flow|water
Community: 

In [76]:
post_df_w_comm.to_csv(COMMUNITIES_VIZ_PATH, encoding='utf-8', index=False, columns = ['post_id', 'Community'])
post_df_w_comm

Unnamed: 0,post_id,user_id,TopWord1,TopWord2,TopWord3,TopWord4,TopWord5,Community
0,317697,134975,variables statistically,mixed effect,mixed,variables,statistically significant,1
1,317699,45374,accuracy,validation accuracy,training accuracy,shallow,validation,3
2,317701,171235,freedom,mean,degrees freedom,degrees,data degree,0
3,317703,187797,patients,treating,excel,treat,bias treating,4
4,317704,61496,uncertainty,error ways,estimate realistic,forecast method,predicts time,8
5,317705,134369,bonferroni correction,bonferroni,correction,outliers,don understand,4
6,317707,187792,environmental variables,traits,environmental,plant traits,plant,1
7,317708,82816,solve differential,equation distribution,differential equation,need solve,distribution pdf,0
8,317710,61092,monotonically increasing,monotonically,differentiable,increasing,assuming differentiable,0
9,317711,179343,affect specified,shock affect,understand shock,prof,shock,4


In [89]:
# spectral clustering
pickle_f = open(SPEC_COMMUNITIES_PICKLE, 'rb')
community_dict2 = pickle.load(pickle_f)

community_dict2

{0: 1,
 1: 1,
 2: 74,
 3: 78,
 4: 106,
 5: 1,
 6: 99,
 7: 1,
 8: 78,
 9: 1,
 10: 111,
 11: 83,
 12: 17,
 13: 61,
 14: 133,
 15: 1,
 16: 93,
 17: 118,
 18: 1,
 19: 1,
 20: 1,
 21: 1,
 22: 56,
 23: 118,
 24: 118,
 25: 102,
 26: 1,
 27: 1,
 28: 118,
 29: 118,
 30: 126,
 31: 1,
 32: 1,
 33: 50,
 34: 1,
 35: 1,
 36: 1,
 37: 1,
 38: 1,
 39: 1,
 40: 66,
 41: 1,
 42: 1,
 43: 66,
 44: 1,
 45: 1,
 46: 16,
 47: 147,
 48: 122,
 49: 143,
 50: 58,
 51: 1,
 52: 1,
 53: 119,
 54: 1,
 55: 1,
 56: 1,
 57: 130,
 58: 1,
 59: 1,
 60: 1,
 61: 1,
 62: 1,
 63: 1,
 64: 96,
 65: 129,
 66: 1,
 67: 1,
 68: 1,
 69: 1,
 70: 1,
 71: 61,
 72: 77,
 73: 118,
 74: 117,
 75: 1,
 76: 145,
 77: 1,
 78: 135,
 79: 1,
 80: 72,
 81: 118,
 82: 1,
 83: 68,
 84: 1,
 85: 1,
 86: 1,
 87: 54,
 88: 58,
 89: 1,
 90: 1,
 91: 1,
 92: 1,
 93: 137,
 94: 1,
 95: 111,
 96: 66,
 97: 1,
 98: 121,
 99: 1,
 100: 1,
 101: 106,
 102: 1,
 103: 1,
 104: 1,
 105: 101,
 106: 1,
 107: 109,
 108: 1,
 109: 118,
 110: 69,
 111: 137,
 112: 130,
 113: 1,
 

In [91]:
def remap_postids(G):
    """
    Returns a Graph containing all the remapped post_ids so that they go from 0 to n. 
    Also returns the dictionary that maps the ids to their new value.
    """
    postid_map = dict()
    new_G = snap.TUNGraph.New()
    index = 0
    
    # Remap all nodes. Only keep ones with degree > 0.
    for N in G.Nodes():
        if (N.GetDeg() < 1): continue 
        postid_map[N.GetId()] = index
        new_G.AddNode(index)
        index += 1
                
    # Remap all edges.
    for E in G.Edges(): # Edge traversal
        new_G.AddEdge(postid_map[E.GetSrcNId()], postid_map[E.GetDstNId()])
        
    return new_G, postid_map

In [97]:
_, postid_dict2 = remap_postids(post_graph)

post_df2 = load_top_ngram_df(POST_TOP_NGRAM_PATH)
post_df_w_comm2 = add_communities_post_df(post_df, community_dict2, postid_dict2)

In [98]:
post_df_w_comm2

Unnamed: 0,post_id,user_id,TopWord1,TopWord2,TopWord3,TopWord4,TopWord5,Community
0,317697,134975,variables statistically,mixed effect,mixed,variables,statistically significant,42
1,317699,45374,accuracy,validation accuracy,training accuracy,shallow,validation,145
2,317701,171235,freedom,mean,degrees freedom,degrees,data degree,140
3,317703,187797,patients,treating,excel,treat,bias treating,1
4,317704,61496,uncertainty,error ways,estimate realistic,forecast method,predicts time,1
5,317705,134369,bonferroni correction,bonferroni,correction,outliers,don understand,120
6,317707,187792,environmental variables,traits,environmental,plant traits,plant,92
7,317708,82816,solve differential,equation distribution,differential equation,need solve,distribution pdf,1
8,317710,61092,monotonically increasing,monotonically,differentiable,increasing,assuming differentiable,1
9,317711,179343,affect specified,shock affect,understand shock,prof,shock,1


In [99]:
communities2 = set(post_df_w_comm2["Community"])
for comm in communities2:
    print "Community:", comm, "Size:", len(post_df[post_df_w_comm2["Community"] == comm])

Community: 0 Size: 87
Community: 1 Size: 8343
Community: 2 Size: 2
Community: 3 Size: 2
Community: 4 Size: 2
Community: 5 Size: 2
Community: 6 Size: 2
Community: 7 Size: 2
Community: 8 Size: 2
Community: 9 Size: 2
Community: 10 Size: 2
Community: 11 Size: 2
Community: 12 Size: 2
Community: 13 Size: 2
Community: 14 Size: 2
Community: 15 Size: 3
Community: 16 Size: 169
Community: 17 Size: 200
Community: 18 Size: 8
Community: 19 Size: 7
Community: 20 Size: 2
Community: 21 Size: 2
Community: 22 Size: 2
Community: 23 Size: 4
Community: 24 Size: 2
Community: 25 Size: 2
Community: 26 Size: 5
Community: 27 Size: 2
Community: 28 Size: 2
Community: 29 Size: 2
Community: 30 Size: 29
Community: 31 Size: 2
Community: 32 Size: 4
Community: 33 Size: 5
Community: 34 Size: 2
Community: 35 Size: 3
Community: 36 Size: 2
Community: 37 Size: 2
Community: 38 Size: 6
Community: 39 Size: 2
Community: 40 Size: 2
Community: 41 Size: 2
Community: 42 Size: 109
Community: 43 Size: 65
Community: 44 Size: 2
Communit

In [100]:
# word frequency in communities2
word_counts2 = collections.defaultdict(dict)
word_freqs2 = collections.defaultdict(dict)
for comm in communities2:
    total_words = 0.0
    word_counts2[comm] = collections.defaultdict(int)
    for word in post_df_w_comm2[post_df_w_comm2["Community"] == comm]['TopWord1']:
        word_counts2[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm2[post_df_w_comm2["Community"] == comm]['TopWord2']:
        word_counts2[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm2[post_df_w_comm2["Community"] == comm]['TopWord3']:
        word_counts2[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm2[post_df_w_comm2["Community"] == comm]['TopWord4']:
        word_counts2[comm][word] += 1
        total_words += 1
    for word in post_df_w_comm2[post_df_w_comm2["Community"] == comm]['TopWord5']:
        word_counts2[comm][word] += 1
        total_words += 1
    for word in word_counts2[comm]:
        word_freqs2[comm][word] = word_counts2[comm][word] / total_words

In [105]:
sorted_word_freqs2 = collections.defaultdict(dict)
for comm in word_freqs2:
    sorted_word_freqs2[comm] = sorted(word_freqs2[comm].items(), key=lambda x: x[1], reverse=True)

community_labels2 = collections.defaultdict(str) 

for ind, comm in enumerate(sorted_word_freqs2):
    community_labels2[comm] = "{}|{}|{}".format(sorted_word_freqs2[comm][0][0], sorted_word_freqs2[comm][1][0], sorted_word_freqs2[comm][2][0])

for comm in communities2:
    print "Community: {:3}  Size: {:5}  Label: {}".format(comm, len(post_df[post_df_w_comm2["Community"] == comm]), community_labels2[comm])

Community:   0  Size:    87  Label: likelihood|log likelihood|likelihood function
Community:   1  Size:  8343  Label: feature|dataset|value
Community:   2  Size:     2  Label: stuck|suppose hypotheses|able stuck
Community:   3  Size:     2  Label: structure coefficients|corville|additionally sure
Community:   4  Size:     2  Label: geo|thrown getting|coin thrown
Community:   5  Size:     2  Label: advice|functions image|variance group
Community:   6  Size:     2  Label: convolution quite|quite nicely|nicely different
Community:   7  Size:     2  Label: nrow|partykit|matrix nrow
Community:   8  Size:     2  Label: result general|general finally|methods ok
Community:   9  Size:     2  Label: covariance variable|equivalent variance|equals variance
Community:  10  Size:     2  Label: donors|lapsed|frequency donations
Community:  11  Size:     2  Label: distributed random|having exponential|convergeges
Community:  12  Size:     2  Label: grain|mineral|assume grain
Community:  13  Size:     

In [104]:
post_df_w_comm2.to_csv(COMMUNITIES_VIZ_PATH2, encoding='utf-8', index=False, columns = ['post_id', 'Community'])
post_df_w_comm2

Unnamed: 0,post_id,user_id,TopWord1,TopWord2,TopWord3,TopWord4,TopWord5,Community
0,317697,134975,variables statistically,mixed effect,mixed,variables,statistically significant,42
1,317699,45374,accuracy,validation accuracy,training accuracy,shallow,validation,145
2,317701,171235,freedom,mean,degrees freedom,degrees,data degree,140
3,317703,187797,patients,treating,excel,treat,bias treating,1
4,317704,61496,uncertainty,error ways,estimate realistic,forecast method,predicts time,1
5,317705,134369,bonferroni correction,bonferroni,correction,outliers,don understand,120
6,317707,187792,environmental variables,traits,environmental,plant traits,plant,92
7,317708,82816,solve differential,equation distribution,differential equation,need solve,distribution pdf,1
8,317710,61092,monotonically increasing,monotonically,differentiable,increasing,assuming differentiable,1
9,317711,179343,affect specified,shock affect,understand shock,prof,shock,1
