In [5]:
#%autoreload 2

from VMTest import *

In [6]:
import json
from networkx.readwrite import json_graph

def removeDefaults(df):
    """Return a copy of ``df`` without the rows whose ``subreddit`` is a default.

    The default names are read from the ``defaults`` column of
    ``loadSQL('defaults', 'reference')``.
    """
    default_names = list(loadSQL('defaults','reference')['defaults'])
    is_default = df['subreddit'].isin(default_names)

    return df[~is_default].copy()

def dropSingleEdges(G):
    """Remove every edge whose ``weight`` attribute equals 1 from ``G``.

    The graph is modified in place and also returned.
    """
    weight_one = []
    for edge in G.edges(data=True):
        # Each entry is (u, v, attribute-dict); the weight lives in the dict.
        attrs = edge[2]
        if attrs['weight'] == 1:
            weight_one.append(edge)

    G.remove_edges_from(weight_one)

    return G

def removeIsolates(G):
    """Remove the nodes reported by ``nx.isolates`` from ``G``.

    The graph is modified in place and also returned.
    """
    # Materialise the node list before mutating the graph.
    lonely_nodes = [node for node in nx.isolates(G)]
    G.remove_nodes_from(lonely_nodes)

    return G
    
def setAttributes(G):
    """Tag every node of ``G`` with ``group`` = 1 and ``id`` = its own name.

    The graph is modified in place and also returned.

    The attributes are written directly instead of through
    ``nx.set_node_attributes(G, 'group', 1)``: with the ``(G, values, name)``
    argument order of networkx 2.x that call stores the string ``'group'``
    under the attribute key ``1`` rather than setting ``group`` to 1.
    """
    for n in G:
        node_attrs = G.nodes[n]
        node_attrs['group'] = 1
        node_attrs['id'] = n

    return G

def subsetNet(G):
    """Run the graph clean-up pipeline on ``G`` and return the result.

    The stages run in order: ``dropSingleEdges``, ``removeIsolates``,
    ``setAttributes``. Each stage receives the value returned by the previous one.
    """
    for stage in (dropSingleEdges, removeIsolates, setAttributes):
        G = stage(G)

    return G
    
def saveGraphJson(G, filename):
    """Serialise ``G`` to ``filename`` as node-link JSON.

    ``G`` is converted with ``json_graph.node_link_data`` and written with
    ``json.dump``. The file is opened in a ``with`` block so the handle is
    flushed and closed even if serialisation raises.
    """
    d = json_graph.node_link_data(G)
    with open(filename, 'w') as fh:
        json.dump(d, fh)

In [7]:
def projectSub(df):
    """Project the author/subreddit rows of ``df`` onto subreddits.

    Builds a sparse subreddit-by-author incidence matrix holding a 1.0 for
    every row of ``df``, multiplies it by its transpose, and hands the unique
    subreddit names plus that product to ``add_edges_fast``, whose result is
    returned.

    NumPy is called directly because ``sp.unique`` / ``sp.ones`` are SciPy's
    deprecated top-level aliases of the same NumPy functions. The matrix shape
    is passed explicitly so an empty ``df`` yields an empty matrix instead of
    failing shape inference.
    """
    import numpy as np  # function-scope import keeps the cell self-contained

    top = df.subreddit.values
    bot = df.author.values

    # return_inverse gives, for each row, the index of its label in the
    # sorted unique array; these serve as matrix coordinates.
    top_unique, top_indices = np.unique(top, return_inverse=True)
    bot_unique, bot_indices = np.unique(bot, return_inverse=True)

    data = np.ones(len(top))
    incidence = sp.sparse.coo_matrix(
        (data, (top_indices, bot_indices)),
        shape=(len(top_unique), len(bot_unique)),
    )
    adj = incidence.dot(incidence.T)

    G = add_edges_fast(top_unique, adj)

    return G

In [9]:
database_name = 'allEdges'
engine = get_engine(database_name)

subreddit = 'changemyview'
char = 'a'
# The table name and subreddit are substituted through named placeholders so
# that changing `char` or `subreddit` above actually changes the query (the
# previous text hard-coded both and the .format() call had nothing to fill).
# The values are interpolated verbatim: only use trusted constants here.
query = """
        SELECT author, subreddit, weight
        FROM '{table}'
        WHERE subreddit in
                  (SELECT subreddit
                  FROM '{table}'
                  WHERE author in
                            (SELECT author
                            FROM '{table}'
                            WHERE (subreddit == '{subreddit}')))
        """.format(table=char, subreddit=subreddit)

data = pd.read_sql_query(query, engine)

In [10]:
# Preview the first rows returned by the query.
data.head()

Unnamed: 0,author,subreddit,weight
0,andracute2,suggestmeabook,31
1,alegxab,MapPorn,47
2,Akoustyk,audioengineering,73
3,ARJeff,Brawlstars,63
4,AmazingSource,fatlogic,59


In [12]:
# Build the graph from the loaded rows, then drop the 'changemyview' node itself.
G = projectSub(data)
G.remove_node('changemyview')

In [38]:
# Export the rows with weight >= 3, leaving out the AutoModerator and
# [deleted] accounts and the changemyview subreddit, as a CSV edge list.
#
# The filters below read 'source'/'target', but the query returns
# 'author'/'subreddit' and the cell that relabels `data` sits further down the
# notebook. Renaming here makes the cell work in file order; rename() ignores
# labels that are absent, so it is a no-op when the columns are already renamed.
subset = data.rename(columns={'author': 'source', 'subreddit': 'target'})
subset = subset[subset['weight']>=3]
subset = subset[~subset['source'].isin(['AutoModerator','[deleted]'])]
subset = subset[~subset['target'].isin(['changemyview'])]
subset.to_csv('cmv-network.csv', index=False)

In [39]:
# (rows, columns) of the filtered edge list.
subset.shape

(277904, 3)

In [15]:
# Relabel the three columns positionally (mutates `data` in place), then preview.
data.columns = ['source','target','weight']
data.head()

Unnamed: 0,source,target,weight
0,andracute2,suggestmeabook,31
1,alegxab,MapPorn,47
2,Akoustyk,audioengineering,73
3,ARJeff,Brawlstars,63
4,AmazingSource,fatlogic,59


In [None]:
# NOTE(review): unfinished scratch cell. Its only content was the incomplete
# expression `data.` (a SyntaxError); complete it or delete the cell.

In [115]:
# An earlier cell relabels the columns to 'source'/'target', while
# removeDefaults and the filter below read 'subreddit'/'author'. Restore those
# names first; rename() ignores absent labels, so this is a no-op when the
# columns are already called author/subreddit.
data = data.rename(columns={'source': 'author', 'target': 'subreddit'})

data = removeAuthors(data)
data = removeDefaults(data)
# Authors with at least one row in changemyview.
cmvAuthors = data[data['subreddit']=='changemyview'].author

data.columns = ['author','subreddit','value']

# Split all rows by whether their author appears in cmvAuthors.
cmvNetwork = data[data['author'].isin(cmvAuthors)]
nonCMVNetwork = data[~data['author'].isin(cmvAuthors)]

# Producing Graphs

## Testing on Sample Network

In [175]:
def separateUpperQuartile(df, col):
    """Split ``df`` at the 75th percentile of column ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    col : hashable
        Label of the numeric column whose 75th percentile is the threshold.

    Returns
    -------
    (lq, uq) : tuple of pandas.DataFrame
        ``lq`` holds the rows whose ``col`` value is strictly below the
        threshold, ``uq`` the rows at or above it.

    The rows are taken from ``df`` itself; the previous version filtered a
    module-level variable ``s`` and therefore ignored its argument.
    """
    # Quantile of the one column only, so other columns need not be numeric.
    upper_bound = df[col].quantile(.75)
    lq = df[df[col] < upper_bound]
    uq = df[df[col] >= upper_bound]

    return lq, uq

def removeUpperQuartileNodes(G):
    """Remove from ``G`` the nodes that ``separateUpperQuartile`` puts in the upper group by degree.

    The graph is modified in place; nothing is returned.
    """
    # One row per node, indexed by node, with the degree in column 0.
    degree_table = pd.DataFrame.from_dict(dict(G.degree()), orient='index')
    _, high_degree = separateUpperQuartile(degree_table, 0)

    G.remove_nodes_from(high_degree.index)

In [179]:
def makeNetworkGraph(data, filename):
    """Build the pruned subreddit graph for ``data`` and save it as JSON.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to project with ``projectSub``.
    filename : str
        Output name without extension; the graph is written to
        ``'<filename>.json'``.

    The graph is built from the ``data`` argument. The previous version always
    projected the global ``cmvNetwork``, so every call produced the same graph
    whatever was passed in.
    """
    G = projectSub(data)

    # remove_nodes_from skips names that are not in the graph, whereas
    # remove_node raises; a given frame need not contain these subreddits.
    G.remove_nodes_from(['changemyview'])
    removeUpperQuartileNodes(G)

    defaults = list(loadSQL('defaults','reference')['defaults'])
    G.remove_nodes_from(defaults) # too connected
    # May already be gone after the two pruning steps above.
    G.remove_nodes_from(['politics']) # too connected
    subset = subsetNet(G)

    saveGraphJson(subset, '{}.json'.format(filename))
    
# NOTE(review): `nonCMV` is not assigned in any cell visible here; the frame
# built earlier is named `nonCMVNetwork` — confirm which name is intended.
makeNetworkGraph(nonCMV, 'noncmv')

In [130]:
# Convert the subsetCMV graph into an edge-list DataFrame.
# NOTE(review): subsetCMV is not assigned in the visible cells — confirm its origin.
df = nx.to_pandas_edgelist(subsetCMV)

In [131]:
# Sort ascending by weight and show the last rows, i.e. the heaviest edges.
df.sort_values('weight').tail()

Unnamed: 0,source,target,weight
22447,WTF,politics,29.0
24435,bestof,politics,36.0
1877,AdviceAnimals,politics,40.0
18055,PoliticalHumor,politics,45.0
30563,politics,technology,46.0


In [157]:
# Degree of every subsetCMV node as a DataFrame indexed by node.
s = pd.DataFrame.from_dict(dict(subsetCMV.degree()), orient='index')