<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Data-collection" data-toc-modified-id="Data-collection-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data collection</a></span></li><li><span><a href="#Topic-modelling" data-toc-modified-id="Topic-modelling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Topic modelling</a></span></li></ul></div>

In [5]:
import scipy as sp
import spacy
import pandas as pd
import pickle
import gzip
from tqdm import tqdm
from google.cloud import bigquery
from logging import getLogger
from concurrent.futures import TimeoutError
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
% matplotlib inline

PROJECT = 'reddit-network-184710'
CREDENTIALS = 'reddit-network-774059619c28.json'

CACHE = 'cache'

def client():
    return bigquery.Client.from_service_account_json(CREDENTIALS, project=PROJECT)

# Data collection

In [10]:
def get_comments():
    query = """select author, author_flair_text, created_utc, link_id, parent_id, score, id, distinguished
                   from `fh-bigquery.reddit_comments.2017_06`
                   where (subreddit = 'changemyview')
                   """

    config = bigquery.QueryJobConfig()

    config.use_legacy_sql = False
    max_bytes=7e9
    config.maximum_bytes_billed = int(max_bytes)

    print('Submitting query')
    j = client().query(query=query, job_config=config)
    with tqdm() as pbar:
        while True:
            try:
                j.result(timeout=1)
            except TimeoutError:                
                pbar.update(1)
            else:
                break

    return j

def dump_comments(j):
    print('Unpacking results')

    total = j.query_results().total_rows

    iterator = j.result()
    rows = []
    for row in tqdm(iterator, total=total):
        rows.append(row.values())

    columns = [c.name for c in iterator.schema]
    return pd.DataFrame(rows, None, columns)

j = get_comments()
df = dump_comments(j)
df.to_pickle('cmv_17_06_comments.pkl')

Submitting query


6it [00:08,  1.37s/it]

<class 'google.cloud.bigquery.job.QueryJob'>





In [34]:
def get_posts():
    query = """SELECT created_utc, author, num_comments, score, title, selftext, id, author_flair_text, distinguished
                FROM `fh-bigquery.reddit_posts.2017_06`
                where (subreddit = 'changemyview')"""

    config = bigquery.QueryJobConfig()

    config.use_legacy_sql = False
    max_bytes=5e9
    config.maximum_bytes_billed = int(max_bytes)

    print('Submitting query')
    j = client().query(query=query, job_config=config)
    with tqdm() as pbar:
        while True:
            try:
                j.result(timeout=1)
            except TimeoutError:                
                pbar.update(1)
            else:
                break

    return(j)

def dump_posts(j)
    print('Unpacking results')

    total = j.query_results().total_rows

    iterator = j.result()
    rows = []
    for row in tqdm(iterator, total=total):
        rows.append(row.values())

    columns = [c.name for c in iterator.schema]
    return pd.DataFrame(rows, None, columns)

j = get_posts()
posts_df = dump_posts(j)
posts_df.to_pickle('cmv_2017_06_posts.pkl')

Submitting query


0it [00:00, ?it/s]

<class 'google.cloud.bigquery.job.QueryJob'>





In [58]:
edgelist = df[['author','link_id']]
edgelist = edgelist[~edgelist['author'].isin(['[deleted]', 'DeltaBot'])]

Unnamed: 0,author,link_id
0,KeepingMyJob310,t3_5idfoh
1,[deleted],t3_5lhhgj
2,IAmAN00bie,t3_5lhhgj
3,MrJunior12,t3_5lhhgj
4,an_iridescent_ham,t3_5lv89l


In [90]:
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import numpy as np

B = nx.Graph()
B.add_nodes_from(set(edgelist['author']), bipartite=0) 
B.add_nodes_from(set(edgelist['link_id']), bipartite=1)
B.add_edges_from(list(zip(edgelist['author'],edgelist['link_id'])))

author_net = bipartite.weighted_projected_graph(B, set(edgelist['author']))
author_net.remove_nodes_from(list(nx.isolates(author_net)))

post_net = bipartite.weighted_projected_graph(B, set(edgelist['link_id']))
post_net.remove_nodes_from(list(nx.isolates(post_net)))

print('author net has {} connected components'.format(nx.number_connected_components(author_net)))
print('post net has {} connected components'.format(nx.number_connected_components(post_net)))

In [81]:
from networkx.algorithms import community

In [96]:
cc = list(nx.connected_components(author_net))

giant = max(nx.connected_component_subgraphs(author_net), key=len)

In [105]:
len(giant)

8435

In [107]:
a = nx.induced_subgraph(author_net, giant)

In [108]:
len(a.nodes())

8435

In [115]:
cn = nx.core_number(a)

In [127]:
degrees = dict(a.degree)

# Topic modelling

In [None]:
x = edgelist['author'].value_counts()

In [141]:
s = pd.Series(degrees, index=degrees.keys())
s = pd.DataFrame(s, columns = ['degree'])

In [142]:
s.head()

Unnamed: 0,degree
LordBryne,324
sdspg,34
JmmiP,124
octopuscat77,16
yaar_,19


In [158]:
vc = edgelist['author'].value_counts()
vc = pd.DataFrame(vc)
vc.shape

(8471, 1)

In [161]:
pd.merge(s, vc, how='outer', right_index=True)

MergeError: Must pass left_on or left_index=True