# Setup

In [None]:
import networkx as nx
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

import numpy as np
import csv
import os
import re
import sqlite3
import random
from gensim import corpora, models
import pylab

In [None]:
# Constants

# corpus selectors (assign one of these to `data` further down)
BBC = "BBC"
REDDIT = "REDDIT"

# sizing knobs
NUMTOPICS = 30       # number of topics to extract
NUMDOCS = 60         # number of subreddits OR number of BBC documents per category
COMMENTLIMIT = 1000  # for reddit only; number of comments to represent one subreddit

# input locations
BBC_PATH = "/texts/bbc-fulltext"
REDDIT_PATH = "/texts/reddit-comments.sqlite"
TOP_SUBREDDITS = '/misc/top_100_subreddits'

# Extra stop words, needed on top of the standard English list to fully
# denoise the data: URL fragments, reddit-specific terms, common filler
# words (some in their stemmed form), single digits and single letters.
# Order and duplicates are kept exactly as they were originally listed.
ADDNL_STOP_WORDS = (
    [
        u's', u'www', u'year', u'com', u'http', u'https', u'use', u'make',
        u'know', u'say',
        u'even', u'go', u'think', u'', u't', u're', u'said', u'will', u'like',
        u'just', u'also', u'can', u'get', u'don', u'delete',
        u'really', u'good', u'know', u'think', u'one', u'even', u'need',
        u'way',
        u'want', u'people', u'thing', u'look', u'work', u'time',
        u'see', u'reddit', u'using', u'wants', u'comment',
        u'please', u'looks', u'looking', u'message',
        u'gt', u'messag', u'pleas', u'way', u'someth', u've', u'remov',
        u'well', u'take',
        u'now', u'post', u'still', u'try', u'tri', u'right', u'd', u'much',
        u'person',
        u'submit', u'submission', u'subreddit', u'doesn', u'isn', u'sure',
        u'didn', u'll',
        u'got', u'u',
    ]
    + list(u'1234567890')                   # single digits
    + [u'back', u'come', u'v']
    + list(u'abcdefghijklmnopqrstuvwxyz')   # single letters
)

def toVector(l, num_topics=None):
    ''' inserts 0s for unrepresented topics in a document

        input:
            l: (topic_id, probability) pairs with 0-based topic ids, as
               returned by ldamodel.get_document_topics(corpus[i]); the
               pairs may come in any order
            num_topics: length of the returned list; defaults to NUMTOPICS
        output: list of length num_topics where position k holds the
            probability of topic k, or 0 if topic k is not in l'''
    if num_topics is None:
        # resolved at call time so the module-level constant stays the default
        num_topics = NUMTOPICS

    # topic ids start at 0, so position k must be looked up as id k
    # (comparing against k + 1 drops topic 0 and shifts every other topic)
    weights = dict(l)
    return [weights.get(topic, 0) for topic in range(num_topics)]

# Data pre-processing

#### Define which dataset you're using

In [None]:
# Select the corpus to analyse: set this to BBC or REDDIT (the selector
# constants defined with the other constants). The cells below branch on it.
data = BBC

#### Read documents

In [None]:
if data == BBC:

    # each sub-directory of BBC_PATH is one category; ignore system files
    dirs = [f for f in os.listdir(BBC_PATH) if not f.startswith('.')]

    # keep track of document labels (one per category)
    doc_names = [str(d) for d in dirs]

    # how many documents should we get from each category?
    if NUMDOCS == "ALL":
        maxDocs = float("inf")
    else:
        maxDocs = NUMDOCS

    # extract the documents category by category, recording how many were
    # read from each category for labelling later
    docs = []
    doc_id_counts = [0 for x in range(len(doc_names))]
    for currentDocID, d in enumerate(dirs):
        category_dir = os.path.join(BBC_PATH, d)
        for i, f in enumerate(os.listdir(category_dir)):
            if i >= maxDocs:
                break
            # close each file as soon as it has been read
            with open(os.path.join(category_dir, f), "r") as handle:
                docs += [handle.read()]
            doc_id_counts[currentDocID] += 1
    str_docs = docs

elif data == REDDIT:

    # read in the names of the most popular subreddits, one per line
    # (this is the TOP_SUBREDDITS list file, not the sqlite database)
    with open(TOP_SUBREDDITS) as f:
        top_subreddits = f.read().splitlines()

    # how many subreddits should we use?
    if NUMDOCS == "ALL":
        chosen_subreddits = top_subreddits
    else:
        chosen_subreddits = top_subreddits[0:NUMDOCS]

    # pull the data from reddit
    # each subreddit will be read as a 'document' for LDA analysis;
    # the values are bound as query parameters instead of being pasted
    # into the SQL text
    sql_conn = sqlite3.connect(REDDIT_PATH)
    try:
        docs = [pd.read_sql(
                    "SELECT body FROM May2015 WHERE subreddit = ? LIMIT ?",
                    sql_conn, params=(sub, COMMENTLIMIT))
                for sub in chosen_subreddits]
    finally:
        sql_conn.close()

    # turn each dataframe into one long string
    # note that each subreddit is represented by a concatenation of its
    # comments, every comment followed by a single space; the string is
    # wrapped in a one-element list because the tokenising step reads
    # doc[0] for Reddit documents
    str_docs = []
    for frame in docs:
        joined = "".join(body + " " for body in frame["body"])
        str_docs += [[joined]]

#### Stem and remove stop words

In [None]:
# The tokenizer, stemmer and stop-word collection do not depend on the
# document, so build them once. The stop words go into a set so every
# membership test is a hash lookup rather than a scan of the list.
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()
en_stop = set(get_stop_words('en') + ADDNL_STOP_WORDS)

# list for tokenized documents in loop
texts = []

# loop through document list
for doc in str_docs:

    # Reddit documents arrive wrapped in a one-element container
    raw = doc[0] if data == REDDIT else doc

    # decode byte strings first so that lower-casing works on text,
    # dropping any bytes that are not valid UTF-8
    if isinstance(raw, bytes):
        raw = raw.decode("utf8", errors="ignore")
    raw = raw.lower()

    # clean and tokenize document string
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]

    # ensure we didn't miss any stop words, after stemming
    restopped_tokens = [tok for tok in stemmed_tokens if tok not in en_stop]

    # add tokens to list
    texts.append(restopped_tokens)


# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

#### Generate LDA model across texts

In [None]:
# Train LDA over the bag-of-words corpus. The class is reached through
# `models`, which the setup cell imports with `from gensim import corpora,
# models`; that import does not bind the bare name `gensim`.
ldamodel = models.ldamodel.LdaModel(corpus,
                                    num_topics=NUMTOPICS,
                                    id2word=dictionary, passes=25)

#### Print a list of "pretty" topics

In [None]:
# you can get topic proportions with ldamodel.print_topics()

# matches the double-quoted, lower-case words in a topic's string form
quoted_word = re.compile('\"[a-z]+\"')

for i in range(ldamodel.num_topics):
    # ask for 20 terms of topic i and keep only the quoted words
    filtered = quoted_word.findall(ldamodel.print_topic(i, 20))
    # a single-argument print call behaves the same as a statement
    # (Python 2) and as a function (Python 3)
    print(", ".join(filtered) + "\n\n")

#### Calculate similarity between all document topic distributions

In [None]:
# `from gensim import corpora, models` does not bind the name `gensim`,
# so import the distance helpers explicitly for this cell
from gensim import matutils

# include a small value to add to all distance measurements
# such that we never get 0 distance
EPS = 0.001

# one row/column per document in the corpus
numDocs = len(corpus)

# infer every document's dense topic vector once, instead of once per pair
doc_vectors = [toVector(ldamodel.get_document_topics(bow)) for bow in corpus]

# note: this can take a long time on large corpora
# only the diagonal and the upper triangle (j >= i) are filled in
similarity = [[0 for x in range(numDocs)] for y in range(numDocs)]

for i in range(numDocs):
    for j in range(i, numDocs):
        sim = 1.0 - matutils.hellinger(doc_vectors[i], doc_vectors[j])
        similarity[i][j] = min(sim + EPS, 1.0)


## Graphing

#### Color and label everything based on its source for BBC data or based on its topic for Reddit data

In [None]:
if data == BBC:
    # one colour per category (document source)
    NUMCOLORS = len(doc_names)

    # these are default placeholder colors; they can be restyled later
    # when the exported graph is opened in a graph viewer
    raw_cols = ["#ff0d05", "#ff9a02", "#00ff00", "#00e1ff", "#ff00ff"]

    # label every document with the category it was read from: documents
    # were appended category by category, doc_id_counts[k] of them for
    # category k, so each name is handed out exactly that many times
    labels = {}
    doc = 0
    for name, count in zip(doc_names, doc_id_counts):
        for _ in range(count):
            labels[doc] = name
            doc = doc + 1

    # map each category name to a colour; wrap around the palette rather
    # than fail when there are more categories than colours
    colors = {}
    for i, name in enumerate(doc_names):
        colors[name] = raw_cols[i % len(raw_cols)]

if data == REDDIT:

    # generate a random colour for each topic
    NUMCOLORS = NUMTOPICS
    raw_cols = ["#%06x" % random.randint(0xa982ff, 0xFFFFFF)
                for i in range(NUMCOLORS)]

    # get node data
    colors = {}
    labels = {}
    doctopics = {}
    for i in range(len(str_docs)):
        labels[i] = top_subreddits[i]

        # colour the node by its most probable topic, not by whichever
        # topic happens to be listed first
        topic_weights = ldamodel.get_document_topics(corpus[i])
        if topic_weights:
            best = max(topic_weights, key=lambda pair: pair[1])[0]
        else:
            # no topic reported for this document; fall back to topic 0
            best = 0

        # plain int so the value can be stored as a node attribute
        doctopic = int(best) % len(raw_cols)
        colors[i] = raw_cols[doctopic]
        doctopics[i] = doctopic

#### Create networkx graph

In [None]:
G = nx.Graph()
num_nodes = len(str_docs)

# one node per document; attributes are passed as keyword arguments,
# which add_node accepts in both networkx 1.x and 2.x
if data == BBC:
    for i in range(num_nodes):
        G.add_node(i, label=labels[i], color=colors[labels[i]])
elif data == REDDIT:
    for i in range(num_nodes):
        G.add_node(i, label=labels[i], color=colors[i], topic=doctopics[i])

# connect every pair of documents (i < j) with their similarity as weight
if data in (BBC, REDDIT):
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            # test this pair's value (not the whole matrix) and store it as
            # a plain float so the graph writer can serialise it
            weight = float(similarity[i][j])
            if weight > 0:
                G.add_edge(i, j, weight=weight)

#### Save to gexf file

In [None]:
# NUMDOCS may be the string "ALL", so format it with %s rather than %d
# (for an integer the resulting file name is unchanged)
filename = "%s_%s-docs_%d-topics" % (data, NUMDOCS, NUMTOPICS)

# close the output file as soon as the graph has been written
with open(filename, "wb") as out:
    nx.write_gexf(G, out)

## Finally, the remainder of the graph creation process happens in Gephi, using the GEXF file saved above!