# Sparking Curiosity

The goal of this notebook is to clean your text and construct a narrative flow of important words, stripped of articles and filler words. The notebook then builds a network from those words and animates the narrative flow on that network. 

In [None]:
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer as wnl
import nltk, string, glob
import gensim
import itertools
import re
import csv
import scipy
import warnings
import numpy as np
warnings.simplefilter(action='ignore', category=FutureWarning)
#
# Path to the pre-trained word-embedding file (hard-coded, machine-specific).
# NOTE: the name `model` is rebound to the loaded vectors inside the cleaning loop further down.
model = "/Users/dalezhou/Box/2019-02-neuroDepartments/code/bioASQvectors/bioASQmodel.txt"
# Load the embeddings as gensim KeyedVectors from a non-binary (text) word2vec file;
# unicode_errors='ignore' is passed so undecodable bytes do not abort the load.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(model, binary=False, unicode_errors='ignore')

#################################################
# Initialize, config & define helpful functions #
#################################################

# Translation table for str.translate that deletes every character in
# string.punctuation except '-' (the dash is removed from the deletion set).
translator = str.maketrans('', '', string.punctuation.replace('-', '')) #filters punctuation except dash
# Switch compared against 1 in the cleaning loop's lemmatization step.
lemmatizeCondition = 1
# Shared WordNetLemmatizer instance used for every word.
lemmatizer = wnl()
# NLTK's English stop-word list; presumably requires the 'stopwords' corpus to be downloaded - verify on a fresh machine.
stop_words = nltk.corpus.stopwords.words('english')

# Function for finding index of words of interest, like 'references'

def find(target, sentences=None):
    """Yield the index of every sentence that contains ``target``.

    Args:
        target: Item to look for. It is passed to each sentence's ``index``
            method, so a sentence matches when ``index`` does not raise
            ``ValueError``.
        sentences: Optional sequence of sentences (e.g. lists of tokens) to
            search. When omitted, the module-level ``sents`` is searched,
            which keeps existing ``find(target)`` calls working unchanged.

    Yields:
        int: Position of each sentence in which ``target`` was found.
    """
    if sentences is None:
        # Backward-compatible default: fall back to the global token lists.
        sentences = sents
    for i, sentence in enumerate(sentences):
        try:
            # Only the success/failure of the lookup matters, not the position.
            sentence.index(target)
        except ValueError:
            continue
        yield i

# Function for handling the input for gensim word2vec

class FileToSent(object):
    """Re-iterable view of a comma-separated text file, one cleaned row per line.

    Each iteration re-opens ``filename`` and yields, for every line, the list
    of comma-separated fields with all ``string.punctuation`` characters
    removed and surrounding whitespace stripped. Because the file is re-opened
    on every ``__iter__`` call, the object can be iterated more than once.
    """

    def __init__(self, filename):
        # Path of the file to read; it is opened lazily on iteration.
        self.filename = filename

    def __iter__(self):
        # Use a context manager so the handle is closed when iteration
        # finishes (or the generator is discarded) instead of leaking.
        with open(self.filename, 'r') as handle:
            for line in handle:
                fields = line.strip().split(",")
                # Drop punctuation first, then trim whitespace left behind.
                fields = [''.join(c for c in s if c not in string.punctuation) for s in fields]
                yield [field.strip() for field in fields]


###################################################
# Read in .csv file(s) from a specified directory #
###################################################

# Paths of every .csv file in the input directory (hard-coded, machine-specific path);
# each path is processed as one document by the loop below.
IDs = glob.glob('/Users/dalezhou/Downloads/psomTest/*.csv')

####################
# Clean, lemmatize #
####################

# For every input CSV: read the response text, clean/tokenize/lemmatize it,
# keep the words known to the embedding model, and compute the cosine
# similarity of every word pair, sorted from most to least similar.
# The results (my_words, sims, ...) are rebuilt on each iteration.
for ID in IDs: # loop through papers
    print(ID)
    totalWords = []

    # Read the text column. Only the value from the LAST row is kept.
    # NOTE(review): if every row should be processed, the cleaning below
    # needs to move inside this row loop - confirm the intended behaviour.
    has_row = False
    with open(ID, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            text = row['Answer.Text_Detail']
            has_row = True
    if not has_row:
        # Without this guard an empty file would either raise NameError or
        # silently reuse the text of the previously processed file.
        print('No data rows found in ' + ID + '; skipping')
        continue

    ####################
    # Clean, tokenize  #
    ####################
    text = re.sub("\u2013|\u2014", "-", str(text))  # Replace en- and em-dashes with a plain hyphen
    sents = sent_tokenize(text)  # Split into sentences
    sents = [word_tokenize(s) for s in sents]  # Split each sentence into tokens
    sents = [[w.translate(translator) for w in s] for s in sents]  # filter punctuation (dash is kept)
    sents = [[re.sub(r'^[-+]?[0-9]*[\.\-]?[0-9]+$', 'numeric', w) for w in s] for s in sents]  # replace all numerals with the placeholder 'numeric'
    sents = [[w for w in s if re.search('[^a-zA-Z-0-9-]+', w) is None] for s in sents]  # drop tokens containing anything but letters, digits and dashes
    sents = [[w.lower() for w in s] for s in sents]  # make lower case
    sents = [s for s in sents if len(s) > 0]  # remove empty lines
    sents = [[w for w in s if not w in stop_words] for s in sents]  # filter stop words
    # Keep only tokens longer than 3 characters (drops variables and
    # abbreviations); this single filter replaces the former >1, >2, >3 chain.
    sents = [[w for w in s if len(w) > 3] for s in sents]
    sents = [s for s in sents if len(s) > 0]  # remove empty lines

    # Lemmatize only when enabled; otherwise keep the cleaned tokens as they
    # are (previously a disabled flag discarded every word).
    if lemmatizeCondition == 1:
        words = [[lemmatizer.lemmatize(w) for w in s] for s in sents]
    else:
        words = sents
    words = list(itertools.chain.from_iterable(words))  # join list of lists
    totalWords.append(words)

    model = word_vectors

    # Unique words of this document (totalWords is reset for every file).
    my_words = list(set(itertools.chain.from_iterable(totalWords)))

    # filter out words not in model
    my_words = [word for word in my_words if word in model]

    # add man and woman to words, without duplicating a word that is already
    # present (a duplicate would create a self-pair with similarity 1.0)
    for anchor in ('man', 'woman'):
        if anchor not in my_words:
            my_words.append(anchor)

    # The number of connections we want: either as a factor of the number of words or a set number
    num_top_conns = len(my_words) * 50

    # All word-to-word similarities, each as a tuple (word1, word2, similarity).
    # combinations() visits every unordered pair once, in the same order as
    # the nested index loop with i1 < i2.
    sims = []
    for word1, word2 in itertools.combinations(my_words, 2):
        cosine_similarity = model.similarity(word1, word2)
        sims.append((word1, word2, cosine_similarity))

    # Sort by descending similarity (most similar pairs first)
    sims.sort(key=lambda _tuple: _tuple[-1], reverse=True)


## Animate network



In [None]:
library(ggraph)
library(gganimate)
library(igraph)
library(RColorBrewer)

# Data from http://konect.uni-koblenz.de/networks/sociopatterns-infectious
#infect <- read.table('/Users/dalezhou/Downloads/sociopatterns-infectious/out.sociopatterns-infectious', skip = 2, sep = ' ', stringsAsFactors = FALSE)

# Code from https://gist.github.com/thomasp85/eee48b065ff454e390e1
# https://gist.github.com/jalapic/612036977d9f9c773107681bc4a46d58

# Build and render the animated network.
# NOTE(review): createLayout(), gEdges(), aes(frame = ...) and
# gganimate(p, file, ...) look like early development-version APIs of
# ggraph/gganimate - confirm they exist in the installed package versions.

# Edge list input.
# NOTE(review): the first read is overwritten immediately by the second, so
# only networkDynamicsLabels.txt is actually used - confirm which file (or a
# separate variable for the labels) was intended.
infect <- read.table('/home/jovyan/networkDynamics.txt', skip = 0, sep = ' ', stringsAsFactors = FALSE)
infect <- read.table('/home/jovyan/networkDynamicsLabels.txt', skip = 0, sep = ' ', stringsAsFactors = FALSE)
infect$V3 <- NULL # drop the third column
#infect$words <- words$V2
# Assumes exactly three columns remain after dropping V3 - TODO confirm file layout
names(infect) <- c('from', 'to', 'time')
# Bin the time column into 500 intervals and keep the bin index as frame number
infect$timebins <- as.numeric(cut(infect$time, breaks = 500))

# We want that nice fading effect so we need to add extra data for the trailing:
# ten copies of the edges, copy i shifted i frames later and tagged delay = i
infectAnim <- lapply(1:10, function(i) {
  infect$timebins <- infect$timebins + i
  infect$delay <- i
  infect
})
infect$delay <- 0 # the original edges carry no delay
infectAnim <- rbind(infect, do.call(rbind, infectAnim))

# Use FALSE rather than F: F is an ordinary variable that can be reassigned
infectGraph <- graph_from_data_frame(infectAnim, directed = FALSE)

# We use only original data for the layout
subGr <- subgraph.edges(infectGraph, which(E(infectGraph)$delay == 0))
V(subGr)$degree <- degree(subGr)
V(subGr)$group <- cluster_louvain(subGr)$membership # Louvain community per node
lay <- createLayout(subGr, 'igraph', algorithm = 'fr')

# Then we reassign the full graph with edge trails
attr(lay, 'graph') <- infectGraph

# Now we create the graph with timebins as frame
p <- ggraph(data = lay, layout = 'fr', aes(frame = timebins)) + 
  geom_node_point(size = .1, col = "white") +
  geom_node_point(aes(alpha=0.6), size = .1, colour = factor(lay$group), show.legend = FALSE) + 
  # geom_edge_link0(aes(frame = timebins, alpha = delay, width = delay), edge_colour = '#dccf9f') + 
  geom_edge_link0(aes(frame = timebins, alpha = delay, width = delay, colour = factor(node1.group)), data = gEdges(nodePar = 'group'), show.legend = FALSE) +
  # geom_edge_link0(aes(frame = timebins, alpha = delay, width = delay, colour = node1.degree), data = gEdges(nodePar = 'degree'), show.legend = FALSE) +
  # alpha range is reversed (1 -> 0) so edges with a larger delay are more transparent
  scale_edge_alpha(range = c(1, 0), guide = 'none') + 
  scale_edge_width(range = c(0.5, 1.5), trans = 'exp', guide = 'none') + 
  scale_size(guide = 'none') + 
  # pin the axis limits to the layout extent
  expand_limits(x = c(min(lay$x), max(lay$x)), y = c(min(lay$y), max(lay$y))) +
  ggforce::theme_no_axes() + 
  theme(plot.background = element_rect(fill = '#103fe8'), 
        panel.background = element_blank(), 
        panel.border = element_blank(), 
        plot.title = element_text(color = '#cecece'))

# And then we animate
animation::ani.options(interval=0.1)
# gganimate(p, '/Users/dalezhou/Desktop/Dropbox/service/kamenArt/animation_louvainNodes_coloredSparks_500.gif', title_frame = FALSE)
gganim <- gganimate(p, '/home/jovyan/sparkingCuriosity_1600x1600_darkBlue.gif', title_frame = FALSE,
         ani.width = 1600, ani.height = 1600, res=300)

# to do
# add changing text