In [6]:
# Package setup.
# library() is used instead of require(): require() only returns FALSE when a
# package is missing, so the failure would surface later as an obscure
# "could not find function" error. library() stops immediately.
# The attach order is kept as-is because later packages mask earlier ones.
library(rvest)
library(ggplot2)
library(pipeR) # %>>% will be faster than %>%
library(httr)
library(RCurl)
library(dplyr)
library(caTools)
library(igraph)
library(stringdist)
library(rgl)
library(doParallel)
library(foreach)
library(networkD3)
library(plotly)

library(magrittr)
library(extrafont)
# font_import() asks for confirmation ("Continue? [y/n]") before scanning the
# system fonts; without a "y" answer it exits without importing anything.
font_import()



Loading required package: rvest
Loading required package: xml2
Loading required package: ggplot2
Loading required package: pipeR
Loading required package: httr
Loading required package: RCurl
Loading required package: bitops
Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘igraph’

The following object is masked _by_ ‘.GlobalEnv’:

    edges

The following objects are masked from ‘package:dplyr’:

    %>%, as_data_frame, groups, union

The following object is masked from ‘package:rvest’:

    %>%

The following objects are masked from ‘package:stats’:

    decompose, spectrum

The following object is masked from ‘package:base’:

    union

Loading required package: doParallel
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
Loading 

Importing fonts may take a few minutes, depending on the number of fonts and the speed of the system.
Continue? [y/n] 


Exiting.


In [None]:
# Load the user-specific helper functions defined in functions.R
# (this file holds the definitions of the web page format etc.).
source(file = './functions.R')

In [None]:
# S4 class describing how one journal's website is laid out.
# Every slot is a character vector; slots that are not supplied to new()
# default to character(0).
setClass(
  "Journal",
  slots = c(
    # where the issue listings live and how to walk through them
    base = "character",
    extension = "character",
    nextIssue = "character",
    abstract = "character",
    # <meta> based extraction
    metaNodes = "character",
    metaNames = "character",
    metaContent = "character",
    # author extraction from the page markup
    authorNodes = "character",
    authorSplit = "character",
    authorExtractString = "character",
    authorAffiliationIndex = "character",
    # affiliation extraction from the page markup
    affiliationNodes = "character",
    affiliationSplit = "character",
    affiliationExtractString = "character",
    # patterns used to pick individual metadata fields
    authorSearch = "character",
    institutionSearch = "character",
    doiSearch = "character",
    dateSearch = "character",
    emailSearch = "character",
    websiteLayout = "character"
  )
)
#Description of the Journal layout
#base - The base URL of the website
#extension - the extension that is added to the base to get the URL of each of the journal volume abstract list
#metaNodes - This is a node in the XML that is used for all the author names, affiliations etc. Only used for MNRAS, A&A in getData
#metaNames - This is an attribute of the nodes that extracts the labels of each XML data field. Only used for MNRAS, A&A in getData
#metaContent - This is an attribute of the nodes that extracts the content in each XML data field. Only used for MNRAS, A&A in getData
#authorNodes
#authorSplit
#authorExtractString
#affiliationNodes
#affiliationSplit
#affiliationExtractString
#authorSearch
#institutionSearch
#doiSearch
#dateSearch
#emailSearch
#websiteLayout


# Here we define the different types used in the astronomy code

In [None]:
# Layout of the MNRAS website (metadata is read from <meta> tags).
mnras <- new(
  "Journal",
  base = "http://mnras.oxfordjournals.org/",
  extension = 'content/313/1.toc',
  nextIssue = 'Next issue',
  abstract = '*abstract*',
  metaNodes = 'meta',
  metaNames = 'name',
  metaContent = 'content',
  authorSearch = "^citation_author$",
  institutionSearch = "^citation_author_institution$",
  doiSearch = "^citation_doi$",
  dateSearch = "^citation_date$",
  emailSearch = "^citation_author_email$"
)

# Layout of the Astronomy and Astrophysics website.
astast <- new(
  "Journal",
  base = "http://www.aanda.org/",
  extension = 'articles/aa/abs/2001/01/contents/contents.html',
  nextIssue = 'Next issue',
  abstract = '/aa/abs/.*aa.*\\.html$',
  metaNodes = 'meta',
  metaNames = 'name',
  metaContent = 'content',
  authorSearch = "^citation_author$",
  institutionSearch = "^citation_author_institution$",
  doiSearch = "^citation_doi$",
  dateSearch = "^citation_publication_date$",
  emailSearch = "^citation_author_email$"
)

# Layout of the ApJ website. Unlike the two journals above, this one also
# sets the author/affiliation node selectors and extraction patterns, and
# does not set authorSearch.
# NOTE(review): websiteLayout is set to the literal string "character" -
# confirm this is the intended value.
astApj <- new(
  "Journal",
  base = "http://iopscience.iop.org/",
  extension = '0004-637X/471/1',
  nextIssue = 'next issue',
  abstract = '/article/.*meta',
  metaNodes = 'meta',
  metaNames = 'name',
  metaContent = 'content',
  authorNodes = ".mb-0, span",
  authorSplit = "span",
  authorExtractString = '.*?\"name\">(.*?)</.*',
  authorAffiliationIndex = '.*?<sup>(.*?)</sup>.*',
  affiliationNodes = ".wd-jnl-art-author-affiliations",
  affiliationSplit = "</sup>",
  affiliationExtractString = ".*sup>.*</sup>.*",
  institutionSearch = "^citation_author_institution$",
  doiSearch = "^citation_doi$",
  dateSearch = "^citation_publication_date$",
  emailSearch = "^citation_author_email$",
  websiteLayout = "character"
)


In [None]:
# Run getWebPageDataJournal() once per journal layout. The helper is not
# defined in this notebook (presumably it comes from functions.R); the second
# and third arguments are passed positionally, exactly as before.
abstractLinksmnras <- getWebPageDataJournal(
  mnras, 500, 'mnrasAbstracts.R'
)
abstractLinksaa <- getWebPageDataJournal(
  astast, 500, 'astronomyAstrophysicsAbstracts.R'
)
abstractLinksapj <- getWebPageDataJournal(
  astApj, 500, 'astrophysicalJournal.R'
)


In [None]:
# Parse the MNRAS abstracts whose links were gathered previously.
# parseAbstracts() is not defined in this notebook (presumably functions.R).
mnrasData2 <- parseAbstracts(
  mnras,
  abstractLinksmnras,
  5
)


In [4]:
# Restore the objects saved in ddEdgesMNRAS.rdata into the workspace.
load(file = 'ddEdgesMNRAS.rdata')


In [None]:
# Display ddEdgesMNRAS (presumably restored by load('ddEdgesMNRAS.rdata')).
ddEdgesMNRAS


In [7]:
# Build the MNRAS network from the saved edge list and split it into
# connected components.
# load() is expected to restore an object called ddEdgesMNRAS.
load('ddEdgesMNRAS.rdata')
ddEdges <- ddEdgesMNRAS
# NOTE(review): the recorded run of this cell warned "invalid factor level,
# NA generated"; check ddEdges for NA values / factor columns.
g <- graph.data.frame(ddEdges, directed = FALSE)
# Remove self-loops but keep parallel edges. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
net <- simplify(g, remove.multiple = FALSE, remove.loops = TRUE)
# Single pass: drop every vertex whose degree is below 2.
a <- as.numeric(degree(net))
removeG <- which(a < 2)
net <- delete.vertices(net, removeG)
# Keep only the components with at least 5 vertices.
comps <- decompose.graph(net, min.vertices = 5)

  

In `[<-.factor`(`*tmp*`, thisvar, value = "NA"): invalid factor level, NA generated

In [10]:
# Export the full graph g as GraphML.
write.graph(
  g,
  file = "arxiv.graphml",
  format = "graphml"
)

In [None]:
# Lay out a sub-network of the first component in 3D and create the plotly
# marker trace for its vertices.
# NOTE(review): subgraph.edges() takes edge ids, but a vertex sequence is
# passed here - confirm whether the first 1000 edges or the first 1000
# vertices were intended. Left unchanged.
a <- subgraph.edges(comps[[1]], V(comps[[1]])[1:1000])
G <- a
L <- layout.fruchterman.reingold(G, dim = 3)
vs <- V(G)
# names = FALSE returns numeric vertex ids instead of vertex names, so the
# V1/V2 columns can be used directly as indices into Xn/Yn/Zn (which are
# unnamed and ordered like the rows of L).
es <- as.data.frame(get.edgelist(G, names = FALSE))
Nv <- length(vs)
Ne <- nrow(es)
Xn <- L[, 1]
Yn <- L[, 2]
Zn <- L[, 3]
df <- data.frame(x = Xn, y = Yn, z = Zn)
# Columns of df are referenced with formulas (~x), and the 3D scatter trace
# type is spelled "scatter3d".
network <- plot_ly(
  df,
  x = ~x, y = ~y, z = ~z,
  type = "scatter3d", mode = "markers"
)


In [None]:
 # Build one "line" shape description per edge: each shape joins the layout
 # coordinates of the edge's two end points.
 # Reads es (columns V1, V2), Ne and Xn/Yn/Zn from the workspace.
 # The list is preallocated, and seq_len() is used so that an empty edge
 # list (Ne == 0) produces an empty list instead of iterating over c(1, 0).
 edge_shapes <- vector("list", Ne)
 for (i in seq_len(Ne)) {
   v0 <- es$V1[i]
   v1 <- es$V2[i]

   edge_shape <- list(
     type = "line",
     line = list(color = "#030303", width = 0.3),
     x0 = Xn[v0],
     y0 = Yn[v0],
     z0 = Zn[v0],
     x1 = Xn[v1],
     y1 = Yn[v1],
     z1 = Zn[v1]
   )

   edge_shapes[[i]] <- edge_shape
 }

In [None]:
  # Apply title, edge shapes and axis styling to the plot.
  # One shared axis spec: no title, grid, tick labels or zero line.
  hidden_axis <- list(
    title = "", showgrid = FALSE, showticklabels = FALSE, zeroline = FALSE
  )
  # plotly::layout is named explicitly because graphics::layout has the same
  # name. The axes of a 3D plot are configured under `scene`; the top-level
  # xaxis/yaxis settings are kept for any 2D axes. There is no top-level
  # zaxis layout attribute, so that entry now lives inside `scene`.
  # NOTE(review): layout shapes are documented for 2D axes; confirm that the
  # z0/z1 fields in edge_shapes are honoured.
  network <- plotly::layout(
    network,
    title = 'MNRAS Network',
    shapes = edge_shapes,
    xaxis = hidden_axis,
    yaxis = hidden_axis,
    scene = list(
      xaxis = hidden_axis,
      yaxis = hidden_axis,
      zaxis = hidden_axis
    )
  )

In [None]:
# Display the plotly object built in the previous cells.
network