In [3]:
%addJar file:/docker/lib/common-1.0.2-jar-with-dependencies.jar

Using cached version of common-1.0.2-jar-with-dependencies.jar


In [15]:
import com.cloudera.datascience.common.XmlInputFormat
import org.apache.hadoop.io.{ Text, LongWritable }
import org.apache.hadoop.conf.Configuration

// Location of the MEDLINE XML input read by the job below.
val path = "/docker/datasets/medline"

// Hadoop configuration that tells XmlInputFormat which tags delimit one record.
// NOTE(review): @transient is presumably here so the notebook does not try to
// serialize the Configuration into task closures — confirm.
@transient val conf = new Configuration()
conf.set(XmlInputFormat.START_TAG_KEY, "<MedlineCitation ")
conf.set(XmlInputFormat.END_TAG_KEY, "</MedlineCitation>")

// One (LongWritable, Text) pair per delimited record.
// For a quick experiment, append `.sample(false, 0.0001)` to the call below.
val in = sc.newAPIHadoopFile(
    path,
    classOf[XmlInputFormat],
    classOf[LongWritable],
    classOf[Text],
    conf)

// Keep only the record text, copied out of the Hadoop Text into a String.
val rawRdd = in.map { case (_, record) => record.toString }

In [16]:
import scala.xml.{ XML, Elem }
/** Extracts the major topics of one parsed citation.
  *
  * Looks at every `DescriptorName` descendant of `elem` and keeps those whose
  * `MajorTopicYN` attribute is exactly "Y".
  *
  * @param elem root element of a parsed citation
  * @return the text of each matching `DescriptorName`, in document order
  */
def majorTopics(elem: Elem): Seq[String] = {
    for {
        descriptor <- elem \\ "DescriptorName"
        if (descriptor \ "@MajorTopicYN").text == "Y"
    } yield descriptor.text
}

// Parse each raw record string into an XML element.
val xmlRdd = rawRdd.map(record => XML.loadString(record))
// One list of major topics per citation; citations without any are dropped.
// Cached because later cells reuse it.
val medline = xmlRdd.map(citation => majorTopics(citation)).filter(topicList => topicList.nonEmpty).cache()
// Peek at the first citation's topics.
medline.take(1)(0)

List(Computer Simulation, Models, Cardiovascular)

In [17]:
// Flatten the per-citation topic lists into one RDD of topic strings.
val topics = medline.flatMap(topicList => topicList)
// Number of distinct topics.
println(topics.distinct().count())
// The ten most frequent topics, most frequent first.
topics.countByValue.toSeq.sortBy { case (_, occurrences) => -occurrences }.take(10).foreach(entry => println(entry))

15154
(Research,3094)
(Disease,2692)
(Neoplasms,1891)
(Public Policy,1620)
(Jurisprudence,1595)
(Demography,1524)
(Population Dynamics,1502)
(Economics,1382)
(Socioeconomic Factors,1299)
(Blood,1264)


In [12]:
/** Maps any value to a Long id by widening its 32-bit `##` hash.
  *
  * `##` is used rather than `hashCode`, so `null` hashes to 0 and equal
  * numeric values of different types hash alike. Only 32 bits of hash feed
  * the result, so distinct inputs can collide.
  */
def hash(x: Any): Long = {
    val code: Int = x.##
    code.toLong
}
// Pair every topic occurrence with its hashed id, giving (id, topic) tuples.
val vertices = topics.map(topic => (hash(topic), topic))
// Collision check: there must be as many distinct ids as distinct topics.
vertices.map { case (id, _) => id }.countByValue.size == vertices.map { case (_, topic) => topic }.countByValue.size

true

In [3]:
// Every 2-element combination of each citation's sorted topic list.
val topicPairs = medline.flatMap(topicList => topicList.sorted.combinations(2))
// Count how many times each topic pair appears.
val cooccurs = topicPairs.map(pair => (pair, 1)).reduceByKey((left, right) => left + right)
cooccurs.cache()
// Number of distinct co-occurring pairs.
cooccurs.count()

229601

In [14]:
import org.apache.spark.graphx._

// Turn each (topic pair, count) into an edge between the two hashed topic ids,
// with the smaller id as source and the co-occurrence count as the attribute.
val edges = cooccurs.map { case (pair, weight) =>
    val sortedIds = pair.map(hash).sorted
    Edge(sortedIds(0), sortedIds(1), weight)
}

In [15]:
// Build the graph from the (id, topic) vertices and the weighted edges.
val topicGraph = Graph(vertices = vertices, edges = edges)

In [3]:
// Label every vertex with the id of the connected component it belongs to.
val connectedComponents = topicGraph.connectedComponents()
// Number of vertices per component id, largest component first.
val componentCounts = connectedComponents.vertices.map { case (_, componentId) => componentId }.countByValue.toSeq.sortBy { case (_, size) => -size }
componentCounts

ArrayBuffer((-2146994723,14132), (-2117711008,5), (-1780068791,4), (-1884199532,3), (-770162488,3), (-833189025,3), (-1749011714,3), (-1347759196,3), (-1269853108,3), (-1173222909,3), (349631822,2), (-248503704,2), (694941816,2), (1004812845,2), (-1125835952,2), (1149468248,2), (-188604902,2), (1251460648,2), (-1660541797,2), (-2018435056,2), (511639598,2), (-593879031,2), (890492431,2), (-594015880,2), (-1679655020,2), (-1909316450,2), (-1455963424,2), (-605701197,2), (-235746564,2), (-1928601699,2), (389444582,2), (-1884527612,2), (-1509934809,2), (752675843,2), (-1006799626,2), (-1369445892,2), (-240632659,2), (373043352,2), (-749871690,2), (-265065099,2), (-1865475570,2), (-976690972,2), (-1034421687,2), (250501833,2), (-3388462...