In [2]:
%AddJar file:/docker/lib/ch06-lsa-1.0.2-jar-with-dependencies.jar
%showtypes on

Starting download from file:/docker/lib/ch06-lsa-1.0.2-jar-with-dependencies.jar
Finished download of ch06-lsa-1.0.2-jar-with-dependencies.jar
Types will be printed.


## Reading the fat xml and convert it to RDDs of pages

In [4]:
import com.cloudera.datascience.common.XmlInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io._

val path = "file:///docker/datasets/wikidump/wiki-latest.xml"
@transient val conf = new Configuration()
conf.set(XmlInputFormat.START_TAG_KEY, "<page>")
conf.set(XmlInputFormat.END_TAG_KEY, "</page>")

val kvs = sc.newAPIHadoopFile(path, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text], conf).sample(false, 0.00003)
val rawXmls = kvs.map(p => p._2.toString)
rawXmls

org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[6] at map at <console>:33

In [3]:
import edu.umd.cloud9.collection.wikipedia.language._
import edu.umd.cloud9.collection.wikipedia._

def wikiXmlToPlainText(xml: String): Option[(S$tring, String)] = {
    val page = new EnglishWikipediaPage()
    WikipediaPage.readPage(page, xml)
    if (page.isEmpty) None
    else Some(page.getTitle -> page.getContent)
}

In [4]:
val plainText = rawXmls.flatMap(wikiXmlToPlainText).cache()

In [6]:
plainText.saveAsTextFile("WikiText")

## Lematiztion

In [5]:
import edu.stanford.nlp.pipeline._
import edu.stanford.nlp.ling.CoreAnnotations._
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._

def createNLPPipeline(): StanfordCoreNLP = {
    val props = new java.util.Properties()
    props.put("annotators", "tokenize, ssplit, pos, lemma")
    new StanfordCoreNLP(props)
}

def isOnlyLetters(str: String): Boolean = str forall Character.isLetter

def plainTextToLemmas(text: String, stopWords: Set[String], pipline: StanfordCoreNLP): Seq[String] = {
    val doc = new Annotation(text)
    pipline.annotate(doc)
    val lemmas = new ArrayBuffer[String]()
    val sentences = doc.get(classOf[SentencesAnnotation]).asScala
    for {
        sentence <- sentences
        token <- sentence.get(classOf[TokensAnnotation]).asScala
        } {
        val lemma = token.get(classOf[LemmaAnnotation])
        if (lemma.length > 2 && !stopWords(lemma) && isOnlyLetters(lemma)) {
            lemmas += lemma.toLowerCase
        }
    }
    lemmas
}

In [6]:
import org.apache.spark.rdd.RDD

val stopWords = sc.broadcast(scala.io.Source.fromFile("/docker/Spark/stopwords.txt").getLines.toSet).value

val lemmatized: RDD[Seq[String]] = plainText.mapPartitions { it => 
    val pipeline = createNLPPipeline()
    it.map { case (title, contents) => plainTextToLemmas(contents, stopWords, pipeline) }
}

In [7]:
stopWords

scala.collection.immutable.Set[String] = Set(down, it's, that's, for, further, she'll, any, there's, this, haven't, in, ought, myself, have, your, off, once, i'll, are, is, his, why, too, why's, am, than, isn't, didn't, himself, but, you're, below, what, would, i'd, if, you'll, own, they'll, up, we're, they'd, so, our, do, all, him, ours	ourselves, had, nor, before, it, a, she's, as, hadn't, because, has, she, yours, or, above, yourself, herself, she'd, such, they, each, can't, don't, i, until, that, out, he's, cannot, to, we've, hers, you, did, let's, most, here, these, hasn't, was, there, when's, shan't, doing, at, through, been, over, i've, on, being, same, how, whom, my, after, who, itself, me, them, by, then, couldn't, he, should, few, wasn't, again, while, their, not, with,...

In [8]:
lemmatized.cache()
lemmatized.count()

Long = 70

## Computing TF-IDF

### Computing TF

In [9]:
import scala.collection.mutable.HashMap

val docTermFreqs = lemmatized.map(terms => terms.foldLeft(
    new HashMap[String, Int]())((map, term) => map += term -> (map.getOrElse(term, 0) + 1)))
docTermFreqs.cache() // Important because it will be used more than once

docTermFreqs.type = MapPartitionsRDD[6] at map at <console>:63

In [1]:
import scala.collection.mutable.HashMap

val docTermFreqs = lemmatized.map {
terms =>
    val termFreqsInDoc = terms.foldLeft(new HashMap[String, Int]()) {
        (map, term) => map += term -> (map.getOrElse(term, 0) + 1)
    }
    termFreqsInDoc
}

docTermFreqs.cache() // Important because it will be used more than once

docTermFreqs.type = MapPartitionsRDD[257] at map at <console>:75

### Computing DF

#### Using *aggrigate* but may throws OutOfMemoryException

In [10]:
val zero = new HashMap[String, Int]()

def merge(dfs: HashMap[String, Int], tfs: HashMap[String, Int]): HashMap[String, Int] = {
    tfs.keySet.foreach(term => dfs += term -> (dfs.getOrElse(term, 0) + 1))
    dfs
}

def comb(dfs1: HashMap[String, Int], dfs2: HashMap[String, Int]): HashMap[String, Int] = {
    for((term, count) <- dfs2) dfs1 += term -> (dfs1.getOrElse(term, 0) + count)
    dfs1
}
docTermFreqs.aggregate(zero)(merge, comb)

scala.collection.mutable.HashMap[String,Int] = Map(follow -> 5, establish -> 4, demand -> 1, sister -> 1, former -> 3, labour -> 2, dragons -> 1, divinity -> 1, dorset -> 1, uppsala -> 1, whitford -> 1, founding -> 2, danacord -> 1, witchcraft -> 1, trick -> 1, православ -> 1, famous -> 2, armstrong -> 1, ace -> 2, emigrate -> 1, expense -> 1, godowsky -> 1, cordial -> 1, sixth -> 1, hard -> 1, mincemeat -> 1, guam -> 1, lover -> 1, war -> 4, gap -> 1, mysterious -> 1, development -> 3, carry -> 4, obscenity -> 1, adler -> 1, medium -> 2, easier -> 2, betray -> 1, reagan -> 1, electoral -> 1, μαρτίου -> 1, sasebo -> 1, advance -> 2, therefore -> 1, редакцией -> 1, sorting -> 1, melee -> 1, soas -> 1, large -> 1, occipitus -> 1, salability -> 1, bishop -> 1, kanenobu -> 1, limit -...

#### Distributed calculation

In [11]:
val docFreq = docTermFreqs.flatMap(_.keySet).map(_ -> 1).reduceByKey(_ + _)

### Computing IDF

In [12]:
val numDocs = docTermFreqs.flatMap(_.keySet).distinct.count

In [13]:
import math.log

val numTerms = 50000
implicit val ordering = Ordering.by[(String, Int), Int](_._2)
val topDocFreqs = docFreq.top(numTerms)
val idfs = topDocFreqs.map {
    case (term, count) => (term, log(numDocs.toDouble / count))
}.toMap

## Preparing mllib Vector

In [14]:
import scala.collection.JavaConversions._
import org.apache.spark.mllib.linalg.Vectors

val bIdfs = sc.broadcast(idfs).value
val idTerms = idfs.keys.zipWithIndex.toMap
val termIds = idTerms.map(_.swap)
val bTermIds = sc.broadcast(idTerms).value

val termDocMatrix = docTermFreqs.map { termFreqs =>
    val docTotalTerms = termFreqs.values().sum
    val termScores = termFreqs.filter {
        case (term, freq) => bTermIds.containsKey(term)
    }.map {
        case (term, freq) => bTermIds(term) -> (bIdfs(term) * termFreqs(term) / docTotalTerms)
    }.toSeq
    Vectors.sparse(bTermIds.size, termScores)
}

## Caluculating Single Value Decomposition (SVD)

In [15]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val mat = new RowMatrix(termDocMatrix)
println((mat.numRows, mat.numCols))

(70,4411)


In [16]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix

termDocMatrix.cache()
val mat = new RowMatrix(termDocMatrix)
val k = 30 //TODO make it something real e.g.: 1000
val svd = mat.computeSVD(k, computeU=true)

## Finding Important Concepts and Documents

### Finding Important Concepts

In [17]:
import scala.collection.mutable.ArrayBuffer

val v = svd.V
val topTerms = new ArrayBuffer[Seq[(String, Double)]]()
val arr = v.toArray
for (i <- 0 until k) {
    val offs = i * v.numRows // numRows == n (number of words)
    val termWeigts = arr.slice(offs, offs + v.numRows).zipWithIndex // (feature_weight -> feture_id)
    val sorted = termWeigts.sortBy(- _._1) // sort by feature weight descending
    topTerms += sorted.take(numTerms).map { // take top terms and resolve each term itself that represents one feature
       case (score, id) => (termIds(id), score)
    }
}
topTerms

scala.collection.mutable.ArrayBuffer[Seq[(String, Double)]] = ArrayBuffer(ArraySeq((ascaphus,0.5527121377320893), (blast,0.1530877351528438), (eotheod,0.051919568955162435), (borough,4.212207839471294E-12), (fairtrade,4.208688952900275E-12), (corsair,3.6377329100389932E-12), (aruj,3.6376557148443123E-12), (ferenc,3.11739392980237E-12), (munnich,3.1139864992146826E-12), (bethel,1.3773721746490608E-12), (acres,1.3759288847170481E-12), (springs,6.44888656364806E-13), (jemez,6.397139762359672E-13), (tag,2.2410545641449175E-13), (baggage,1.4397944642086102E-13), (code,1.2597648618717372E-13), (bag,1.1205272820724588E-13), (passenger,1.0672712713599708E-13), (bar,9.778690444189886E-14), (game,9.498651865058605E-14), (piano,9.489631302983526E-14), (vox,8.541778395709798E-14), (simon,8.0...

In [18]:
println(svd.V.numRows * svd.V.numCols)
println(svd.V.toArray.size)
print("(" + svd.V.numRows)
println(", " + svd.V.numCols + ")")
svd.V.toArray.slice(0, v.numRows).zipWithIndex.sortBy(- _._1)

132330
132330
(4411, 30)


Array[(Double, Int)] = Array((0.5527121377320893,233), (0.1530877351528438,3288), (0.051919568955162435,4326), (4.212207839471294E-12,4209), (4.208688952900275E-12,833), (3.6377329100389932E-12,1128), (3.6376557148443123E-12,165), (3.11739392980237E-12,2503), (3.1139864992146826E-12,2996), (1.3773721746490608E-12,178), (1.3759288847170481E-12,280), (6.44888656364806E-13,4251), (6.397139762359672E-13,312), (2.2410545641449175E-13,966), (1.4397944642086102E-13,3672), (1.2597648618717372E-13,2154), (1.1205272820724588E-13,660), (1.0672712713599708E-13,383), (9.778690444189886E-14,3041), (9.498651865058605E-14,3878), (9.489631302983526E-14,1548), (8.541778395709798E-14,2890), (8.074790835976842E-14,35), (6.949584137327935E-14,3135), (6.942189878511584E-14,352), (6.926230422532598E-14...

### Finding Important Document - Distributed

In [6]:
val u = svd.U
val docIds = docTermFreqs.flatMap(_._1).zipWithUniqueId().map(_.swap).collectAsMap()
val topDocs = new ArrayBuffer[Seq[(String, Double)]]()
for (i <- 0 until k) {
    val docWeights = u.rows.map(_.toArray(i)).zipWithUniqueId()
    topDocs += docWeights.top(k).map {
        case (score, id) => docIds(id) -> score
    }
}
topDocs

Name: Compile Error
Message: <console>:76: error: value _1 is not a member of scala.collection.mutable.HashMap[String,Int]
       val docIds = docTermFreqs.flatMap(_._1).zipWithUniqueId().map(_.swap).collectAsMap()
                                           ^
StackTrace: 

In [3]:
val xs = new HashMap[Int, String]()
xs += (1 -> "hi")
xs.map(_._1)

scala.collection.mutable.Iterable[Int] = ArrayBuffer(1)

In [7]:
docTermFreqs.take(1)(0).map(_._1)

scala.collection.mutable.Iterable[String] = ArrayBuffer(philosophical, view)

In [None]:
val topdocs = new ArrayBuffer[Seq[(String, Double)]]()

## Querying and Scoring with Low-Dimentional Respresentation

## Relevence

### Term-Term Relelvence

In [None]:
val normalizedVS = rowsNormalized(vsd.V)

### Document-Document Relevence

### Term-Document Relevence

## Mutlible-Term Queries