In [2]:
import pyspark
from operator import add
from pyspark import SparkConf
from pyspark.ml.feature import NGram
from pyspark.sql.functions import col,udf
from pyspark.sql import SQLContext
from operator import add
import numpy as np
import string
# Reuse the running SparkContext if this cell is executed again; constructing a
# second SparkContext in the same process raises a ValueError.
spark_conf = SparkConf().setMaster('local[*]').setAppName("DocClassification")
sc = pyspark.SparkContext.getOrCreate(spark_conf)
sqlc = SQLContext(sc)



We're going to create some really simple dummy documents as a proof of concept. Each document is a string, just like the ones we would normally deal with. We'll map them like we usually do, with a slight twist. After we zip with index (to get a document id) and swap each pair into (did,doc) shape, we split the documents into (did,wid) pairs, lowercasing each word and stripping its punctuation. Then we map them to (did,wid,1) tuples, instead of the usual ((did,wid),1) pairs that we would reduceByKey(add) into ((did,wid),count) values. The latter is a good representation for a sparse matrix, but the former is going to let us create a dataframe more easily.

In [4]:
def swap(x):
    """Return a new 2-tuple holding the first two items of ``x`` in reverse order."""
    second = x[1]
    first = x[0]
    return (second, first)

# Toy corpus: each string stands in for one document.
docs = sc.parallelize(["This is something I really want to test.",
                       "To test something is something I really want to do.",
                       "I need to do something that I want to do which is to test something."])

# (doc, index) -> (did, doc) -> (did, raw token) -> (did, wid) -> (did, wid, 1)
strs = docs.zipWithIndex()\
            .map(swap)\
            .flatMapValues(lambda doc: doc.split())\
            .mapValues(lambda token: token.lower().strip(string.punctuation))\
            .filter(lambda pair: pair[1] != "")\
            .map(lambda pair: (pair[0], pair[1], 1))
# The filter drops tokens that were nothing but punctuation: stripping leaves
# them as empty strings, which would otherwise become a blank-named word.

print(strs.collect())



[(0, 'this', 1), (0, 'is', 1), (0, 'something', 1), (0, 'i', 1), (0, 'really', 1), (0, 'want', 1), (0, 'to', 1), (0, 'test', 1), (1, 'to', 1), (1, 'test', 1), (1, 'something', 1), (1, 'is', 1), (1, 'something', 1), (1, 'i', 1), (1, 'really', 1), (1, 'want', 1), (1, 'to', 1), (1, 'do', 1), (2, 'i', 1), (2, 'need', 1), (2, 'to', 1), (2, 'do', 1), (2, 'something', 1), (2, 'that', 1), (2, 'i', 1), (2, 'want', 1), (2, 'to', 1), (2, 'do', 1), (2, 'which', 1), (2, 'is', 1), (2, 'to', 1), (2, 'test', 1), (2, 'something', 1)]


Now we'll throw them into a dataframe. It is of course not yet in the form we want (one row per document, one column per word), as you can see...

In [5]:
# Build a long-format DataFrame: one row per (document id, word, 1) triple.
df = sqlc.createDataFrame(
    strs,
    schema=['did', 'wid', 'count'],
)
df.show()


+---+---------+-----+
|did|      wid|count|
+---+---------+-----+
|  0|     this|    1|
|  0|       is|    1|
|  0|something|    1|
|  0|        i|    1|
|  0|   really|    1|
|  0|     want|    1|
|  0|       to|    1|
|  0|     test|    1|
|  1|       to|    1|
|  1|     test|    1|
|  1|something|    1|
|  1|       is|    1|
|  1|something|    1|
|  1|        i|    1|
|  1|   really|    1|
|  1|     want|    1|
|  1|       to|    1|
|  1|       do|    1|
|  2|        i|    1|
|  2|     need|    1|
+---+---------+-----+
only showing top 20 rows



This is where the magic happens. If we groupBy('did') then we can pivot('wid'). Basically this turns each distinct value in the 'wid' column into its own column. A pivot has to be followed by some sort of aggregation function, though, so that the rows landing in the same (did, wid) cell are combined into a single value. That's where the sum('count') comes in, and why we kept it in this strange format until now.

In [6]:
# Pivot the long (did, wid, count) rows into one row per document with one
# column per distinct word; sum('count') adds up repeated occurrences.
# groupBy does not guarantee row order, so sort by document id to keep the
# rows in document order.
df = df.groupBy('did').pivot('wid').sum('count').orderBy('did')

df.show()

+---+----+---+---+----+------+---------+----+----+----+---+----+-----+
|did|  do|  i| is|need|really|something|test|that|this| to|want|which|
+---+----+---+---+----+------+---------+----+----+----+---+----+-----+
|  0|null|  1|  1|null|     1|        1|   1|null|   1|  1|   1| null|
|  1|   1|  1|  1|null|     1|        2|   1|null|null|  2|   1| null|
|  2|   2|  2|  1|   1|  null|        2|   1|   1|null|  3|   1|    1|
+---+----+---+---+----+------+---------+----+----+----+---+----+-----+



Now we can just turn the null values into 0s and we're on our way home...

In [7]:
# Words that never occur in a document came out of the pivot as null;
# replace those nulls with a count of 0.
df = df.fillna(0)

df.show()

+---+---+---+---+----+------+---------+----+----+----+---+----+-----+
|did| do|  i| is|need|really|something|test|that|this| to|want|which|
+---+---+---+---+----+------+---------+----+----+----+---+----+-----+
|  0|  0|  1|  1|   0|     1|        1|   1|   0|   1|  1|   1|    0|
|  1|  1|  1|  1|   0|     1|        2|   1|   0|   0|  2|   1|    0|
|  2|  2|  2|  1|   1|     0|        2|   1|   1|   0|  3|   1|    1|
+---+---+---+---+----+------+---------+----+----+----+---+----+-----+

