## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of the matrix transpose 


Let us consider a least-squares regression problem with a single output target $Y$ and $n$ input variables $X_1, \dots, X_n$.

Suppose that the dataset (made of N observations) is stored in a horizontal format, i.e. the first row corresponds to the output Y and the following n rows correspond to the n inputs.

$$0, y_1, \dots, y_N\\
1, x_{11}, \dots, x_{N1}\\
\vdots\\
n, x_{1n}, \dots, x_{Nn}
$$

Write a map-reduce pseudo-code to transpose the matrix.



In [2]:
import os 
import pwd
# Load NumPy and the PySpark entry points (SparkContext, SparkConf, SparkSession)
import numpy as np
import getpass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Create a SparkSession named 's.com', or reuse the one that is already running
spark=SparkSession.builder.appName('s.com').getOrCreate()
# Keep a handle on the session's SparkContext; the cells below use it for the RDD API
sc=spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/18 11:30:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [22]:
def order(x):
    """Rebuild one row of the transposed matrix from its flattened pairs.

    x is a (key, flat) pair where flat alternates column index and value:
    (col, value, col, value, ...). Each value is written at position
    int(col) of a zero-initialised NumPy array whose length is the number
    of complete pairs; a trailing unpaired element is ignored and positions
    that receive no value stay at 0.

    Returns the tuple (key, array). An index outside the array bounds
    raises IndexError.
    """
    flat = x[1]
    n_pairs = len(flat) // 2
    ordered = np.zeros(n_pairs)
    # Even positions hold the column indices, odd positions the values.
    columns = flat[0:2 * n_pairs:2]
    values = flat[1:2 * n_pairs:2]
    for col, val in zip(columns, values):
        ordered[int(col)] = val
    return (x[0], ordered)

## Data import

In [23]:
# Read the text file "DXY" and parse every comma-separated line into a list of floats.
# The first field of each line is the column index in the transposed matrix.
Dataset = sc.textFile("DXY").map(lambda line : [float(field) for field in line.split(",")])
Dataset.collect()

[[0.0, 1.0, -1.0, 0.0, 12.0],
 [1.0, 2.0, -3.0, 14.0, 14.0],
 [2.0, 3.0, -5.0, 16.0, 16.0],
 [3.0, 4.0, -0.0, 10.0, 10.0],
 [4.0, 1.0, -1.0, 12.0, 12.0],
 [5.0, 2.0, -5.0, 16.0, 16.0]]

#### map: x -> (row, (col, x[row+1])), with col = x[0] and row = 0, ..., len(x)-2

In [29]:
# Explode every input row into one record per matrix entry:
#   key   -> 0-based position of the entry after the leading index,
#            i.e. the row in the transposed matrix
#   value -> (rec[0], entry), where rec[0] (the leading index of the input row)
#            is the column in the transposed matrix
tDataset=Dataset.flatMap(lambda rec : [(row,(rec[0],entry)) for row,entry in enumerate(rec[1:])])

tDataset.collect()

[(0, (0.0, 1.0)),
 (1, (0.0, -1.0)),
 (2, (0.0, 0.0)),
 (3, (0.0, 12.0)),
 (0, (1.0, 2.0)),
 (1, (1.0, -3.0)),
 (2, (1.0, 14.0)),
 (3, (1.0, 14.0)),
 (0, (2.0, 3.0)),
 (1, (2.0, -5.0)),
 (2, (2.0, 16.0)),
 (3, (2.0, 16.0)),
 (0, (3.0, 4.0)),
 (1, (3.0, -0.0)),
 (2, (3.0, 10.0)),
 (3, (3.0, 10.0)),
 (0, (4.0, 1.0)),
 (1, (4.0, -1.0)),
 (2, (4.0, 12.0)),
 (3, (4.0, 12.0)),
 (0, (5.0, 2.0)),
 (1, (5.0, -5.0)),
 (2, (5.0, 16.0)),
 (3, (5.0, 16.0))]

In [30]:
# Merge all (col, value) pairs sharing the same row key: concatenating the tuples
# gives one flat tuple (col, value, col, value, ...) per row of the transposed matrix
tDataset=tDataset.reduceByKey(lambda left,right: left+right)

tDataset.collect()

[(0, (0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 1.0, 5.0, 2.0)),
 (2, (0.0, 0.0, 1.0, 14.0, 2.0, 16.0, 3.0, 10.0, 4.0, 12.0, 5.0, 16.0)),
 (1, (0.0, -1.0, 1.0, -3.0, 2.0, -5.0, 3.0, -0.0, 4.0, -1.0, 5.0, -5.0)),
 (3, (0.0, 12.0, 1.0, 14.0, 2.0, 16.0, 3.0, 10.0, 4.0, 12.0, 5.0, 16.0))]

#### Order the elements according to the column order

In [32]:
# Turn each flat (col, value, ...) tuple into an array ordered by column index,
# then sort the rows of the transposed matrix by their key
tDataset2=tDataset.map(order).sortByKey()
tDataset2.collect()

[(0, array([1., 2., 3., 4., 1., 2.])),
 (1, array([-1., -3., -5., -0., -1., -5.])),
 (2, array([ 0., 14., 16., 10., 12., 16.])),
 (3, array([12., 14., 16., 10., 12., 16.]))]