## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of the matrix transpose 


In [6]:
import os 
import pwd
# Load NumPy, getpass and the PySpark entry points (SparkContext, SparkConf, SparkSession)
import numpy as np
import getpass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Obtain the SparkSession (an existing one is reused, otherwise it is created)
# and expose its SparkContext for the RDD operations used later on.
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

In [7]:
def order(x):
    """Arrange interleaved (index, value) pairs into a dense NumPy array.

    ``x`` is a flat sequence laid out as index0, value0, index1, value1, ...
    Each value is written at the 0-based position given by its index; a
    trailing unpaired element is ignored.  Returns a float array of length
    ``len(x) // 2``.
    """
    positions = x[0::2]
    values = x[1::2]
    result = np.zeros(len(values))
    # Pairs may arrive in any order: the index decides the final slot.
    for pos, val in zip(positions, values):
        result[int(pos)] = val
    return result

## Data import

In [8]:
# Read the text file "DXY" and turn every comma-separated line into a list of floats.
# The first field of each record is its column index in the transposed matrix.
Dataset = (
    sc.textFile("DXY")
    .map(lambda line: line.split(","))
    .map(lambda fields: list(map(float, fields)))
)
Dataset.collect()

[[0.0, 1.0, -1.0, 0.0, 12.0],
 [1.0, 2.0, -3.0, 14.0, 14.0],
 [2.0, 3.0, -5.0, 16.0, 16.0],
 [3.0, 4.0, -0.0, 10.0, 10.0],
 [4.0, 1.0, -1.0, 12.0, 12.0],
 [5.0, 2.0, -5.0, 16.0, 16.0]]

#### Map each input record x to pairs (row, (col, x[row+1])), where row indexes the transposed matrix and col = x[0]

In [9]:
# Emit one (row, (col, value)) pair per matrix entry:
#   - the key j is the row index in the transposed matrix,
#   - record[0] is the column index in the transposed matrix,
#   - entry is the value stored at (row, col) of the transpose.
# reduceByKey then concatenates the (col, value) tuples that share a row
# into a single flat tuple.
tDataset = (
    Dataset
    .flatMap(lambda record: [(j, (record[0], entry)) for j, entry in enumerate(record[1:])])
    .reduceByKey(lambda left, right: left + right)
)
tDataset.collect()

[(0, (0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 1.0, 5.0, 2.0)),
 (1, (0.0, -1.0, 1.0, -3.0, 2.0, -5.0, 3.0, -0.0, 4.0, -1.0, 5.0, -5.0)),
 (2, (0.0, 0.0, 1.0, 14.0, 2.0, 16.0, 3.0, 10.0, 4.0, 12.0, 5.0, 16.0)),
 (3, (0.0, 12.0, 1.0, 14.0, 2.0, 16.0, 3.0, 10.0, 4.0, 12.0, 5.0, 16.0))]

#### Order the elements according to the column order

In [10]:
# Drop the row key and rebuild each transposed row as an array whose
# entries are placed according to their column index.
tDataset = tDataset.values().map(order)
tDataset.collect()

[array([1., 2., 3., 4., 1., 2.]),
 array([-1., -3., -5., -0., -1., -5.]),
 array([ 0., 14., 16., 10., 12., 16.]),
 array([12., 14., 16., 10., 12., 16.])]