In [None]:
from pyspark import SparkContext
# Reuse the running SparkContext if this cell is executed again; calling
# SparkContext() a second time in the same kernel raises an error because
# only one context may be active at a time.
sc = SparkContext.getOrCreate()

### NumPy (Numerical Python) - Basics

In [None]:
import numpy
from numpy import array

In [None]:
# A plain nested Python list: three rows of two numbers each.
list_of_list = [
    [1, 2],
    [4, 9],
    [10, 100],
]
# Arithmetic on the nested list itself is not element-wise:
#   list_of_list + 1   raises TypeError
#   list_of_list * 2   repeats the rows instead of doubling the values

# Wrapping the list in a numpy array (similar to a MATLAB matrix) makes
# arithmetic operators apply to every element.
numpy_array = array(list_of_list)
print(numpy_array)
print('Add 1 to each element', numpy_array + 1, sep='\n')
print('-------')
print('Multiply each element by 2', numpy_array * 2, sep='\n')

In [None]:
# Inspect the basic attributes of the array.
print('Shape of array is ', numpy_array.shape)
# ndim is the number of axes of the array, not the linear-algebra rank of the matrix.
print('Rank of array (matrix) is ', numpy_array.ndim)
# dtype is the element type; itemsize is the size of one element in bytes.
print('Data type is ', numpy_array.dtype)
print('Item size is ', numpy_array.itemsize)
print('Reshaped Array from 3x2 into 2x3:')
# reshape returns a new 2x3 arrangement of the same six values; numpy_array itself is unchanged.
print(numpy_array.reshape(2,3))

#### numpy array operations

In [None]:
# Two small 2x2 matrices used to contrast element-wise and matrix operations.
A = array([[1, 1], [0, 1]])
B = array([[2, 0], [3, 4]])

# Show the operands first.
print('A', A, 'B', B, sep='\n')

# `*` and `-` act element by element ...
print('\n')
print('A*B', A * B, sep='\n')
print('\n')
print('A-B', A - B, sep='\n')

# ... whereas dot() is the true matrix product.
print('\n')
print('A.B', A.dot(B), sep='\n')

In [None]:
# Some more statistical functions
print('Sum of matrix A is ', A.sum())
print('Sum of matrix B is ', B.sum())
print('Min element in marix B is ', B.min())
print('Max element in marix B is ', B.max())
print('Exponential for B is ', numpy.exp(B))
print('Exponential for B is ', numpy.sqrt(B))

## k-means Clustering with MLlib

https://en.wikipedia.org/wiki/K-means_clustering

K-means is one of the most commonly used clustering algorithms; it partitions the data points into a predefined number of clusters.

Spark MLlib docs: https://spark.apache.org/docs/latest/mllib-clustering.html#k-means

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

In [None]:
# Read the data file through the SparkContext; each element of the resulting RDD is one line of text.
rdd = sc.textFile('data/unbalance.txt')
# Peek at the first five lines to check the raw format.
rdd.take(5)

In [None]:
# Break every line into its whitespace-separated fields.
# split() with no argument collapses repeated spaces/tabs and ignores leading or
# trailing whitespace, so a stray extra space cannot produce an empty field that
# would later fail the float()/int() conversion.
rdd_split = rdd.map(lambda line: line.split())
# Inspect the first five parsed rows.
rdd_split.take(5)

In [None]:
# The first two fields of each row are the features, converted to float.
# cache() keeps the RDD in memory because it is reused by several later cells.
features = (
    rdd_split
    .map(lambda fields: (float(fields[0]), float(fields[1])))
    .cache()
)
# The third field of each row is the output value, converted to int.
output = (
    rdd_split
    .map(lambda fields: int(fields[2]))
    .cache()
)

In [None]:
# Bring the feature tuples back from the cluster to the driver as a Python list.
features_python_list = features.collect()
# Wrap the list in a numpy array so that columns can be sliced.
features_numpy_array = array(features_python_list)
# Draw first column against second column as blue dots with no connecting line.
plt.plot(
    features_numpy_array[:, 0],
    features_numpy_array[:, 1],
    color='b',
    marker='.',
    linestyle='None',
)
plt.show()

In [None]:
# Build the model (cluster the data)
cluster_model = KMeans.train(features, 8)  # intialization step is crucial in algorithms which are randomized
#cluster_model = KMeans.train(features, 8, initializationSteps=100, epsilon=1e-4)

In [None]:
# One row per cluster center found by the model.
cluster_centers = array(cluster_model.clusterCenters)

plt.scatter(features_numpy_array[:,0], features_numpy_array[:,1]) # Plot the points first

# Overlay every center in red with a single call. Slicing the columns avoids
# repeating the number of clusters here, so the cell keeps working if k changes
# or if the model returns fewer centers than requested.
plt.scatter(cluster_centers[:,0], cluster_centers[:,1], c='r')
plt.show()

In [None]:
# predict the output labels using the model generated
predicted_labels = array(cluster_model.predict(features).collect())
print(predicted_labels)  # have a look at the result

In [None]:
# take out the unique output labels from array (Using numpy unique function)
# numpy.unique returns the sorted distinct values of the array, i.e. the
# cluster labels that were actually assigned to at least one point.
numpy.unique(predicted_labels)

In [None]:
# Now plot the scatter plot of the points again using the command above but with colors (Hint: use the parameter c to pass color array)
# Colour each point by its predicted cluster label (one colour value per point).
plt.scatter(features_numpy_array[:,0], features_numpy_array[:,1], c=predicted_labels)
# Render the figure explicitly, as the other plotting cells do, instead of
# leaving the PathCollection repr as the cell output.
plt.show()

In [None]:
# Calculate the computational cost of the cluster
# Per the MLlib documentation, computeCost returns the k-means cost: the sum of
# squared distances from each point to its nearest cluster center. It measures
# clustering quality (lower is tighter), not computational expense.
cluster_model.computeCost(features)