In [None]:
!wget https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar -xvf spark-3.2.1-bin-hadoop3.2.tgz
!java -version
!pip install findspark

Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import os

import findspark

# SPARK_HOME points at the Spark distribution unpacked under /content; it is set
# before findspark.init() so that findspark can locate that distribution.
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"
findspark.init()

# Imported after findspark.init(), which makes the distribution's pyspark importable.
from pyspark.sql import SparkSession

## You can add more config while building
spark = (
    SparkSession.builder
    .master("local[8]")  # local mode with 8 worker threads
    .config("spark.app.name", "session_one")
    .getOrCreate()
)

In [None]:
# cities.csv puts a space before each quoted field (`, "LatM"`). Without
# ignoreLeadingWhiteSpace the quotes are not recognised as quoting characters, so the
# column names and string values keep the leading space and the literal quotes
# (e.g. ' "NS"'). Trimming the leading whitespace lets the parser strip the quotes.
# NOTE(review): inferred numeric types may change once values are trimmed - re-check
# the schema after re-running.
dfs = spark.read.csv(
    "cities.csv",
    header=True,
    inferSchema=True,
    ignoreLeadingWhiteSpace=True,
)
dfs.show()

+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+
|LatD| "LatM"| "LatS"| "NS"| "LonD"| "LonM"| "LonS"| "EW"|            "City"| "State"|
+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+
|41.0|    5.0|   59.0|  "N"|   80.0|   39.0|    0.0|  "W"|      "Youngstown"|      OH|
|42.0|   52.0|   48.0|  "N"|   97.0|   23.0|   23.0|  "W"|         "Yankton"|      SD|
|46.0|   35.0|   59.0|  "N"|  120.0|   30.0|   36.0|  "W"|          "Yakima"|      WA|
|42.0|   16.0|   12.0|  "N"|   71.0|   48.0|    0.0|  "W"|       "Worcester"|      MA|
|43.0|   37.0|   48.0|  "N"|   89.0|   46.0|   11.0|  "W"| "Wisconsin Dells"|      WI|
|36.0|    5.0|   59.0|  "N"|   80.0|   15.0|    0.0|  "W"|   "Winston-Salem"|      NC|
|49.0|   52.0|   48.0|  "N"|   97.0|    9.0|    0.0|  "W"|        "Winnipeg"|      MB|
|39.0|   11.0|   23.0|  "N"|   78.0|    9.0|   36.0|  "W"|      "Winchester"|      VA|
|34.0|   14.0|   24.0|  "N"|   77.0|   55.0

In [None]:
# Show the StructField (name, inferred type, nullable flag) of every column.
list(dfs.schema)

[StructField(LatD,DoubleType,true),
 StructField( "LatM",DoubleType,true),
 StructField( "LatS",DoubleType,true),
 StructField( "NS",StringType,true),
 StructField( "LonD",DoubleType,true),
 StructField( "LonM",DoubleType,true),
 StructField( "LonS",DoubleType,true),
 StructField( "EW",StringType,true),
 StructField( "City",StringType,true),
 StructField( "State",StringType,true)]

In [None]:
# dfs.dtypes yields (column name, type name) pairs; keep the names of the string columns.
string_cols = [name for name, dtype in dfs.dtypes if dtype == 'string']
print(string_cols)
# One output column name per string column, used for its indexed counterpart.
stringindex_cols = [f"{name}_indexed" for name in string_cols]
print(stringindex_cols)

[' "NS"', ' "EW"', ' "City"', ' "State"']
[' "NS"_indexed', ' "EW"_indexed', ' "City"_indexed', ' "State"_indexed']


In [None]:
from pyspark.ml.feature import StringIndexer

# Map each string column to a numeric label index in its "_indexed" counterpart.
# handleInvalid is configured once, as "skip": rows with invalid values are filtered
# out instead of raising, so the output may have fewer rows than the input.
# frequencyDesc assigns index 0 to the most frequent label.
indexer = StringIndexer(
    inputCols=string_cols,
    outputCols=stringindex_cols,
    handleInvalid="skip",
    stringOrderType="frequencyDesc",
)
indexed = indexer.fit(dfs).transform(dfs)
indexed.show()

+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+-------------+-------------+---------------+----------------+
|LatD| "LatM"| "LatS"| "NS"| "LonD"| "LonM"| "LonS"| "EW"|            "City"| "State"| "NS"_indexed| "EW"_indexed| "City"_indexed| "State"_indexed|
+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+-------------+-------------+---------------+----------------+
|41.0|    5.0|   59.0|  "N"|   80.0|   39.0|    0.0|  "W"|      "Youngstown"|      OH|          0.0|          0.0|          119.0|             6.0|
|42.0|   52.0|   48.0|  "N"|   97.0|   23.0|   23.0|  "W"|         "Yankton"|      SD|          0.0|          0.0|          118.0|            18.0|
|46.0|   35.0|   59.0|  "N"|  120.0|   30.0|   36.0|  "W"|          "Yakima"|      WA|          0.0|          0.0|          117.0|             3.0|
|42.0|   16.0|   12.0|  "N"|   71.0|   48.0|    0.0|  "W"|       "Worcester"|      MA|          0.0|          0.

In [None]:
# Import only the type that is needed; a wildcard import hides where names come from
# and can shadow existing notebook variables.
from pyspark.sql.types import StringType

# Names of all columns whose type is not string: the original numeric columns plus
# the "_indexed" columns appended by the StringIndexer.
allnonstringcols = [
    field.name
    for field in indexed.schema
    if not isinstance(field.dataType, StringType)
]
print(allnonstringcols)

['LatD', ' "LatM"', ' "LatS"', ' "LonD"', ' "LonM"', ' "LonS"', ' "NS"_indexed', ' "EW"_indexed', ' "City"_indexed', ' "State"_indexed']


In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine every non-string column into one vector column called "features".
vecAssembler = VectorAssembler(inputCols=allnonstringcols, outputCol="features")
print(vecAssembler)
dataset = vecAssembler.transform(indexed)

VectorAssembler_4fd0482153f8


In [None]:
# ClusteringEvaluator is imported here but not used in this cell.
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans

numIterations = 100
numberClusters = 10

# k-means over the "features" column (the estimator's default input column), with a
# fixed seed so repeated runs start from the same initialisation.
kmeans = KMeans(k=numberClusters, maxIter=numIterations, seed=1)
model = kmeans.fit(dataset)
# transform() appends a "prediction" column holding each row's cluster id.
predictions = model.transform(dataset)


In [None]:
# Group the rows visually by cluster id and print the first 30 without truncating cells.
p1 = predictions.sort('prediction')
p1.show(30, truncate=False)


+----+-------+-------+-----+-------+-------+-------+-----+---------------------+--------+-------------+-------------+---------------+----------------+--------------------------------------------------+----------+
|LatD| "LatM"| "LatS"| "NS"| "LonD"| "LonM"| "LonS"| "EW"| "City"              | "State"| "NS"_indexed| "EW"_indexed| "City"_indexed| "State"_indexed|features                                          |prediction|
+----+-------+-------+-----+-------+-------+-------+-----+---------------------+--------+-------------+-------------+---------------+----------------+--------------------------------------------------+----------+
|44.0|45.0   |35.0   | "N" |85.0   |37.0   |47.0   | "W" | "Traverse City"     | MI     |0.0          |0.0          |77.0           |9.0             |[44.0,45.0,35.0,85.0,37.0,47.0,0.0,0.0,77.0,9.0]  |0         |
|47.0|14.0   |24.0   | "N" |122.0  |25.0   |48.0   | "W" | "Tacoma"            | WA     |0.0          |0.0          |69.0           |3.0            

In [None]:
# Compare the clustered row count with the raw row count; a difference would mean
# rows were dropped somewhere between loading and prediction.
clustered_count = p1.count()
print(clustered_count)
dfs.count()

128


128

In [None]:

from pyspark.ml.linalg import Vectors

# Cosine similarity of two dense vectors: dot(x, y) / (||x||_2 * ||y||_2).
x = Vectors.dense([1, 4, 5])
y = Vectors.dense([5, 2, 1])
dot_xy = x.dot(y)
norm_product = x.norm(2) * y.norm(2)
cossim = dot_xy / norm_product
print(cossim)

0.50709255283711


In [None]:
import numpy as np
import scipy
from scipy.spatial.distance import cosine
# Imported here as well so this cell does not depend on an earlier cell having run.
from pyspark.ml.linalg import Vectors

x = Vectors.dense([1, 2, 5])
y = Vectors.dense([1, 2, 5])
# scipy's `cosine` returns the cosine *distance* (1 - cosine similarity), so two
# identical vectors give 0.0 here, whereas their cosine similarity is 1.0.
# The vectors are converted to NumPy arrays explicitly instead of relying on
# implicit conversion of DenseVector.
cosine(x.toArray(), y.toArray())

0.0