<a href="https://colab.research.google.com/github/darkwingpatil/Ml_hackethons/blob/main/PySpark_Ml_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **PySpark: ML classification**


### **[Reference](https://spark.apache.org/docs/latest/ml-classification-regression.html)**

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse, if one already exists) the SparkSession for this notebook,
# registered under the application name 'IrisAlgo'.
spark = (
    SparkSession.builder
    .appName('IrisAlgo')
    .getOrCreate()
)

In [None]:
import pandas as pd
from sklearn import datasets
import numpy as np

# Build a pandas frame from scikit-learn's bundled Iris data: the measurement
# columns, the integer class id (`target`) and the matching class name (`species`).
iris = datasets.load_iris()
pdf = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                  columns= iris['feature_names'] + ['target']).astype({'target': int}) \
       .assign(species=lambda x: x['target'].map(dict(enumerate(iris['target_names']))))

# Round-trip through CSV so Spark can load the data. index=False keeps the
# pandas row index out of the file; without it Spark reads the index back as
# a meaningless extra `_c0` column.
pdf.to_csv('iris.csv', index=False)
df = spark.read.csv('iris.csv', header=True, inferSchema=True)

In [None]:
# Preview the first rows of the Spark DataFrame as a text table.
df.show()

+---+-----------------+----------------+-----------------+----------------+------+-------+
|_c0|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|species|
+---+-----------------+----------------+-----------------+----------------+------+-------+
|  0|              5.1|             3.5|              1.4|             0.2|     0| setosa|
|  1|              4.9|             3.0|              1.4|             0.2|     0| setosa|
|  2|              4.7|             3.2|              1.3|             0.2|     0| setosa|
|  3|              4.6|             3.1|              1.5|             0.2|     0| setosa|
|  4|              5.0|             3.6|              1.4|             0.2|     0| setosa|
|  5|              5.4|             3.9|              1.7|             0.4|     0| setosa|
|  6|              4.6|             3.4|              1.4|             0.3|     0| setosa|
|  7|              5.0|             3.4|              1.5|             0.2|     0| setosa|

In [None]:
# Preprocessing: assemble the numeric columns into a single feature vector and index the string species column into a numeric label

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the four numeric measurement columns into one vector column, "features",
# which is the input format Spark ML estimators expect.
numericColumns = ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']
assembler = VectorAssembler(inputCols=numericColumns, outputCol="features")

# Drop any "features" column left over from an earlier run of this cell:
# the assembler refuses to write to an output column that already exists,
# so without this the cell fails when re-executed. drop() is a no-op when
# the column is absent, so the first run is unaffected.
df = assembler.transform(df.drop("features"))

df.show()


+---+-----------------+----------------+-----------------+----------------+------+-------+-----------------+
|_c0|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|species|         features|
+---+-----------------+----------------+-----------------+----------------+------+-------+-----------------+
|  0|              5.1|             3.5|              1.4|             0.2|     0| setosa|[5.1,3.5,1.4,0.2]|
|  1|              4.9|             3.0|              1.4|             0.2|     0| setosa|[4.9,3.0,1.4,0.2]|
|  2|              4.7|             3.2|              1.3|             0.2|     0| setosa|[4.7,3.2,1.3,0.2]|
|  3|              4.6|             3.1|              1.5|             0.2|     0| setosa|[4.6,3.1,1.5,0.2]|
|  4|              5.0|             3.6|              1.4|             0.2|     0| setosa|[5.0,3.6,1.4,0.2]|
|  5|              5.4|             3.9|              1.7|             0.4|     0| setosa|[5.4,3.9,1.7,0.4]|
|  6|              

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string `species` column as a numeric `label` column for the classifier.
label_string = StringIndexer(inputCol="species", outputCol="label")

# Drop any "label" column left over from an earlier run of this cell: the
# indexer refuses to write to an output column that already exists, so
# without this the cell fails when re-executed. drop() is a no-op when the
# column is absent, so the first run is unaffected.
df_unlabelled = df.drop("label")
df = label_string.fit(df_unlabelled).transform(df_unlabelled)
df.show()

+---+-----------------+----------------+-----------------+----------------+------+-------+-----------------+-----+
|_c0|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|species|         features|label|
+---+-----------------+----------------+-----------------+----------------+------+-------+-----------------+-----+
|  0|              5.1|             3.5|              1.4|             0.2|     0| setosa|[5.1,3.5,1.4,0.2]|  0.0|
|  1|              4.9|             3.0|              1.4|             0.2|     0| setosa|[4.9,3.0,1.4,0.2]|  0.0|
|  2|              4.7|             3.2|              1.3|             0.2|     0| setosa|[4.7,3.2,1.3,0.2]|  0.0|
|  3|              4.6|             3.1|              1.5|             0.2|     0| setosa|[4.6,3.1,1.5,0.2]|  0.0|
|  4|              5.0|             3.6|              1.4|             0.2|     0| setosa|[5.0,3.6,1.4,0.2]|  0.0|
|  5|              5.4|             3.9|              1.7|             0.4|     

In [None]:
# Inspect the last five rows; they come back as a list of Row objects.
df.tail(5)

[Row(_c0=145, sepal length (cm)=6.7, sepal width (cm)=3.0, petal length (cm)=5.2, petal width (cm)=2.3, target=2, species='virginica', features=DenseVector([6.7, 3.0, 5.2, 2.3]), label=2.0),
 Row(_c0=146, sepal length (cm)=6.3, sepal width (cm)=2.5, petal length (cm)=5.0, petal width (cm)=1.9, target=2, species='virginica', features=DenseVector([6.3, 2.5, 5.0, 1.9]), label=2.0),
 Row(_c0=147, sepal length (cm)=6.5, sepal width (cm)=3.0, petal length (cm)=5.2, petal width (cm)=2.0, target=2, species='virginica', features=DenseVector([6.5, 3.0, 5.2, 2.0]), label=2.0),
 Row(_c0=148, sepal length (cm)=6.2, sepal width (cm)=3.4, petal length (cm)=5.4, petal width (cm)=2.3, target=2, species='virginica', features=DenseVector([6.2, 3.4, 5.4, 2.3]), label=2.0),
 Row(_c0=149, sepal length (cm)=5.9, sepal width (cm)=3.0, petal length (cm)=5.1, petal width (cm)=1.8, target=2, species='virginica', features=DenseVector([5.9, 3.0, 5.1, 1.8]), label=2.0)]

In [None]:
# Machine learning: split the data and train a classifier

In [None]:
# Randomly split the rows into roughly 70% training and 30% test data.
# The fixed seed makes the pseudo-random split repeatable from run to run.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Configure a logistic-regression classifier that reads the assembled
# "features" vector and the indexed "label" column, then fit it on the
# training split.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)
lr_model = lr.fit(train)

In [None]:
# Apply the fitted model to the held-out test split; the result carries a `prediction` column.
predictions = lr_model.transform(test)

In [None]:
# Show the true label next to the predicted class for a quick visual check.
predictions.select("label", "prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [None]:
# Evaluation

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test-set predictions by accuracy (fraction of rows where
# `prediction` equals `label`).
# The evaluator is named `evaluator` rather than `eval` so that Python's
# built-in eval() is not shadowed for the rest of the kernel session.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9347826086956522
