In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=978fe0cb30bd9d2b16b5b6ae95e3a0c6368260962f50afee2684bc4ac7c814b1
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) the notebook's SparkSession: local master, application
# name "Colab", and the Spark UI configured on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

In [None]:
# Display the SparkSession object as this cell's output.
spark

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
# Load the housing CSV: the first row is the header, and column types are
# inferred from the data rather than left as strings.
dataset = spark.read.csv(
    "/content/HousingData.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names, types and nullability of the housing DataFrame.
dataset.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [None]:
# Predictor columns that are packed into a single feature vector.
feature_cols = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Assemble the predictors into one vector column named 'Attributes'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='Attributes')
output = assembler.transform(dataset)

# Keep only the model input (feature vector) and the target column (MEDV).
finalized_data = output.select("Attributes", "MEDV")

finalized_data.show()

+--------------------+----+
|          Attributes|MEDV|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.02985,0.0,2.18...|28.7|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
|[1.25179,0.0,8.14...|13.6|
|[0.85204,0.0,8.14...|19.6|
|[1.23247,0.0,8.14...|15.2|
|[0.98843,0.0,8.14...|14.5|
+--------------------+----+
only showing top 20 rows



In [None]:
# Split into training (~80%) and testing (~20%) sets. The seed is fixed so the
# split, and every metric computed from it in later cells, is reproducible
# across kernel restarts instead of changing on each run.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

# Unfitted estimator. It gets its own name so the fitted model below does not
# overwrite it (the same name was previously reused for both objects).
lr = LinearRegression(featuresCol='Attributes', labelCol='MEDV')

# Fit on the training set; `regressor` is the fitted model used by later cells.
regressor = lr.fit(train_data)

# Evaluate the fitted model on the held-out test set.
pred = regressor.evaluate(test_data)

# Show the test rows together with the model's predicted MEDV.
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|MEDV|        prediction|
+--------------------+----+------------------+
|[0.01311,90.0,1.2...|35.4|31.525188586148076|
|[0.01538,90.0,3.7...|44.0| 37.53210437732332|
|[0.01951,17.5,1.3...|33.0|24.074347369662583|
|[0.02763,75.0,2.9...|30.8|30.980024932579006|
|[0.03041,0.0,5.19...|18.5|18.773199762238576|
|[0.03427,0.0,5.19...|19.5| 19.83566081337278|
|[0.03548,80.0,3.6...|20.9|21.953744568879323|
|[0.03871,52.5,5.3...|23.2|26.581897452110645|
|[0.04113,25.0,4.8...|28.0| 28.52535240376085|
|[0.04294,28.0,15....|20.6|26.441163433032578|
|[0.0456,0.0,13.89...|23.3|27.512838027746334|
|[0.04666,80.0,1.5...|30.3|  33.5179042868642|
|[0.04684,0.0,3.41...|22.6|26.736344654313733|
|[0.05083,0.0,5.19...|22.2|21.924160418734633|
|[0.0578,0.0,2.46,...|37.2| 32.54334946626332|
|[0.05789,12.5,6.0...|22.0|21.376120711498945|
|[0.06129,20.0,3.3...|46.0| 41.18223116234166|
|[0.06617,0.0,3.24...|19.3| 21.57024725649697|
|[0.0686,0.0,

In [None]:
# Parameters of the fitted linear model: one coefficient per entry of the
# 'Attributes' feature vector, plus the intercept term.
coeff = regressor.coefficients
intr = regressor.intercept

print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.0916, 0.0362, 0.0576, 3.3814, -18.2806, 5.0691, -0.03, -1.3968, 0.2014, -0.0122, -0.9297, 0.0061, -0.3243])
The Intercept of the model is : 29.168350


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Random-forest regressor on the same feature vector and target. The seed is
# fixed because tree building is randomized; without it the predictions and
# the RMSE below differ from run to run.
rf = RandomForestRegressor(featuresCol="Attributes", labelCol='MEDV', seed=42)
regressor_train = rf.fit(train_data)

# Score the held-out test set; transform() appends a 'prediction' column.
predictions = regressor_train.transform(test_data)

# Show a few example rows: prediction next to the true MEDV and its features.
predictions.select("prediction", "MEDV", "Attributes").show(5)

# Root mean squared error of the random-forest predictions on the test set.
evaluator = RegressionEvaluator(
    labelCol="MEDV", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+----+--------------------+
|        prediction|MEDV|          Attributes|
+------------------+----+--------------------+
| 33.37830253369978|35.4|[0.01311,90.0,1.2...|
|42.325253541840716|44.0|[0.01538,90.0,3.7...|
|29.701772260737908|33.0|[0.01951,17.5,1.3...|
|27.018383637204636|30.8|[0.02763,75.0,2.9...|
| 20.42032148169011|18.5|[0.03041,0.0,5.19...|
+------------------+----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 3.8194


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the linear-regression test predictions (`pred.predictions`) with
# several regression metrics, reusing one evaluator and overriding its metric
# per call.
# NOTE(review): the name `eval` shadows Python's builtin eval(); it is kept
# unchanged here in case other cells refer to it.
eval = RegressionEvaluator(predictionCol="prediction", labelCol="MEDV", metricName="rmse")
lr_predictions = pred.predictions

# Root mean squared error (the evaluator's configured metric)
rmse = eval.evaluate(lr_predictions)
print("RMSE: %.3f" % rmse)

# Mean squared error
mse = eval.evaluate(lr_predictions, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean absolute error
mae = eval.evaluate(lr_predictions, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

# Coefficient of determination (r2)
r2 = eval.evaluate(lr_predictions, {eval.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 5.406
MSE: 29.220
MAE: 3.442
r2: 0.668


In [None]:
# Load the credit-card CSV with a header row. inferSchema=True makes Spark
# type the columns from the data; without it every column is read as a string.
# This matches how the housing CSV is loaded earlier in the notebook.
df2 = spark.read.csv('/content/creditcard.csv', header=True, inferSchema=True)

In [None]:
# Preview the first 10 rows of the credit-card DataFrame.
df2.show(10)

+----+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------+-----+
|Time|          V1|          V2|          V3|          V4|          V5|          V6|          V7|          V8|          V9|         V10|         V11|         V12|         V13|         V14|         V15|         V16|         V17|         V18|         V19|         V20|         V21|         V22|         V23|         V24|         V25|         V26|         V27|         V28|Amount|Class|
+----+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+-----

In [None]:
# Show summary statistics for the columns of the credit-card DataFrame.
df2.describe().show()

+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|summary|             Time|                  V1|                  V2|                  V3|                  V4|                  V5|                  V6|                  V7|                  V8|                  V9|                 V10|                 V11|                 V12|                 V13|                 V14|                 V15|  