In [48]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Missing").getOrCreate()

In [49]:
training = spark.read.csv("50_Startups.csv",header=True,inferSchema=True)

In [50]:
training.show()

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|
|101913.08|     110594.11|      229160.95|   Florida|146121.95|
|100671.96|      91790.61|      249744.55|California| 144259.4|
| 93863.75|     127320.38|      249839.4

In [51]:
training.printSchema()

root
 |-- R&D Spend: double (nullable = true)
 |-- Administration: double (nullable = true)
 |-- Marketing Spend: double (nullable = true)
 |-- State: string (nullable = true)
 |-- Profit: double (nullable = true)



In [52]:
## we're going to group all features with the help of vector assembler

In [53]:
training.columns

['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']

In [54]:
##["R&D Spend","Administration","Marketing Spend"]---> new feature----> independent feature

In [55]:
from pyspark.ml.feature import VectorAssembler
featureAssembler=VectorAssembler(inputCols=["R&D Spend","Administration"],outputCol="Independent Features")

In [56]:
output=featureAssembler.transform(training)
output.show()


+---------+--------------+---------------+----------+---------+--------------------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|Independent Features|
+---------+--------------+---------------+----------+---------+--------------------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83| [165349.2,136897.8]|
| 162597.7|     151377.59|      443898.53|California|191792.06|[162597.7,151377.59]|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|[153441.51,101145...|
|144372.41|     118671.85|      383199.62|  New York|182901.99|[144372.41,118671...|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|[142107.34,91391.77]|
| 131876.9|      99814.71|      362861.36|  New York|156991.12| [131876.9,99814.71]|
|134615.46|     147198.87|      127716.82|California|156122.51|[134615.46,147198...|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|[130298.13,145530...|
|120542.52|     148718.95|      311613.29|  New York|152211.77|[1

In [57]:
finalized_data=output.select("Independent Features","Profit")

In [58]:
finalized_data.show()

+--------------------+---------+
|Independent Features|   Profit|
+--------------------+---------+
| [165349.2,136897.8]|192261.83|
|[162597.7,151377.59]|191792.06|
|[153441.51,101145...|191050.39|
|[144372.41,118671...|182901.99|
|[142107.34,91391.77]|166187.94|
| [131876.9,99814.71]|156991.12|
|[134615.46,147198...|156122.51|
|[130298.13,145530...| 155752.6|
|[120542.52,148718...|152211.77|
|[123334.88,108679...|149759.96|
|[101913.08,110594...|146121.95|
|[100671.96,91790.61]| 144259.4|
|[93863.75,127320.38]|141585.52|
|[91992.39,135495.07]|134307.35|
|[119943.24,156547...|132602.65|
|[114523.61,122616...|129917.04|
|[78013.11,121597.55]|126992.93|
|[94657.16,145077.58]|125370.37|
|[91749.16,114175.79]| 124266.9|
| [86419.7,153514.11]|122776.86|
+--------------------+---------+
only showing top 20 rows



In [59]:
from pyspark.ml.regression import LinearRegression
### Train-Test split
train_data, test_data=finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol="Independent Features",labelCol="Profit")
regressor= regressor.fit(train_data)

In [60]:
### Coefficients
regressor.coefficients

DenseVector([0.8573, -0.0977])

In [61]:
### Intercepts
regressor.intercept

60303.32041913629

In [62]:
### Prediction
pred_results=regressor.evaluate(test_data)

In [63]:
pred_results.predictions.show()

+--------------------+---------+------------------+
|Independent Features|   Profit|        prediction|
+--------------------+---------+------------------+
|     [0.0,135426.92]| 42559.73| 47067.05185235753|
|   [542.05,51743.15]| 35673.41| 55710.80528339601|
| [1315.46,115816.21]| 49490.75| 50111.55250894356|
|[22177.74,154806.14]| 65200.33| 64186.93136421743|
|[28663.76,127056.21]| 90708.19| 72459.88662001677|
|[64664.71,139553.16]|107404.34|102103.66765499633|
|[66051.52,182645.56]|103282.38| 99080.90386749218|
| [86419.7,153514.11]|122776.86| 119390.6723377567|
|[91992.39,135495.07]|134307.35|125929.51814112751|
|[100671.96,91790.61]| 144259.4|137642.45392758338|
|[101913.08,110594...|146121.95|136868.71645915782|
|[114523.61,122616...|129917.04|146505.20937266207|
| [165349.2,136897.8]|192261.83|188684.44271058965|
+--------------------+---------+------------------+



In [68]:
pred_results.meanSquaredError

100077026.20027742