<a href="https://colab.research.google.com/github/dawamassidiqi/Machine-Learning/blob/main/Super%20Vector%20Machine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [104]:
# Download Java Virtual Machine (JVM)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [105]:
# Download Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
# Unzip the file
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

In [106]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

In [107]:
# Install library for finding Spark
!pip install -q findspark
# Import the libary
import findspark
# Initiate findspark
findspark.init()
# Check the location for Spark
findspark.find()

'/content/spark-3.0.0-bin-hadoop3.2'

In [108]:
import pyspark
from pyspark.sql import SparkSession
#SparkSession is now the entry point of Spark
#SparkSession can also be construed as gateway to spark libraries
  
#create instance of spark class
spark=SparkSession.builder.appName('housing_price_model').getOrCreate()
  
#create spark dataframe of input csv file
df=spark.read.csv('/content/sample_data/california_housing_test.csv',inferSchema=True,header=True)
df.show(10)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
|  -119.56|   36.51|    

In [109]:
#prints structure of dataframe along with datatype
df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [110]:
#In our predictive model, below are the columns
df.columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [111]:
#columns identified as features are as below:
#['Cruise_line','Age','Tonnage','passengers','length','cabins','passenger_density']
#to work on the features, spark MLlib expects every value to be in numeric form
#feature 'Cruise_line is string datatype
#using StringIndexer, string type will be typecast to numeric datatype
#import library strinindexer for typecasting
  
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='latitude',outputCol='cruise_cat')
indexed=indexer.fit(df).transform(df)
  
#above code will convert string to numeric feature and create a new dataframe
#new dataframe contains a new feature 'cruise_cat' and can be used further
#feature cruise_cat is now vectorized and can be used to fed to model
for item in indexed.head(5):
    print(item)
    print('\n')

Row(longitude=-122.05, latitude=37.37, housing_median_age=27.0, total_rooms=3885.0, total_bedrooms=661.0, population=1537.0, households=606.0, median_income=6.6085, median_house_value=344700.0, cruise_cat=80.0)


Row(longitude=-118.3, latitude=34.26, housing_median_age=43.0, total_rooms=1510.0, total_bedrooms=310.0, population=809.0, households=277.0, median_income=3.599, median_house_value=176500.0, cruise_cat=48.0)


Row(longitude=-117.81, latitude=33.78, housing_median_age=27.0, total_rooms=3589.0, total_bedrooms=507.0, population=1484.0, households=495.0, median_income=5.7934, median_house_value=270500.0, cruise_cat=74.0)


Row(longitude=-118.36, latitude=33.82, housing_median_age=28.0, total_rooms=67.0, total_bedrooms=15.0, population=49.0, households=11.0, median_income=6.1359, median_house_value=330000.0, cruise_cat=37.0)


Row(longitude=-119.67, latitude=36.33, housing_median_age=19.0, total_rooms=1241.0, total_bedrooms=244.0, population=850.0, households=237.0, median_income=2

In [112]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
#creating vectors from features
#Apache MLlib takes input if vector form
assembler=VectorAssembler(inputCols=['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'cruise_cat'],outputCol='features')
output=assembler.transform(indexed)
output.select('features','median_house_value').show(5)
#output as below

+--------------------+------------------+
|            features|median_house_value|
+--------------------+------------------+
|[27.0,3885.0,661....|          344700.0|
|[43.0,1510.0,310....|          176500.0|
|[27.0,3589.0,507....|          270500.0|
|[28.0,67.0,15.0,4...|          330000.0|
|[19.0,1241.0,244....|           81700.0|
+--------------------+------------------+
only showing top 5 rows



In [113]:
#final data consist of features and label which is crew.
final_data=output.select('features','median_house_value')
#splitting data into train and test
train_data,test_data=final_data.randomSplit([0.7,0.3])
train_data.describe().show()

+-------+------------------+
|summary|median_house_value|
+-------+------------------+
|  count|              2120|
|   mean| 204656.0316037736|
| stddev|113139.71957818244|
|    min|           22500.0|
|    max|          500001.0|
+-------+------------------+



In [114]:
test_data.describe().show()

+-------+------------------+
|summary|median_house_value|
+-------+------------------+
|  count|               880|
|   mean|208713.67954545454|
| stddev|113084.24605369722|
|    min|           47500.0|
|    max|          500001.0|
+-------+------------------+



In [115]:
#import LinearRegression library
from pyspark.ml.regression import LinearRegression
#creating an object of class LinearRegression
#object takes features and label as input arguments
ship_lr=LinearRegression(featuresCol='features',labelCol='crew')
#pass train_data to train model
trained_ship_model=ship_lr.fit(train_data)
#evaluating model trained for Rsquared error
ship_results=trained_ship_model.evaluate(train_data)
  
print('Rsquared Error :',ship_results.r2)
#R2 value shows accuracy of model is 92%
#model accuracy is very good and can be use for predictive analysis

IllegalArgumentException: ignored

In [116]:
#testing Model on unlabeled data
#create unlabeled data from test_data
#testing model on unlabeled data
unlabeled_data=test_data.select('features')
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|[1.0,83.0,15.0,32...|
|[3.0,2821.0,519.0...|
|[3.0,7689.0,1545....|
|[4.0,1346.0,213.0...|
|[4.0,1793.0,390.0...|
+--------------------+
only showing top 5 rows



In [117]:
predictions=trained_ship_model.transform(unlabeled_data)
predictions.show()
#below are the results of output from test data

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[1.0,83.0,15.0,32...|  172807.032326014|
|[3.0,2821.0,519.0...| 145449.3218103586|
|[3.0,7689.0,1545....|168495.38897775538|
|[4.0,1346.0,213.0...| 389991.2763938146|
|[4.0,1793.0,390.0...|199831.68218252243|
|[4.0,2372.0,361.0...| 209818.9794016584|
|[4.0,2937.0,648.0...|202956.54363046767|
|[4.0,6986.0,1217....|132385.05791640584|
|[4.0,11021.0,1565...| 292284.4010842854|
|[4.0,18123.0,3173...|319471.31908251636|
|[4.0,21988.0,4055...|192206.37763677404|
|[4.0,23915.0,4135...|211404.92761180399|
|[5.0,906.0,187.0,...|148800.95741835865|
|[5.0,1395.0,373.0...|133227.15089335683|
|[5.0,1922.0,489.0...| 96093.28072383671|
|[5.0,1998.0,500.0...|185808.68801913795|
|[5.0,2256.0,420.0...|218073.08347404475|
|[5.0,3846.0,786.0...|227835.42377252114|
|[5.0,4303.0,613.0...|206311.30806858174|
|[5.0,4413.0,804.0...| 205747.2465481292|
+--------------------+------------