In [1]:
# importing required modules

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
import time

In [2]:
# Start a Spark session named 'Dataframe' (getOrCreate returns an already
# active session if there is one instead of building a new one).

spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# loading train and test datasets
#
# Both files are read with header=True (first row supplies the column names)
# and inferSchema=True (Spark infers each column's type from the data).
# The elapsed time is measured with time.perf_counter(), a monotonic clock
# meant for timing, so the result is not distorted by system clock changes.

start = time.perf_counter()

df_train = spark.read.csv('trainX.csv', header=True, inferSchema=True)
df_test = spark.read.csv('testX.csv', header=True, inferSchema=True)

end = time.perf_counter()
duration = round(end - start, 2)  # seconds, two decimals

print(f"Time taken to load train and test datasets of size ~ 1GB: {duration} seconds")

Time taken to load train and test datasets of size ~ 1GB: 31.84 seconds


In [4]:
# Predictors are all columns of the train dataset except the target 'price_doc'.
# df_train.dtypes yields (column name, type) pairs; only the name is needed here.

features_list = [name for name, _dtype in df_train.dtypes if name != 'price_doc']

In [5]:
# Pack every predictor column of the train dataset into one vector column
# named 'features' (the column the regressor is later pointed at), then keep
# only that column together with the target 'price_doc'.

vector_assembler = VectorAssembler(
    inputCols=features_list,
    outputCol='features',
)
output = vector_assembler.transform(df_train)
data = output.select("features", "price_doc")

In [6]:
# transforming all predictors of test dataset into features, which is supported by pyspark for regression
#
# The assembler already configured for the train dataset (same inputCols and
# outputCol) is reused here rather than constructing an identical second one,
# so the train and test feature vectors are guaranteed to be built the same way.
#
# NOTE(review): only the 'features' column is kept, so no row identifier
# travels with the test rows; identifiers are attached later by row position.

output = vector_assembler.transform(df_test)
test = output.select("features")

In [7]:
# training random forest regressor on train dataset and then predicting results for test dataset

# Explicit seed: without it the forest's random sampling is not pinned, so the
# predictions can change from one session to the next.
RF_SEED = 42

start = time.perf_counter()  # monotonic clock intended for measuring elapsed time

# creating random forest object and fitting it on the train data
rf = RandomForestRegressor(featuresCol='features', labelCol='price_doc', seed=RF_SEED)
rfModel = rf.fit(data)

end = time.perf_counter()
duration = round(end - start, 2)  # covers model fitting only; prediction below is not timed

# predicting target variable for test dataset
# (transform() only defines the result; the rows are computed when an action
# such as show() or toPandas() is called on it)
predictions = rfModel.transform(test)

print(f"Time taken to train model on train dataset: {duration} seconds")

Time taken to train model on train dataset: 84.38 seconds


In [8]:
# Preview the predicted values; 20 rows is also show()'s default.
predictions.select("prediction").show(n=20)

+--------------------+
|          prediction|
+--------------------+
|1.2320796967519594E7|
|   6532912.798491937|
|   5854936.218142115|
|    6328923.54027323|
|  6347550.4442855045|
|   6016097.997534399|
|   5887345.307585415|
|   5887345.307585415|
|   5864824.758091529|
|    5954843.26027645|
|5.6019331012901545E7|
|   5887345.307585415|
| 5.609070458299409E7|
| 5.340167529423499E7|
|   5677944.965858667|
|   5887345.307585415|
|    5954843.26027645|
|   5887345.307585415|
|   5973890.273224078|
|   6082238.077010237|
+--------------------+
only showing top 20 rows



In [9]:
import pandas as pd

In [10]:
# Bring the prediction column into a pandas DataFrame and save it as CSV
# without the pandas index.
(
    predictions
    .select("prediction")
    .toPandas()
    .to_csv("spark_rf_pred.csv", index=False)
)

In [11]:
# Read ml1ch_test.csv and keep only its "row ID" column as a Series.
# NOTE(review): presumably these are the identifiers of the test rows — confirm.
row_id = pd.read_csv("ml1ch_test.csv")["row ID"]

In [13]:
# Load the predictions file written earlier back into a pandas DataFrame.
pred = pd.read_csv(filepath_or_buffer="spark_rf_pred.csv")

In [15]:
# Attach the row identifiers to the predictions.
# pandas aligns this assignment on the index, so a length mismatch would not
# raise: it would silently leave NaN identifiers or drop surplus ones.
# Fail loudly instead.
# NOTE(review): this still assumes both files list rows in the same order — confirm.
assert len(pred) == len(row_id), (
    f"prediction/row ID count mismatch: {len(pred)} predictions vs {len(row_id)} row IDs"
)
pred["row ID"] = row_id

In [17]:
# Keep exactly two columns, identifier first and prediction second.
pred = pred.loc[:, ["row ID", "prediction"]]

In [19]:
# Rename the 'prediction' column to the target name 'price_doc'.
# The renamed frame is rebound to `pred` instead of mutating with inplace=True,
# which keeps the step a plain input -> output expression.
pred = pred.rename(columns={'prediction': 'price_doc'})

In [21]:
# Save the final table ("row ID", "price_doc") as CSV without the pandas index.
pred.to_csv(path_or_buf='spark_rf.csv', index=False)

In [22]:
# Display the first five rows of the final table as a quick visual check.
pred.head(n=5)

Unnamed: 0,row ID,price_doc
0,Row3,12320800.0
1,Row6,6532913.0
2,Row11,5854936.0
3,Row12,6328924.0
4,Row14,6347550.0
