In [1]:
# importing required modules

import time

import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.sql import SparkSession

In [2]:
# starting the spark session used by every later cell
# (getOrCreate reuses an already-running session if there is one)

session_builder = SparkSession.builder.appName('Dataframe')
spark = session_builder.getOrCreate()

In [3]:
# reading the train and test CSV files into spark dataframes and timing the load

start = time.time()

# both files are read the same way: first row is the header, column types are inferred
csv_options = dict(header=True, inferSchema=True)
df_train = spark.read.csv('trainX.csv', **csv_options)
df_test = spark.read.csv('testX.csv', **csv_options)

elapsed = time.time() - start
duration = round(elapsed, 2)

print(f"Time taken to load train and test datasets of size ~ 1GB: {duration} seconds")

Time taken to load train and test datasets of size ~ 1GB: 30.63 seconds


In [4]:
# collecting every column name except the target ('price_doc') as the predictors

features_list = [
    name
    for name, _dtype in df_train.dtypes
    if name != 'price_doc'
]

In [5]:
# packing all predictor columns of the train dataset into one 'features' vector column
# (the input format pyspark's regressors work with) and keeping it next to the target

vector_assembler = VectorAssembler(outputCol='features', inputCols=features_list)
data = vector_assembler.transform(df_train).select("features", "price_doc")

In [6]:
# packing the same predictor columns of the test dataset into a 'features' vector column;
# the test data carries no target, so only the vector is kept

vector_assembler = VectorAssembler(outputCol='features', inputCols=features_list)
test = vector_assembler.transform(df_test).select("features")

In [7]:
# fitting a gradient boosting regressor on the train dataset (timed),
# then producing predictions for the test dataset

start = time.time()

# building the regressor and fitting it; both steps sit inside the timed window
gb = GBTRegressor(labelCol='price_doc', featuresCol='features')
gbModel = gb.fit(data)

elapsed = time.time() - start
duration = round(elapsed, 2)

# scoring the test dataset; this call is outside the timed window above
predictions = gbModel.transform(test)

print(f"Time taken to train model on train dataset: {duration} seconds")

Time taken to train model on train dataset: 90.01 seconds


In [8]:
# previewing the predicted prices for the test dataset
prediction_column = predictions.select("prediction")
prediction_column.show()

+--------------------+
|          prediction|
+--------------------+
|1.3763556606222438E7|
|   6126187.156817038|
|   4787062.119890908|
|   6623505.843747668|
|   5887944.183244572|
|   5266221.787535881|
|   5438492.369136855|
|   4982263.435258632|
|   5308963.605339882|
|   4878210.567323259|
| 5.577175418372143E7|
|   5466276.734715932|
|  5.49598951212593E7|
| 5.673974972077665E7|
|   5448177.341533182|
|   5953909.823199085|
|   5322049.986872624|
|   5580532.163515232|
|   5648496.175198194|
|    5626763.59324176|
+--------------------+
only showing top 20 rows



In [9]:
# attaching row IDs to the predictions and writing the final 'spark_gb.csv' file

# Collect the prediction column to pandas once. The raw export is still written
# so the file stays available, but it is no longer read back: the CSV round-trip
# only added I/O and is not guaranteed to reproduce the floats bit-for-bit.
pred_raw = predictions.select("prediction").toPandas()
pred_raw.to_csv("spark_gb_pred.csv", index=False)

row_id = pd.read_csv("ml1ch_test.csv")

# Row IDs are paired with predictions purely by position, so the two frames must
# have the same number of rows; otherwise pandas would silently fill IDs with NaN
# or drop trailing IDs.
# NOTE(review): this also assumes ml1ch_test.csv lists its rows in the same order
# as testX.csv - confirm against how those files were produced.
if len(pred_raw) != len(row_id):
    raise ValueError(
        f"Row count mismatch: {len(pred_raw)} predictions vs "
        f"{len(row_id)} row IDs in ml1ch_test.csv"
    )

# Build the output frame in its final column order with the target column name,
# instead of renaming in place.
pred = pd.DataFrame(
    {
        "row ID": row_id["row ID"].to_numpy(),
        "price_doc": pred_raw["prediction"].to_numpy(),
    }
)
pred.to_csv('spark_gb.csv', index=False)

pred.head()

Unnamed: 0,row ID,price_doc
0,Row3,13763560.0
1,Row6,6126187.0
2,Row11,4787062.0
3,Row12,6623506.0
4,Row14,5887944.0
