In [1]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MinMaxScalerExample").getOrCreate()

#create spark dataframe of input csv file
df=spark.read.csv('../games.csv'
                  ,inferSchema=True,header=True)

In [2]:
df = df.drop("first", "time_control_name", "game_end_reason", "created_at", "lexicon", "rating_mode", "game_id")

In [3]:
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='initial_time_seconds',outputCol='kaggle_cat')
indexed=indexer.fit(df).transform(df)

for item in indexed.head():
    print(item)

1
1200
0
1
674.844273805618
0.0


In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler=VectorAssembler(inputCols=['initial_time_seconds',
 'increment_seconds',
 'max_overtime_minutes'],outputCol='features')
output=assembler.transform(indexed)

In [5]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(output)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(output)
scaledData.show()

+------+--------------------+-----------------+--------------------+---------------------+----------+----------------+--------------------+
|winner|initial_time_seconds|increment_seconds|max_overtime_minutes|game_duration_seconds|kaggle_cat|        features|      scaledFeatures|
+------+--------------------+-----------------+--------------------+---------------------+----------+----------------+--------------------+
|     1|                1200|                0|                   1|     674.844273805618|       0.0|[1200.0,0.0,1.0]|[0.33054393305439...|
|     1|                3600|                0|                   1|     364.214417934418|       2.0|[3600.0,0.0,1.0]|       [1.0,0.0,0.1]|
|     1|                 900|                0|                   5|     492.268262147903|       1.0| [900.0,0.0,5.0]|[0.24686192468619...|
|     0|                3600|                0|                   1|     350.861140966415|       2.0|[3600.0,0.0,1.0]|       [1.0,0.0,0.1]|
|     0|            

In [8]:
#final data consist of features and label which is crew.
final_data=output.select('features','game_duration_seconds')
train_data,test_data=final_data.randomSplit([0.8,0.2])
train_data.describe().show()

+-------+---------------------+
|summary|game_duration_seconds|
+-------+---------------------+
|  count|                58337|
|   mean|    485.2319373445769|
| stddev|     334.886676894674|
|    min|     7.96298909187317|
|    max|     4444.82164502144|
+-------+---------------------+



In [9]:
#import LinearRegression library
from pyspark.ml.regression import LinearRegression
#creating an object of class LinearRegression
#object takes features and label as input arguments
ship_lr = LinearRegression(featuresCol='features',labelCol='game_duration_seconds')
#pass train_data to train model
kaggle_model=ship_lr.fit(train_data)
#evaluating model trained for Rsquared error
ship_results=kaggle_model.evaluate(train_data)
  
print('Rsquared Error :',ship_results.r2)
#R2 value shows accuracy of model is 92%
#model accuracy is very good and can be use for predictive analysis

Rsquared Error : 0.147515988901467
