In [100]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator


from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [93]:

spark = (SparkSession.builder.appName('bigquery').getOrCreate())

pubg=spark.read.csv('C:/Users/faisa/OneDrive - Letterkenny Institute of Technology/2nd Semester/Big Data Analytics - Shagufta/Technical Project/PUBG/pubg_prediction/final.csv', header=True)


pubg_y=spark.read.csv('C:/Users/faisa/OneDrive - Letterkenny Institute of Technology/2nd Semester/Big Data Analytics - Shagufta/Technical Project/PUBG/pubg_prediction/final2.csv', header=True)


In [52]:
pubg.printSchema()

root
 |-- assists: string (nullable = true)
 |-- boosts: string (nullable = true)
 |-- damageDealt: string (nullable = true)
 |-- DBNOs: string (nullable = true)
 |-- headshotKills: string (nullable = true)
 |-- heals: string (nullable = true)
 |-- killPlace: string (nullable = true)
 |-- killPoints: string (nullable = true)
 |-- kills: string (nullable = true)
 |-- killStreaks: string (nullable = true)
 |-- longestKill: string (nullable = true)
 |-- matchDuration: string (nullable = true)
 |-- matchType: string (nullable = true)
 |-- maxPlace: string (nullable = true)
 |-- numGroups: string (nullable = true)
 |-- rankPoints: string (nullable = true)
 |-- revives: string (nullable = true)
 |-- rideDistance: string (nullable = true)
 |-- roadKills: string (nullable = true)
 |-- swimDistance: string (nullable = true)
 |-- teamKills: string (nullable = true)
 |-- vehicleDestroys: string (nullable = true)
 |-- walkDistance: string (nullable = true)
 |-- weaponsAcquired: string (nullable = 

In [53]:
cols = pubg.columns
print(cols)

['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints', 'match_mean', 'match_median', 'totalPlayers', 'teamSize', 'killsNorm', 'damageDealtNorm', 'normMatchType', 'totalDistance', 'maxPossibleKills', 'itemsUsed', 'itemsPerDistance', 'killsPerDistance', 'damageDealtPerDistance', 'maxTeamKills', 'totalTeamKills', 'headshotKillRate', 'itemsUsedPerTeam', 'percKill', 'percTeamKills', 'meanTeamKillPlace']


In [94]:

for col in cols:
    pubg = pubg.withColumn(col, pubg[col].cast(FloatType()))


# #Creating feature vector
# vectorAssembler = VectorAssembler(inputCols = cols, outputCol = 'features')

# #Transforming the dataframe 
# pubg_df=vectorAssembler.transform(pubg)

# #Selecting features and target variable from the dataframe
# pubg_df = pubg_df.select(['features', 'winPlacePerc'])
# pubg_df.show(3)

#Splitting the data into test and train
X_train, X_val, y_train, y_val = train_test_split(pubg.toPandas(), pubg_y.toPandas(), train_size=0.7)


In [96]:
train_df.columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
       'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'match_mean', 'match_median', 'totalPlayers', 'teamSize', 'killsNorm',
       'damageDealtNorm', 'normMatchType', 'totalDistance', 'maxPossibleKills',
       'itemsUsed', 'itemsPerDistance', 'killsPerDistance',
       'damageDealtPerDistance', 'maxTeamKills', 'totalTeamKills',
       'headshotKillRate', 'itemsUsedPerTeam', 'percKill', 'percTeamKills',
       'meanTeamKillPlace'],
      dtype='object')

In [73]:
def calculate_error(cl,name):
  print(name)
  print('Mean Absolute Error is {:.5f}'.format(mean_absolute_error(y_val, cl.predict(X_val))))
  print('R2 score is {:.2%}'.format(r2_score(y_val, cl.predict(X_val))))

In [101]:
linear = LinearRegression(copy_X=True)
linear.fit(X_train,y_train)
calculate_error(linear,"linear")

ridge = Ridge(copy_X=True)
ridge.fit(X_train,y_train)
calculate_error(ridge,"ridge")

lasso = Lasso(copy_X=True)
lasso.fit(X_train,y_train)
calculate_error(lasso,"lasso")

elastic = ElasticNet(copy_X=True)
elastic.fit(X_train,y_train)
calculate_error(elastic,"elastic")

ada = AdaBoostRegressor(learning_rate=0.8)
ada.fit(X_train,y_train)
calculate_error(ada,"Adaboost")

GBR = GradientBoostingRegressor(learning_rate=0.8)
GBR.fit(X_train,y_train)
calculate_error(GBR,"GBR")

forest = RandomForestRegressor(n_estimators=10)
forest.fit(X_train,y_train)
calculate_error(forest,"forest")

tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
calculate_error(tree,"tree")

linear
Mean Absolute Error is 0.00679
R2 score is -0.13%
ridge
Mean Absolute Error is 0.01005
R2 score is 16.19%
lasso
Mean Absolute Error is 0.01250
R2 score is -8.91%
elastic
Mean Absolute Error is 0.01228
R2 score is -8.28%
Adaboost
Mean Absolute Error is 0.00203
R2 score is 91.71%
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
GBR
Mean Absolute Error is 0.00114
R2 score is 94.86%
forest
Mean Absolute Error is 0.00127
R2 score is 93.97%
tree
Mean Absolute Error is 0.00202
R2 score is 81.58%


In [83]:

#import decision tree and train the model
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'winPlacePerc')
dt_model = dt.fit(train_df)

dt_predictions = dt_model.transform(test_df)
dt_predictions.select("prediction","winPlacePerc","features").show(5)

#Calculating test score

dt_evaluator = RegressionEvaluator(predictionCol="prediction",\
labelCol="winPlacePerc",metricName="r2")

print("R Squared (R2) on test data = %g" % dt_evaluator.evaluate(dt_predictions))

#Calculating RMSE
dt_evaluator = RegressionEvaluator(
    labelCol="winPlacePerc", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: assists, boosts, damageDealt, DBNOs, headshotKills, heals, killPlace, killPoints, kills, killStreaks, longestKill, matchDuration, matchType, maxPlace, numGroups, rankPoints, revives, rideDistance, roadKills, swimDistance, teamKills, vehicleDestroys, walkDistance, weaponsAcquired, winPoints, match_mean, match_median, totalPlayers, teamSize, killsNorm, damageDealtNorm, normMatchType, totalDistance, maxPossibleKills, itemsUsed, itemsPerDistance, killsPerDistance, damageDealtPerDistance, maxTeamKills, totalTeamKills, headshotKillRate, itemsUsedPerTeam, percKill, percTeamKills, meanTeamKillPlace'