In [None]:
# Import necessary libraries.
# findspark.init() is called before `import pyspark`; it is presumably what
# makes pyspark importable in this environment, so keep this order.
# NOTE(review): the Spark home below is a hard-coded absolute path and will
# only resolve on a machine with that exact layout — consider making it
# configurable (e.g. via the SPARK_HOME environment variable).
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used by every cell below.
# NOTE(review): the app name says 'LinearRegression', but the regressor
# imported below is DecisionTreeRegressor — confirm whether the name is stale.
spark = SparkSession.builder.appName('LinearRegression').getOrCreate()

# Spark SQL helpers and the ML building blocks (feature assembly, model,
# evaluation).
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Silence all Python warnings from this point on.
# NOTE(review): a blanket 'ignore' also hides deprecation and data warnings.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np

# Notebook-wide matplotlib defaults: figure size and colormap.
plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['image.cmap'] = 'viridis'
import pandas as pd

In [None]:
# Load the video-game sales CSV into a Spark dataframe. The first line of the
# file is treated as the header and column types are inferred from the data.
dataFrame_Initial = spark.read.csv(
    '../Datasets/VideoGamesSales.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Count the rows loaded from the CSV and report the total.
total_data_points = dataFrame_Initial.count()
print("Total data points:", total_data_points)

In [None]:
# Print the column names and inferred types of dataFrame_Initial, to check
# what inferSchema decided for each column before any cleaning is done.
dataFrame_Initial.printSchema()

In [None]:
# Summary statistics for the 'Year_of_Release' column only.
(
    dataFrame_Initial
    .select('Year_of_Release')
    .describe()
    .show()
)

In [None]:
# Number of rows released strictly after 2000 and strictly before 2017
# (i.e. 2001 through 2016). The count is the cell's displayed output.
(
    dataFrame_Initial
    .filter("Year_of_Release > 2000 AND Year_of_Release < 2017")
    .select('Year_of_Release')
    .count()
)

In [None]:
# Drop every row that contains a null in any column, then sort the remaining
# rows by 'Year_of_Release' (ascending).
dataFrame_Filtered = (
    dataFrame_Initial
    .na.drop()
    .orderBy('Year_of_Release')
)

# Peek at the first five rows of the cleaned, sorted dataframe.
dataFrame_Filtered.head(5)

In [None]:
# Append an 'ID' key column to the cleaned dataframe. The generated ids are
# unique and increasing, but not guaranteed to be consecutive.
dataFrame_wKey = dataFrame_Filtered.withColumn(
    'ID',
    monotonically_increasing_id(),
)

In [None]:
# Set column types to accurately reflect data: 'User_Score' becomes a float and
# 'Year_of_Release' an int.
#
# Values that cannot be parsed as numbers become null when cast. The earlier
# null-drop ran before this cast, so those new nulls would otherwise flow into
# the VectorAssembler further down, which cannot assemble null values. Rows
# with a null in either cast column are therefore removed here.
dataFrame_wKey = (
    dataFrame_wKey
    .withColumn('User_Score', col('User_Score').cast('float'))
    .withColumn('Year_of_Release', col('Year_of_Release').cast('int'))
    .na.drop(subset=['User_Score', 'Year_of_Release'])
)

In [None]:
# Print the schema of dataFrame_wKey to confirm that 'User_Score' and
# 'Year_of_Release' now carry the numeric types set by the casts.
dataFrame_wKey.printSchema()

In [None]:
# Columns kept for the analysis: the row key, descriptive fields, the sales
# figure, and the critic/user review fields.
columns_Useful = [
    'ID',
    'Name',
    'Platform',
    'Year_of_Release',
    'Genre',
    'Global_Sales',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
]

In [None]:
# Project dataFrame_wKey down to the columns listed in columns_Useful.
dataFrame_Useful = dataFrame_wKey.select(columns_Useful)

In [None]:
# Show the first 5 rows of dataFrame_Useful as a quick check that only the
# selected columns remain.
dataFrame_Useful.head(5)

In [None]:
# Summary statistics for dataFrame_Useful, collected into pandas and transposed
# so that each dataframe column becomes a row of the summary table.
pd_df_Useful = (
    dataFrame_Useful
    .describe()
    .toPandas()
    .T
)

In [None]:
# Columns fed to the model as predictors.
# NOTE(review): 'ID' is the synthetic row key added earlier in the notebook,
# not a property of the game — confirm it is really meant to be a feature.
input_Columns = [
    'ID',
    'Year_of_Release',
    'Critic_Score',
    'User_Score',
]

# Pack the predictor columns into a single vector column named 'Features'.
vector_Assembler = VectorAssembler(
    inputCols=input_Columns,
    outputCol='Features',
)

# Apply the assembler to the trimmed dataframe.
vector_Output = vector_Assembler.transform(dataFrame_Useful)

# Print the schema, which now includes the 'Features' column.
vector_Output.printSchema()

In [None]:
# Keep only the assembled 'Features' vector and the 'Global_Sales' label.
dataFrame_Features = vector_Output.select('Features', 'Global_Sales')

# Print the first five rows to confirm only those two columns remain.
print(dataFrame_Features.head(5))

In [None]:
# Randomly split the data into roughly 70% training rows and 30% test rows.
# The fixed seed keeps the split — and every metric computed from it —
# repeatable across kernel restarts for the same input data; without it each
# run evaluates the model on a different test set.
data_train, data_test = dataFrame_Features.randomSplit([0.7, 0.3], seed=42)

# Summary statistics for the training split.
data_train.describe().show()

# Summary statistics for the test split.
data_test.describe().show()

In [None]:
# Train a decision-tree regressor that predicts 'Global_Sales' from the
# assembled 'Features' vector, then score it on the held-out test split.
# The keyword is `featuresCol`; a misspelt keyword such as `featureCols` is
# rejected with a TypeError when the estimator is constructed.
regressor = DecisionTreeRegressor(featuresCol='Features', labelCol='Global_Sales')

# Fit the model. fit() returns the trained model, which has to be kept so that
# transform() can be called on it below.
regression_Model = regressor.fit(data_train)

# Make predictions; this adds a 'prediction' column to the test rows.
predictions = regression_Model.transform(data_test)

# Show example rows with the predicted value next to the actual label.
predictions.select("prediction", "Global_Sales", "Features").show(5)

# Compare the 'prediction' column against the 'Global_Sales' label using RMSE.
evaluator = RegressionEvaluator(labelCol='Global_Sales', predictionCol='prediction', metricName='rmse')

rootMeanSquaredError = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rootMeanSquaredError)