In [1]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
from pyspark.sql.functions import *

In [2]:
# Read the panel CSV, inferring column types and taking column names from the header row.
file_location = "/FileStore/tables/df_panel_fix.csv"
csv_reader = (
    spark.read.format("CSV")
    .option("inferSchema", True)
    .option("header", True)
)
df = csv_reader.load(file_location)

# Show the first five rows as a quick sanity check of the load.
display(df.take(5))


_c0,province,specific,general,year,gdp,fdi,rnr,rr,i,fr,reg,it
0,Anhui,147002.0,,1996,2093.3,50661,0.0,0.0,0.0,1128873,East China,631930
1,Anhui,151981.0,,1997,2347.32,43443,0.0,0.0,0.0,1356287,East China,657860
2,Anhui,174930.0,,1998,2542.96,27673,0.0,0.0,0.0,1518236,East China,889463
3,Anhui,285324.0,,1999,2712.34,26131,,,,1646891,East China,1227364
4,Anhui,195580.0,32100.0,2000,2902.09,31847,0.0,0.0,0.0,1601508,East China,1499110


In [3]:
# Register the raw frame under the table name the SQL below refers to.
df.createOrReplaceTempView("fiscal_stats")

# Per-year sums of the `it` and `fr` columns, ordered from the earliest year.
# `group by 1` groups on the first selected column, i.e. `year`.
yearly_totals_query = """
select year, sum(it) as total_yearly_it, sum(fr) as total_yearly_fr
from fiscal_stats
group by 1
order by year asc
"""
sums = spark.sql(yearly_totals_query)

sums.show()

In [4]:
# Summary statistics for every column, pulled into pandas and flipped so that
# each source column becomes one row of the displayed table.
summary_pdf = df.describe().toPandas()
summary_pdf.T


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
_c0,360,179.5,104.06728592598157,0,359
province,360,,,Anhui,Zhejiang
specific,356,583470.7303370787,654055.3290782663,8964.0,3937966.0
general,169,309127.53846153844,355423.5760674793,0.0,1737800.0
year,360,2001.5,3.4568570586927794,1996,2007
gdp,360,4428.653416666667,4484.668659976412,64.98,31777.01
fdi,360,196139.38333333333,303043.97011891654,2,1743140
rnr,294,0.0355944252244898,0.16061503029299648,0.0,1.214285714
rr,296,0.059688621057432424,0.15673351824073453,0.0,0.84


In [5]:
# Normalise column types before modelling.
#
# The previous version cast every numeric column to IntegerType. That silently
# truncates fractional data: the describe() output for this dataset shows
# gdp with a minimum of 64.98 and rnr / rr with maxima of 1.214285714 and 0.84,
# so an integer cast collapses rnr / rr to 0 or 1 and drops the decimals of gdp
# (which is later used as a model feature).
#
# Columns that carry fractional values are therefore cast to double, which is
# lossless for them. `i` is cast to double as well: the sample rows only show
# 0.0, so whether it holds fractions is unconfirmed, and double preserves
# whatever is there. The remaining columns keep their integer cast.
integer_columns = ["specific", "general", "year", "fdi", "fr"]
double_columns = ["gdp", "rnr", "rr", "i"]

df2 = df
for column_name in integer_columns:
    df2 = df2.withColumn(column_name, col(column_name).cast(IntegerType()))
for column_name in double_columns:
    # A type-name string is accepted by Column.cast, so no extra import is needed.
    df2 = df2.withColumn(column_name, col(column_name).cast("double"))

In [6]:
# Print the column names and data types of df2 to check the result of the casts.
df2.printSchema()

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Columns packed into the single vector column that the regressors consume.
feature_columns = ['gdp', 'fdi']

assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Adds a `features` column to every row of df2; no rows are held out here.
train_df = assembler.transform(df2)

In [8]:
# Quick look at two of the columns carried through the assembler step.
column_preview = train_df.select("specific", "year")
column_preview.show()

In [9]:
# Fit an ordinary linear regression of `it` on the assembled feature vector.
lr = LinearRegression(labelCol='it', featuresCol='features')
lr_model = lr.fit(train_df)

# The training summary holds the in-sample fit statistics reported below.
trainingSummary = lr_model.summary

report_lines = (
    "Coefficients: " + str(lr_model.coefficients),
    "RMSE: %f" % trainingSummary.rootMeanSquaredError,
    "R2: %f" % trainingSummary.r2,
)
for report_line in report_lines:
    print(report_line)


In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the fitted linear model on train_df and preview the first five rows.
lr_predictions = lr_model.transform(train_df)
preview_columns = ["prediction", "it", "features"]
lr_predictions.select(*preview_columns).show(5)

# Evaluator that computes R2 of the `prediction` column against the `it` label.
lr_evaluator = RegressionEvaluator(
    labelCol="it",
    predictionCol="prediction",
    metricName="r2",
)



In [11]:
# lr_predictions is produced by transforming train_df, the same frame the model
# was fitted on, so this is an in-sample score. The message previously claimed
# "test data", which overstated what the number means.
print("R Squared (R2) on training data = %g" % lr_evaluator.evaluate(lr_predictions))

In [12]:
# Optimiser diagnostics from the linear-regression training summary.
iteration_count = trainingSummary.totalIterations
objective_trace = str(trainingSummary.objectiveHistory)

print("numIterations: %d" % iteration_count)
print("objectiveHistory: %s" % objective_trace)

# Per-row residuals of the fit.
trainingSummary.residuals.show()

In [13]:
# This cell used to reference `test_df`, which no visible cell defines, so it
# failed with NameError on a fresh kernel. train_df is the only assembled frame
# available, so the model is applied to it instead.
# NOTE(review): these are in-sample predictions; a held-out split would have to
# be made before fitting for a genuine test evaluation.
predictions = lr_model.transform(train_df)
predictions.select("prediction","it","features").show()

In [14]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a decision-tree regressor of `it` on the assembled feature vector.
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'it')
dt_model = dt.fit(train_df)

# Predictions are made on train_df, the same frame used for fitting.
dt_predictions = dt_model.transform(train_df)

dt_evaluator = RegressionEvaluator(
    labelCol="it", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)

# The label previously said "test data", but the RMSE above is computed on the
# training frame, so the message now says so.
print("Root Mean Squared Error (RMSE) on training data = %g" % rmse)


In [15]:
from pyspark.ml.regression import GBTRegressor

# Fit a gradient-boosted-trees regressor of `it` on the assembled feature
# vector, limited to 10 boosting iterations.
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'it', maxIter=10)
gbt_model = gbt.fit(train_df)

# Predictions are made on train_df, the same frame used for fitting.
gbt_predictions = gbt_model.transform(train_df)
gbt_predictions.select('prediction', 'it', 'features').show(5)


gbt_evaluator = RegressionEvaluator(
    labelCol="it", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)

# The label previously said "test data", but the RMSE above is computed on the
# training frame, so the message now says so.
print("Root Mean Squared Error (RMSE) on training data = %g" % rmse)