# Predicting citation count using Gradient-Boosted Decision Trees on paper abstracts

In [18]:
from pyspark import SparkFiles, SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [2]:
spark = SparkSession.builder.enableHiveSupport().appName('CitationCount').getOrCreate()
sc = spark.sparkContext

In [None]:
df_filter = spark.read.option("header", True).csv("gs://msca-bdp-student-gcs/Group5_Final_Project/raw_data/clean_filtered")

In [6]:
df_filter = df_filter.select('abstract', 'n_citation')

In [7]:
(training_data, testing_data) = df_filter.randomSplit([0.8, 0.2])

In [8]:
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cv = CountVectorizer(inputCol="filtered", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
assembler = VectorAssembler(inputCols=["features"], outputCol="assembled_features")
gbt = GBTRegressor(featuresCol="features", labelCol="n_citation", maxIter=10)

In [9]:
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, assembler, gbt])

In [None]:
model = pipeline.fit(training_data)

23/03/06 08:03:36 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/03/06 08:04:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.2 MiB
23/03/06 08:04:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.2 MiB
23/03/06 08:04:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 10.2 MiB
23/03/06 08:04:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 10.2 MiB
23/03/06 08:05:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 12.8 MiB
23/03/06 08:05:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1034.3 KiB
23/03/06 08:05:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 14.2 MiB
23/03/06 08:07:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task b

In [None]:
predictions = model.transform(testing_data)

In [None]:
evaluator = RegressionEvaluator(labelCol="n_citation", predictionCol="prediction", metricName="rmse")


In [None]:
rmse = evaluator.evaluate(predictions)

23/03/06 08:29:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 10.2 MiB
23/03/06 08:29:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 10.2 MiB
                                                                                

In [None]:
rmse

215.83083874557414