# Predicting citation count using Gradient-Boosted Decision Trees on paper titles

In [1]:
from pyspark import SparkFiles, SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [2]:
spark = SparkSession.builder.enableHiveSupport().appName('CitationCount').getOrCreate()
sc = spark.sparkContext

In [None]:
df_filter = spark.read.option("header", True).csv("gs://msca-bdp-student-gcs/Group5_Final_Project/raw_data/clean_filtered")

In [6]:
df_filter = df_filter.select('title', 'n_citation')

In [7]:
(training_data, testing_data) = df_filter.randomSplit([0.8, 0.2])

In [8]:
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cv = CountVectorizer(inputCol="filtered", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
assembler = VectorAssembler(inputCols=["features"], outputCol="assembled_features")
gbt = GBTRegressor(featuresCol="features", labelCol="n_citation", maxIter=10)

In [9]:
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, assembler, gbt])

In [10]:
model = pipeline.fit(training_data)

23/03/06 07:31:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/03/06 07:32:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/03/06 07:32:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/03/06 07:32:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.5 MiB
23/03/06 07:32:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.5 MiB
23/03/06 07:33:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.3 MiB
23/03/06 07:33:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.8 MiB
23/03/06 07:34:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.8 MiB
23/03/06 07:34:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

In [11]:
predictions = model.transform(testing_data)

In [12]:
evaluator = RegressionEvaluator(labelCol="n_citation", predictionCol="prediction", metricName="rmse")


In [13]:
rmse = evaluator.evaluate(predictions)

23/03/06 07:47:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.5 MiB
23/03/06 07:48:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.5 MiB
                                                                                

In [14]:
rmse

169.60625603020185