In [0]:
#SETUP

#importing functions and creating spark session
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# ---- Load the input files into DataFrames ----

# Journal metadata: a CSV whose first row holds the column names.
# To use a different journal file, change only path_journal_info.
path_journal_info = "/FileStore/tables/journal_information.csv"
journal_info = spark.read.csv(path_journal_info, header=True)

# Publication records: JSON (gzip-compressed, going by the file extension).
# To use a different publications file, change only path_df.
path_df = "/FileStore/tables/large.json.gz"
publications_df = spark.read.format("json").load(path_df)

In [0]:
#pre-processing

# Clean up the journal table in one chained expression:
#   1. rename the columns whose names contain spaces, to make querying easier
#   2. drop rows that have no journal name
journal_info = (
    journal_info
    .withColumnRenamed("Journal Name", "journal_name")
    .withColumnRenamed("Category & Journal Quartiles", "category_and_journal_quartiles")
    .na.drop(subset=['journal_name'])
)

In [0]:
#question 1

# Total number of rows (papers) in the publications file
total_count = publications_df.count()

# Number of distinct 'corpusid' values across those rows
distinct_id_count = publications_df.select("corpusid").distinct().count()

# Report both figures
print(f"Total number of papers: {total_count}")
print(f"Number of distinct IDs: {distinct_id_count}")

# IDs are unique exactly when the distinct count equals the row count
print(
    "All papers have unique IDs."
    if distinct_id_count == total_count
    else "There are duplicate IDs in the dataset."
)

Total number of papers: 150000
Number of distinct IDs: 150000
All papers have unique IDs.


In [0]:
#question 1 further analysis

# One output row per (paper, field-of-study category) pair
category_df = publications_df.withColumn(
    "category",
    explode(col("s2fieldsofstudy.category")),
)

# Ignore entries without a category, then count papers per category,
# most frequent category first
filtered_category = category_df.where(col("category").isNotNull())
category_counts = (
    filtered_category
    .groupBy("category")
    .count()
    .orderBy(desc("count"))
)

# Show the result table
category_counts.display()


category,count
Medicine,65619
Engineering,24799
Biology,19676
Computer Science,19043
Chemistry,18220
Environmental Science,16847
Materials Science,16012
Physics,15303
Psychology,9010
Political Science,7189


Databricks visualization. Run in Databricks to view.

In [0]:
#question 2

# Attach the length of each paper's authors array as num_authors
author_no = publications_df.withColumn("num_authors", size("authors"))

# Global (ungrouped) mean of num_authors; the aggregate yields a single row
avg_authors = author_no.groupBy().avg("num_authors").first()[0]
print("Average number of authors per paper:", avg_authors)


Average number of authors per paper: 2.81628


In [0]:
#question 2 further analysis - filtering out papers with no authors

# Same author count as before, but keep only papers that list at least one author
author_no = (
    publications_df
    .withColumn("num_authors", size("authors"))
    .where(col("num_authors") > 0)
)

# Global mean over the remaining papers; the aggregate yields a single row
avg_authors = author_no.groupBy().avg("num_authors").first()[0]
print("Average number of authors per paper:", avg_authors)


Average number of authors per paper: 2.8947072689398095


In [0]:
#question 3

# Keep only papers whose journal name is present (neither null nor empty)
has_journal_name = col("journal.name").isNotNull() & (col("journal.name") != '')
filtered_df = publications_df.where(has_journal_name)

# Count the different journal names that remain
journal_count = filtered_df.select(col("journal.name")).distinct().count()

print("Number of distinct journals:", journal_count)

Number of distinct journals: 33916


In [0]:
#question 3 further analysis - count of publications per journal

# Keep only papers whose journal name is present (neither null nor empty)
Q3_further_analysis = publications_df.where(
    col("journal.name").isNotNull() & (col("journal.name") != '')
)

# Publications per journal, largest first, restricted to the 10 biggest journals
journal_pub_count = (
    Q3_further_analysis
    .groupBy("journal.name")
    .count()
    .orderBy(desc("count"))
    .limit(10)
)
journal_pub_count.display()


name,count
ChemInform,386
Nature,302
ArXiv,242
Science,217
British Medical Journal,208
PLoS ONE,178
The Lancet,150
Reactions Weekly,148
Scientific Reports,146
SSRN Electronic Journal,137


Databricks visualization. Run in Databricks to view.

In [0]:
#question 4

# One row per (paper, author) pair; each row holds a single author entry
df_exploded = publications_df.select(explode(col("authors")).alias("authors"))

# Number of papers per author entry
authors_publications = df_exploded.groupBy("authors").count()

# The five authors with the most papers
top_authors = authors_publications.sort(desc("count")).limit(5)

top_authors.display()

authors,count
"List(2149377746, B. Noble)",23
"List(90537224, S. Sukhoruchkin)",16
"List(88842366, Z. Soroko)",16
"List(49898687, M. Kumar)",15
"List(49611617, M. Jain)",10


Databricks visualization. Run in Databricks to view.

In [0]:
#Question 5 - comparing 2 columns with journal names to see which one contains more data
#this was done to decide which column is better to use when joining datasets

# Count the distinct values in each candidate column. No null/empty filtering
# is applied here, so any null or empty-string entries count as values too.
journal_name_count = publications_df.select(col("journal.name")).distinct().count()
venue_count = publications_df.select(col("venue")).distinct().count()

# These figures are distinct-value counts, not column counts, so label them as such
print("Number of distinct values in journal.name column:", journal_name_count)
print("Number of distinct values in venue column:", venue_count)

Number of columns in journal.name column: 33918
Number of columns in venue column: 21267


In [0]:
#question 5

# Pair every paper's journal name with its author list
selected_columns = publications_df.select(col("journal.name").alias("name"), "authors")

# Inner join: keep only papers whose journal name appears in the journal table
joined_df = selected_columns.join(journal_info, selected_columns.name == journal_info.journal_name)

# One row per (author, impact factor of the paper's journal).
# journal_info is read from CSV without a schema, so IF is a string column;
# cast it to double explicitly instead of relying on implicit coercion in sum().
authors_df = joined_df.select(
    explode("authors").alias("author"),
    col("IF").cast("double").alias("IF"),
)

# Sum the impact factors per author and keep the five largest totals
author_if = authors_df.groupBy("author").agg({"IF": "sum"}).withColumnRenamed("sum(IF)", "cumulative_impact_factor")
author_if = author_if.orderBy(col("cumulative_impact_factor").desc()).limit(5)
author_if.display()

author,cumulative_impact_factor
"List(2155504929, Ying Li)",93.832
"List(144797099, M. Viana)",92.238
"List(5152451, L. Andrade)",92.238
"List(49900836, H. Wood)",90.422
"List(7695437, A. M. Ruscio)",87.899


Databricks visualization. Run in Databricks to view.

In [0]:
#question 6

# Pair every paper's journal name with its publication year
selected_columns_for_q6 = publications_df.select(col("journal.name").alias("name"), "year")

# Left join against the journal table; papers without a matching journal get a
# null IF and are removed by the IF filter below
joined_df = selected_columns_for_q6.join(journal_info, selected_columns_for_q6.name == journal_info.journal_name, "left")

# Keep papers in journals with impact factor >= 1.
# journal_info is read from CSV without a schema, so IF is a string column;
# cast it to double explicitly so the comparison is numeric by construction
# rather than dependent on implicit string-to-number coercion.
filtered_df = joined_df.filter(col("IF").cast("double") >= 1)

# Publications per year, restricted to 2010-2020 inclusive, newest year first
publications_count = filtered_df.groupBy("year").count()
publications_count = publications_count.withColumn("year", publications_count["year"].cast("integer"))
publications_count = publications_count.where((publications_count.year >= 2010) & (publications_count.year <= 2020)).orderBy(desc("year"))
publications_count.display()



year,count
2020,444
2019,396
2018,365
2017,329
2016,283
2015,244
2014,242
2013,178
2012,165
2011,139


Databricks visualization. Run in Databricks to view.