# Installing the necessary dependencies

In [0]:
# Java 8 runtime required by Spark 2.4.x (output silenced)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download Spark 2.4.5 from the permanent Apache archive: regular mirrors only
# keep current releases, so a mirror URL for an old version is not stable.
# -nc skips the download when the tarball is already present (safe to re-run).
!wget -q -nc https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# Unpack into the working directory (creates ./spark-2.4.5-bin-hadoop2.7)
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
# findspark is imported by the setup cell below
!pip install -q findspark

# Find Spark and import necessary modules

In [0]:
import os
import findspark
# Export the JDK and Spark locations before calling findspark.init()
# (presumably findspark reads SPARK_HOME to locate pyspark - confirm in its docs).
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})
findspark.init()


# pyspark imports are kept in this cell, after the findspark.init() call above.
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType, StructType, StructField, StringType

# Local Spark session using all available cores
spark = SparkSession.builder.master("local[*]").getOrCreate()

# SQLContext expects a SparkContext as its first argument (not a SparkSession);
# the existing session is passed explicitly so both objects share the same state.
sqlContext = SQLContext(spark.sparkContext, spark)

# Creating the DataFrames for the Wikipedia dataset

In [0]:
# Read one title per line and pair each with a 1-based index: (index, title)
pages_rdd = (
    spark.sparkContext
    .textFile('./titles-sorted.txt')
    .zipWithIndex()
    .map(lambda pair: (pair[1] + 1, pair[0]))
)
# Schema of the pages DataFrame: integer link id plus the page title
pages_df_schema = StructType([
    StructField(name="Link", dataType=IntegerType()),
    StructField(name="Title", dataType=StringType()),
])
# Creates the DataFrame for pages
pages_df = spark.createDataFrame(pages_rdd, schema=pages_df_schema)

In [0]:
# Each input line is split on ':' into two columns: the text before the colon
# (_c0) and the text after it (_c1).
links_df = (
    spark.read.csv('./100000_links-simple-sorted.txt.gz', sep=':', inferSchema=True)
    # Rename to 'From' (source link) and 'To' (its outlinks)
    .select(f.col('_c0').alias('From'), f.col('_c1').alias('To'))
    # Remove leading and trailing spaces from both columns
    .withColumn('From', f.trim(f.col('From')))
    .withColumn('To', f.trim(f.col('To')))
    # 'From' becomes an integer; 'To' stays a string
    .withColumn('From', f.col('From').cast(IntegerType()))
)

# Q2A

In [0]:
# Q2A: a left anti join keeps only the pages_df rows whose 'Link' has no
# matching value in the 'From' column of links_df.
no_outlnks = pages_df.join(
    links_df,
    on=pages_df['Link'] == links_df['From'],
    how='left_anti',
)
print('Q2A: Pages with no Outlinks')
no_outlnks.show()

Q2A: Pages with no Outlinks
+-----+--------------------+
| Link|               Title|
+-----+--------------------+
|   21|      !?Revolution!?|
| 3726|...And_Oceans_vs....|
| 4401|        .458_Express|
| 6054|                  0.|
| 6072|           0.7_FILMS|
| 6142|                 001|
| 6786|  0_(disambiguation)|
| 8949|100_Days_(1999_film)|
| 9692|    102.5_The_Bridge|
|12672|   110th_US_Congress|
|15205|121st_Pennsylvani...|
|19060|  13th_Finance_Group|
|20915|14th_Bombardment_...|
|26236|             1710_AM|
|28202|           17_Relics|
|38781|1931-32_Southern_...|
|39187|1933_Baffin_Bay_e...|
|41856|  1946–47_OHA_season|
|58618|       1988_election|
|59758|    1990_Kremlin_Cup|
+-----+--------------------+
only showing top 20 rows



# Q2B

In [0]:
# Build one (From, To) row per individual link:
#   1. split the space-separated 'To' string into a list,
#   2. explode that list so every target gets its own row,
#   3. cast the exploded 'To' value to an integer.
# explode is kept as a top-level select expression and the cast is applied in
# a separate step, as Spark does not allow generators nested in expressions.
inlinks_df = (
    links_df
    .select(f.col('From'), f.explode(f.split(f.col('To'), ' ')).alias('To'))
    .withColumn('To', f.col('To').cast(IntegerType()))
)

In [0]:
# Q2B: a left anti join keeps only the pages_df rows whose 'Link' never
# appears in the 'To' column of inlinks_df.
no_inlinks = pages_df.join(
    inlinks_df,
    on=pages_df['Link'] == inlinks_df['To'],
    how='left_anti',
)
print('Q2B: Pages with no Inlinks')
no_inlinks.show()

Q2B: Pages with no Inlinks
+----+--------------------+
|Link|               Title|
+----+--------------------+
|   1|                   !|
|   4|                !!!!|
|   6|!!!Fuck_You!!!_An...|
|   7|!!!Fuck_You!!!_An...|
|  10|          !!!_(band)|
|  11|  !!Destroy-Oh-Boy!!|
|  12|        !!Fuck_you!!|
|  13|                 !!M|
|  14|!!Que_Corra_La_Voz!!|
|  15|          !!_(chess)|
|  16|                 !!m|
|  17| !'O-!khung_language|
|  18|                  !=|
|  19|                  !?|
|  20|                 !?!|
|  21|      !?Revolution!?|
|  22|          !?_(chess)|
|  23|          !A_Luchar!|
|  24|       !Action_Pact!|
|  25|      !Adios_Amigos!|
+----+--------------------+
only showing top 20 rows



# Q2C

In [0]:
# Q2C: popularity is defined here as the number of inlinks a page receives.
# Count the rows of inlinks_df per 'To' value, then sort by that count from
# highest to lowest.
inlink_counts = inlinks_df.groupBy('To').count()
popularity_df = inlink_counts.orderBy(f.desc('count'))
print('Q2C: Most Popular Pages')
popularity_df.show()

Q2C: Most Popular Pages
+-------+-----+
|     To|count|
+-------+-----+
|  88822|10860|
| 481424| 7132|
|3766300| 6872|
|5302153| 6680|
|3283594| 6367|
|2367662| 6225|
|1613751| 6194|
|3078798| 6070|
|3294332| 6055|
| 192438| 6006|
|3766108| 5930|
|3014548| 5844|
|4080285| 5830|
| 419148| 5769|
|3766260| 5766|
| 437972| 5690|
|3047022| 5679|
|4747397| 5640|
|3391195| 5455|
|3009165| 5015|
+-------+-----+
only showing top 20 rows

