<a href="https://colab.research.google.com/github/frCheval/DATATHON-CPE-LYON-2023/blob/main/Datathon_profiling_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuration du Notebook

Install Apache Spark and the Archives Unleashed Toolkit (AUT).

In [1]:
# Environment switch: set to True when running in Amazon SageMaker.
# The setup cell below reads this flag to decide how Java is installed
# (amazon-linux-extras when True, apt-get otherwise) and which JAVA_HOME is exported.
SAGEMAKER = False

In [2]:
%%capture

import os

APPS_HOME      = os.getcwd() + "/apps"

SPARK_VERSION  = "3.0.0"
HADOOP_VERSION = "2.7"
AUT_VERSION    = "0.91.0"
JAVA_VERSION   = "11"

SPARK_HADOOP_VERSION = "spark-{}-bin-hadoop{}".format(SPARK_VERSION, HADOOP_VERSION)

if SAGEMAKER:
    !sudo amazon-linux-extras install java-openjdk11 -y
    os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-11.0.16.0.8-1.amzn2.0.1.x86_64"
else:
    !apt-get install openjdk-"$JAVA_VERSION"-jdk-headless
    os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-{}-openjdk-amd64".format(JAVA_VERSION)

!pip install -q findspark

!wget https://archive.apache.org/dist/spark/spark-"$SPARK_VERSION"/"$SPARK_HADOOP_VERSION".tgz
!wget https://github.com/archivesunleashed/aut/releases/download/aut-"$AUT_VERSION"/aut-"$AUT_VERSION".zip
!wget https://github.com/archivesunleashed/aut/releases/download/aut-"$AUT_VERSION"/aut-"$AUT_VERSION"-fatjar.jar

!tar -xf "$SPARK_HADOOP_VERSION".tgz
!mkdir -p "$APPS_HOME"
!mv spark-* aut-* "$APPS_HOME"

!rm -rf sample_data "$APPS_HOME"/"$SPARK_HADOOP_VERSION".tgz

# Spark init

Initialize Spark as a single-node cluster and configure PySpark to load the Archives Unleashed Toolkit (AUT).

In [3]:
import os
import findspark

# Memory given to the Spark driver JVM (passed through --driver-memory below).
SPARK_DRIVER_MEMORY = "8g"

# Export both variables in one step:
#   SPARK_HOME          -> the unpacked Spark distribution under APPS_HOME
#   PYSPARK_SUBMIT_ARGS -> driver memory plus the AUT fat jar (--jars) and the
#                          AUT Python package (--py-files)
os.environ.update({
    "SPARK_HOME": "{}/{}".format(APPS_HOME, SPARK_HADOOP_VERSION),
    "PYSPARK_SUBMIT_ARGS": '--driver-memory {0} --jars {2}/aut-{1}-fatjar.jar --py-files {2}/aut-{1}.zip pyspark-shell'.format(
        SPARK_DRIVER_MEMORY, AUT_VERSION, APPS_HOME
    ),
})

# Must run after the environment variables above are set.
findspark.init()

In [4]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, desc, udf
from pyspark.sql.types import StringType

# Get the running SparkContext, or start one if none exists yet, and wrap it in
# the SQLContext that the WebArchive calls below take as an argument.
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext=sc)

# Last expression of the cell: renders the SparkContext summary.
sc

# Téléchargement du dataset

In [5]:
%%capture
DIR="LIFRANUM"
!mkdir -p $DIR

#!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/autre $DIR
#!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/cartoweb $DIR
#!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/lifranum-method $DIR
!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/repo-ecritures-num $DIR

# Analyse du dataset

## Searching, ranking and grouping “Domains” within the data collections

In [6]:
# Star import of the AUT Python package. `WebArchive`, used by the cells below,
# is not imported anywhere else, so it is expected to come from here.
from aut import *

# Glob pattern for the WARC files of the "repo-ecritures-num" collection
# downloaded into the LIFRANUM directory; passed to WebArchive in the cells below.
WARCs_path = "LIFRANUM/repo-ecritures-num/*.warc*"

In [9]:
%%capture
!pip install tldextract

In [10]:
import tldextract
from pyspark.sql.functions import desc

@udf("string")
def extract_tld(s):
    """Spark UDF: return the `suffix` part that tldextract finds in the string `s`."""
    parts = tldextract.extract(s)
    return parts.suffix


# Count the archived web pages per TLD and show the 10 most frequent ones,
# without truncating the column values.
(
    WebArchive(sc, sqlContext, WARCs_path)
    .webpages()
    .select(extract_tld("url").alias("tld"))
    .groupBy("tld")
    .count()
    .sort(desc("count"))
    .show(10, False)
)

+----+-----+
|tld |count|
+----+-----+
|com |7560 |
|fr  |736  |
|net |520  |
|ca  |74   |
|blog|22   |
|org |6    |
+----+-----+



In [None]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Share (in %) of archived web pages per TLD.
# Reuses the `extract_tld` UDF defined in the previous cell instead of
# redefining it here.
tld_counts = (
    WebArchive(sc, sqlContext, WARCs_path)
    .webpages()
    .select(extract_tld("url").alias("tld"))
    .groupBy("tld")
    .count()
)

# A window with an empty partitionBy() spans every row, so the sum over it is
# the total page count. It is applied after the groupBy, i.e. on one row per TLD.
all_tlds = Window.partitionBy()

(
    tld_counts
    .withColumn(
        "percent",
        F.round(100 * F.col("count") / F.sum("count").over(all_tlds), 2),
    )
    .sort(F.desc("count"))
    .show(10, False)
)