In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
import numpy as np
import warnings

In [2]:
sc = SparkContext(master='local[1]')
spark = SparkSession.builder.appName('Test').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/17 12:36:29 WARN Utils: Your hostname, promo-ds4-gra9-10, resolves to a loopback address: 127.0.1.1; using 51.91.85.76 instead (on interface ens3)
25/11/17 12:36:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/17 12:36:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#  1. Arbres de Paris

## Load Data

In [3]:
df = spark.read.csv("../data/arbresremarquablesparis.csv", sep=";", header=True)

                                                                                

In [4]:
#df.show()

## Some stats...

In [6]:
age = df.agg({"Année de plantation": "min"}).collect()[0]
print(f"Age de l'arbre le plus ancien : {2025-int(age['min(Année de plantation)'])} ans.")

Age de l'arbre le plus ancien : 423 ans.


In [5]:
# Convert to float
df = df.withColumn("hauteur en m", df["hauteur en m"].astype("float"))
df = df.withColumn("circonference en cm", df["circonference en cm"].astype("float"))

# Get volume of tallest tree
df = df.withColumn("volume", np.pi * ((df["circonference en cm"]/(100 * 2 * np.pi)) ** 2 * df["hauteur en m"]))
df.agg(sf.max_by("volume", "hauteur en m")).show()

+----------------------------+
|max_by(volume, hauteur en m)|
+----------------------------+
|          102.49677806957492|
+----------------------------+



In [7]:
df.groupBy("genre").mean().select(["genre", "avg(hauteur en m)"]).orderBy("avg(hauteur en m)", ascending=False).show()

+--------------+------------------+
|         genre| avg(hauteur en m)|
+--------------+------------------+
|Sequoiadendron|              29.4|
|      Platanus|28.955555555555556|
|      Taxodium|             28.75|
|    Pterocarya|26.285714285714285|
|       Juglans|              26.0|
|         Tilia|              23.0|
|  Liriodendron|             22.75|
|      Aesculus| 21.22222222222222|
|        Ginkgo|            21.125|
|      Fraxinus|              21.0|
|    Calocedrus|              21.0|
|         Pinus|              21.0|
|       Zelkova|             20.25|
|        Betula|              20.0|
|       Sequoia|              20.0|
|         Alnus|              20.0|
|   Metasequoia|              19.0|
|          Acer|              18.8|
|        Cedrus|             18.25|
|       Corylus|              17.5|
+--------------+------------------+
only showing top 20 rows


In [8]:
df.groupby("arrondissement2").count().orderBy("count", ascending=False).show()

+---------------+-----+
|arrondissement2|count|
+---------------+-----+
|             16|   52|
|             12|   35|
|             20|   13|
|             19|   12|
|             18|   10|
|             14|   10|
|              7|    9|
|              5|    7|
|             17|    7|
|             15|    6|
|              8|    6|
|              4|    6|
|             13|    4|
|              3|    3|
|              9|    2|
|            1er|    1|
|             11|    1|
|           NULL|    1|
|             10|    1|
+---------------+-----+



In [10]:
nb_trees_cimetiere = df.filter(df["adresse"].contains("CIMETIERE DU PERE LACHAISE")).count()
print(f"Il y a {nb_trees_cimetiere} arbres remarquables au cimetière du père lachaise.\n\n")

Il y a 9 arbres remarquables au cimetière du père lachaise.




# 2. Beautiful Stories

## Load Data

In [65]:
textFile = sc.textFile("../data/beautiful_stories.txt")

## Most frequent words

In [92]:
word_counts = textFile.flatMap(lambda x: x.split(" "))
word_counts = word_counts.map(lambda x:x.strip('“".?/,;-:'))
word_counts = word_counts.map(lambda x:x.lower()).countByValue()

word_counts = sorted(
    word_counts.items(),
    key=lambda x:x[1],
    reverse=True
    )

In [96]:
word_counts[:6]

[('', 9077),
 ('the', 2348),
 ('and', 1984),
 ('to', 1511),
 ('of', 1213),
 ('a', 993)]

## Parse stories

**Note :** last question is skipped.

In [117]:
with open("../data/beautiful_stories.txt") as f:
    textFile = f.read()
expression = "\n\n\n\n\n"
textFile = textFile.split(expression)
textFile[10]

"THE WINTER'S TALE\n\n\n\nLeontes was the King of Sicily, and his dearest friend was Polixenes,\nKing of Bohemia. They had been brought up together, and only separated\nwhen they reached man's estate and each had to go and rule over\nhis kingdom. After many years, when each was married and had a son,\nPolixenes came to stay with Leontes in Sicily.\n\nLeontes was a violent-tempered man and rather silly, and he took it into\nhis stupid head that his wife, Hermione, liked Polixenes better than\nshe did him, her own husband. When once he had got this into his head,\nnothing could put it out; and he ordered one of his lords, Camillo, to\nput a poison in Polixenes' wine. Camillo tried to dissuade him from this\nwicked action, but finding he was not to be moved, pretended to consent.\nHe then told Polixenes what was proposed against him, and they fled from\nthe Court of Sicily that night, and returned to Bohemia, where Camillo\nlived on as Polixenes' friend and counselor.\n\nLeontes threw the