In [25]:
import sys
import pyspark.sql
from pyspark.sql import SparkSession
import findspark
findspark.init() 
from pyspark import SparkContext
from pyspark.ml import feature, classification
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Fichiers nécessaires si pas déjà dans le répertoire :

In [2]:
#!wget https://s3-eu-west-1.amazonaws.com/course.oc-static.com/courses/4297166/agents.json
#!wget http://classics.mit.edu/Homer/iliad.mb.txt
#!wget http://classics.mit.edu/Homer/odyssey.mb.txt

In [3]:
# Initialiser une session Spark
spark = SparkSession \
    .builder \
    .appName("Spark tests") \
    .getOrCreate()

# Commandes de base

## json et dataframes

In [3]:
# Lire le json, voir la structure
agents = spark.read.json("agents.json")
agents.printSchema()

root
 |-- country_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- sex: string (nullable = true)



In [4]:
# Voir les5 premières lignes
agents.show(5)

+------------+----------+------------------+------------------+------+
|country_name|        id|          latitude|         longitude|   sex|
+------------+----------+------------------+------------------+------+
|       China| 227417393| 33.15219798270325|100.85840672174572|  Male|
|       Haiti|6821129477|19.325567983697297|-72.43795260265814|Female|
|       India|2078667700|23.645271492037235| 80.85636526088884|Female|
|       China| 477556555| 33.45864668881662| 93.33604038078953|Female|
|       India|1379059984|28.816938290678692|  80.7728698035823|Female|
+------------+----------+------------------+------------------+------+
only showing top 5 rows



In [5]:
# Filtrer sur plusieurs critères
agents.filter(agents.sex=="Female") \
    .filter(agents.latitude>20) \
    .show(5)

+------------+----------+------------------+------------------+------+
|country_name|        id|          latitude|         longitude|   sex|
+------------+----------+------------------+------------------+------+
|       India|2078667700|23.645271492037235| 80.85636526088884|Female|
|       China| 477556555| 33.45864668881662| 93.33604038078953|Female|
|       India|1379059984|28.816938290678692|  80.7728698035823|Female|
|       India|1375733494|22.385712662257426| 77.90320433636231|Female|
|       China| 265404548|29.447948266668902|106.56441719305467|Female|
+------------+----------+------------------+------------------+------+
only showing top 5 rows



In [6]:
# Ordonner, montrer les 3 premiers
agents.orderBy(agents.longitude) \
    .limit(3) \
    .show()

+----------------+----------+-------------------+-------------------+------+
|    country_name|        id|           latitude|          longitude|   sex|
+----------------+----------+-------------------+-------------------+------+
|French Polynesia|7170821229|-15.004219445056265|-140.01650828107668|  Male|
|   United States|2663608288|  38.34123451380646|-115.14211841078837|  Male|
|   United States|2651719771|  37.05792617608187|-109.88579232396816|Female|
+----------------+----------+-------------------+-------------------+------+



In [8]:
# Création de vue pour utiliser les requêtes SQL
agents.createTempView("agents_view")

In [9]:
spark.sql("SELECT * FROM agents_view ORDER BY longitude LIMIT 3") \
    .show()

+----------------+----------+-------------------+-------------------+------+
|    country_name|        id|           latitude|          longitude|   sex|
+----------------+----------+-------------------+-------------------+------+
|French Polynesia|7170821229|-15.004219445056265|-140.01650828107668|  Male|
|   United States|2663608288|  38.34123451380646|-115.14211841078837|  Male|
|   United States|2651719771|  37.05792617608187|-109.88579232396816|Female|
+----------------+----------+-------------------+-------------------+------+



In [10]:
# Convertir en rdd
agents.rdd

MapPartitionsRDD[34] at javaToPython at NativeMethodAccessorImpl.java:0

In [11]:
# Montrer la première ligne
agents.rdd.first()

Row(country_name='China', id=227417393, latitude=33.15219798270325, longitude=100.85840672174572, sex='Male')

In [13]:
first_row = agents.rdd.first()
print(first_row.country_name, first_row.id)

China 227417393


In [15]:
# Création et ajout d'une nouvelle ligne au rdd
pyspark.sql.Row(country_name="Ambre", id=000, latitude=0, longitude=0, sex="Undefined")

Row(country_name='Ambre', id=0, latitude=0, longitude=0, sex='Undefined')

## Utilisation d'un fichier texte

In [18]:
# Transformation d'un ficier texte en rdd
iliad_rdd = spark.sparkContext.textFile("iliad.mb.txt") \
    .flatMap(lambda line: line.split()) \
    .map(lambda word: word.strip(",;.?:/!\"'~()-"))

In [19]:
# Conversion des objets contenus dans le RDD en rows
iliad_rows = iliad_rdd.map(lambda word: pyspark.sql.Row(word=word))

In [20]:
iliad_rows

PythonRDD[39] at RDD at PythonRDD.scala:53

In [21]:
# Création d'un df à partir des rows
iliad = spark.createDataFrame(iliad_rows)
iliad.show(5)

+--------+
|    word|
+--------+
|Provided|
|      by|
|     The|
|Internet|
|Classics|
+--------+
only showing top 5 rows



In [26]:
# les 10 derniers mots différents (distinct = .unique()) lorsque classés par ordre alphabétique
iliad.distinct() \
    .orderBy("word", ascending=False) \
    .show(10)

+----------+
|      word|
+----------+
|      zeal|
|    youths|
|     youth|
| youselves|
|yourselves|
|  yourself|
|     yours|
|   yourelf|
|      your|
|  youngest|
+----------+
only showing top 10 rows



# NLP avec l'Iliade et l'Odyssé

But : A partir d'une ligne de texte , prédire si elle appartient à l'Iliade ou l'Odyssé.

Algo : classification supervisée binaire.

In [4]:
# Lecture de l'Iliade en lignes
iliad_lines = spark.sparkContext.textFile("iliad.mb.txt") \
        .map(lambda line: line.split()) \
        .map(lambda words: [w.strip(",;.?:/!\"'~()-") for w in words])

In [5]:
for line in iliad_lines.takeSample(False, 5):
    print(line)

['one', 'Antilochus', 'speared', 'Atymnius', 'driving', 'the', 'point', 'of', 'the', 'spear']
['for', 'you', 'are', 'foremost', 'at', 'all', 'times', 'alike', 'in', 'fight', 'and', 'counsel', 'hold']
['go', 'to', 'Ilius', 'and', 'tell', 'the', 'old', 'men', 'of', 'our', 'council', 'and', 'our', 'wives', 'to', 'pray']
['scare', 'me', 'as', 'though', 'I', 'were', 'a', 'child', 'I', 'too', 'if', 'I', 'will', 'can', 'brag', 'and']
['half', 'true', 'and', 'the', 'other', 'lies', 'as', 'rage', 'inspires', 'them', 'No', 'words', 'of', 'yours']


In [6]:
# Idem pour l'Odyssé
odyssey_lines = spark.sparkContext.textFile("odyssey.mb.txt") \
        .map(lambda line: line.split()) \
        .map(lambda words: [w.strip(",;.?:/!\"'~()-") for w in words])

In [7]:
# Labéliser nos lignes (0 = Iliade, 1 = Odyssé) et transformer en rows
iliad_labeled = iliad_lines.map(lambda words: pyspark.sql.Row(label=0, words=words))
odyssey_labeled = odyssey_lines.map(lambda words: pyspark.sql.Row(label=1, words=words))

In [8]:
# Union des rdd labélisés
data = spark.createDataFrame(iliad_labeled.union(odyssey_labeled))

On va faire un bag of word de nos mots.

In [10]:
vectorizer = feature.CountVectorizer(inputCol="words", outputCol="bow") \
    .fit(data)
features = vectorizer.transform(data)

In [11]:
features.show(10)

+-----+--------------------+--------------------+
|label|               words|                 bow|
+-----+--------------------+--------------------+
|    0|[Provided, by, Th...|(10915,[35,72,370...|
|    0|[See, bottom, for...|(10915,[12,36,104...|
|    0|[http://classics....|(10915,[9874],[1.0])|
|    0|                  []|       (10915,[],[])|
|    0|        [The, Iliad]|(10915,[72,10762]...|
|    0|         [By, Homer]|(10915,[1613,6680...|
|    0|                  []|       (10915,[],[])|
|    0|                  []|       (10915,[],[])|
|    0|[Translated, by, ...|(10915,[35,5188,5...|
|    0|                  []|       (10915,[],[])|
+-----+--------------------+--------------------+
only showing top 10 rows



In [14]:
# Séparation en train et test
train, test = features.randomSplit([0.75, 0.25])

In [15]:
# On prend un classifieur Naive Bayes et on le fit sur le train
clf = classification.NaiveBayes(labelCol="label", featuresCol="bow", predictionCol="target_predicted") \
    .fit(train)

In [16]:
# Predict sur le test
predicted = clf.transform(test)

In [24]:
# Evaluation avec l'accuracy
accuracy = predicted.filter(predicted.target_predicted==predicted.label).count() / float(predicted.count())
print("Accuracy", accuracy)

Accuracy 0.7770646850194224


In [26]:
# ou encore
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="target_predicted", metricName="accuracy")
accuracy = evaluator.evaluate(predicted)

print(f"Accuracy = {accuracy:.2f}")

Accuracy = 0.78
