# Ejemplos programación RDDs Spark Core

In [1]:
import os

# On a personal computer, if the JAVA_HOME variable is not defined, it must be set here:
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"

# On the teaching-lab machines, the following value is required instead:
# os.environ["JAVA_HOME"] = "/usr/"

# Look the variable up with .get() so that a missing JAVA_HOME yields None (the cell
# then shows no output) instead of aborting the notebook with a KeyError. If nothing
# is displayed, uncomment one of the assignments above and re-run this cell.
java_home = os.environ.get("JAVA_HOME")
java_home

'/home/maes/.sdkman/candidates/java/current'

**Import libraries, check versions**

In [2]:
import pyspark

# Report which PySpark release this kernel is importing.
pyspark_version = pyspark.__version__
print(pyspark_version)

3.5.3


In [3]:
%%bash
# Print the version of the `java` executable that the shell finds on PATH.
# NOTE(review): this may not be the same JVM that JAVA_HOME points to - confirm if they differ.
java -version

openjdk version "21.0.4" 2024-07-16 LTS
OpenJDK Runtime Environment Temurin-21.0.4+7 (build 21.0.4+7-LTS)
OpenJDK 64-Bit Server VM Temurin-21.0.4+7 (build 21.0.4+7-LTS, mixed mode, sharing)


## Opening SparkSession

In [4]:
from pyspark.sql import SparkSession

# Configure the session step by step: local mode with master "local[*]",
# under the application name given below.
session_builder = SparkSession.builder.master("local[*]")
# session_builder = session_builder.config("spark.driver.cores", 1)  # optional setting, left disabled
session_builder = session_builder.appName("101 Spark DataFrames")

# getOrCreate() returns the already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

# Keep a handle on the SparkContext: the RDD examples below are created through `sc`.
sc = spark.sparkContext

# Leave the session as the last expression so Jupyter renders its summary.
spark

24/11/18 01:17:30 WARN Utils: Your hostname, maes-GE72-7RE resolves to a loopback address: 127.0.1.1; using 192.168.1.58 instead (on interface enp3s0)
24/11/18 01:17:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 01:17:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/18 01:17:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Operaciones de bajo nivel

In [5]:
# Sample hashtags (with repetitions) used to demonstrate low-level RDD operations.
hashtags = [
    "#epicfail", "#hadoop", "#rstats", "#rstudio", "#rstats", "#spark",
    "#hadoop", "#hdfs", "#hadoop", "#oreilly", "#spark", "#python",
    "#spark", "#scala", "#spark", "#strataconf", "#strataconf", "#oreilly",
    "#spark", "#databricks", "#hadoop", "#hdfs", "#spark", "#hdfs",
]

# Distribute the local list as an RDD split into 4 partitions.
rdd = sc.parallelize(hashtags, 4)

In [6]:
# Check how many partitions the RDD is split into.
rdd.getNumPartitions()

4

In [7]:
# Build an RDD holding each hashtag only once.
rdd_distinct = rdd.distinct()

# toDebugString() returns the RDD lineage as bytes; decode it to text before printing.
lineage = rdd_distinct.toDebugString()
print(lineage.decode("utf8"))

(4) PythonRDD[5] at RDD at PythonRDD.scala:53 []
 |  MapPartitionsRDD[4] at mapPartitions at PythonRDD.scala:160 []
 |  ShuffledRDD[3] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(4) PairwiseRDD[2] at distinct at /tmp/ipykernel_71389/813733018.py:1 []
    |  PythonRDD[1] at distinct at /tmp/ipykernel_71389/813733018.py:1 []
    |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289 []


In [8]:
# Printing the RDD object shows only its description, not the elements it contains.
print(rdd_distinct)

PythonRDD[5] at RDD at PythonRDD.scala:53


In [9]:
# collect() is an action: it runs the job and returns all elements to the driver as a Python list.
print(rdd_distinct.collect())

[Stage 0:>                                                          (0 + 4) / 4]

['#hdfs', '#oreilly', '#databricks', '#rstudio', '#spark', '#scala', '#strataconf', '#epicfail', '#hadoop', '#rstats', '#python']


                                                                                

## Contador de palabras

In [10]:
# Read the text file as an RDD of lines; the second argument (4) is the minimum number of partitions requested.
rdd = sc.textFile("data/wordcount_data.txt", 4)

In [11]:
# Number of elements in the RDD, i.e. lines read from the file.
rdd.count()

44

In [12]:
# Peek at the first 5 lines without bringing the whole RDD to the driver.
rdd.take(5)

['word count from Wikipedia the free encyclopedia',
 'the word count is the number of words in a document or passage of text Word counting may be needed when a text',
 'is required to stay within certain numbers of words This may particularly be the case in academia legal',
 'proceedings journalism and advertising Word count is commonly used by translators to determine the price for',
 'the translation job Word counts may also be used to calculate measures of readability and to measure typing']

In [13]:
# Word count in three steps over the RDD of lines:
# 1) split every line on whitespace and flatten the pieces into a single RDD of words
words = rdd.flatMap(lambda text_line: text_line.split())
# 2) pair each word with an initial count of 1
word_ones = words.map(lambda token: (token, 1))
# 3) add up the counts that share the same word
rdd_word_count = word_ones.reduceByKey(lambda left, right: left + right)

In [14]:
# Print the lineage of the word-count RDD; toDebugString() returns bytes, hence the decode.
print(rdd_word_count.toDebugString().decode("utf8"))

(4) PythonRDD[14] at RDD at PythonRDD.scala:53 []
 |  MapPartitionsRDD[13] at mapPartitions at PythonRDD.scala:160 []
 |  ShuffledRDD[12] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(4) PairwiseRDD[11] at reduceByKey at /tmp/ipykernel_71389/3516608846.py:4 []
    |  PythonRDD[10] at reduceByKey at /tmp/ipykernel_71389/3516608846.py:4 []
    |  data/wordcount_data.txt MapPartitionsRDD[7] at textFile at NativeMethodAccessorImpl.java:0 []
    |  data/wordcount_data.txt HadoopRDD[6] at textFile at NativeMethodAccessorImpl.java:0 []


In [15]:
# Show 10 (word, count) pairs from the result.
rdd_word_count.take(10)

[('free', 1),
 ('of', 25),
 ('in', 11),
 ('counting', 6),
 ('may', 8),
 ('when', 2),
 ('numbers', 1),
 ('particularly', 1),
 ('used', 4),
 ('job', 1)]

In [16]:
# Check how many partitions the word-count RDD has after reduceByKey.
rdd_word_count.getNumPartitions()

4

In [17]:
# NOTE(review): this cell is identical to the previous one - confirm whether a different RDD was meant here.
rdd_word_count.getNumPartitions()

4

In [18]:
# Load the same file again, this time without requesting a minimum number of partitions.
rdd2 = sc.textFile("data/wordcount_data.txt")

In [19]:
%%timeit -r1 -n1
# Timed statement: the complete word-count pipeline on rdd2, ending in the take(10)
# action that actually triggers the computation.
# The result is intentionally not assigned: a name bound inside a %%timeit body is
# local to the timed code and never reaches the notebook namespace, and take(10)
# returns a plain Python list, so calling it `rdd_word_count` would be misleading.
(rdd2
.flatMap(lambda line: line.split())
.map(lambda word: (word, 1))
.reduceByKey(lambda x, y: x + y)
.take(10))

624 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Ejercicio WordCount - Tweets

En este ejercicio realizarás un conteo de palabras sobre el texto de una colección de tweets almacenada en `data/15m-sample.json`.
- Los tweets tienen un campo `text` con el texto del tweet
- Es posible que necesites la librería `json` para leer correctamente un tweet
- Deberán devolverse las 5 palabras más repetidas (buscar la operación más adecuada en la [documentación](https://spark.apache.org/docs/latest/api/python/reference/pyspark.html))

In [22]:
# Stop the SparkSession (and its underlying SparkContext) once the notebook is finished.
spark.stop()