# About Dataframes
A powerful abstaction for distributed and structured data.
A structure similar to a table in a database.
Contains schema information.

Because schema info is included, dataframes enable query optimizations and predicate pushdowns; better performances than that of RDD - leverages Spark's Catalyst optimizer.

Dataframes offer a SQL-like approach, seamless integration into SparkSQL, MLib, GraphX

Dataframes are convertable to other formats such as Pandas

In [1]:
# set env
import os
os.environ['SPARK_HOME'] = "/home/cloud_user/apps/spark/current"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [2]:
# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [3]:
# Create SparkSession
spark = SparkSession.builder \
    .appName("dataframes-tutorial") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/25 09:23:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
rdd = spark.sparkContext.textFile("data/st-augustine-on-christian-doctrine.txt")
result_rdd = rdd.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], ascending=False)

                                                                                

In [8]:
print(f"Top Ten: {result_rdd.take(100)}")

Top Ten: [('', 18382), ('the', 4212), ('of', 2743), ('to', 2412), ('and', 1828), ('in', 1497), ('is', 1358), ('that', 1221), ('a', 917), ('not', 815), ('it', 794), ('be', 712), ('as', 661), ('are', 646), ('which', 614), ('he', 576), ('by', 558), ('they', 473), ('for', 461), ('with', 459), ('we', 455), ('or', 422), ('who', 413), ('but', 399), ('this', 368), ('I', 348), ('when', 333), ('have', 331), ('And', 324), ('what', 310), ('from', 309), ('if', 283), ('For', 281), ('so', 281), ('his', 278), ('their', 266), ('all', 262), ('one', 260), ('our', 255), ('these', 229), ('was', 225), ('on', 223), ('them', 208), ('those', 208), ('man', 201), ('may', 191), ('has', 184), ('an', 170), ('men', 168), ('__________________________________________________________________', 166), ('things', 165), ('at', 164), ('more', 163), ('any', 159), ('no', 156), ('do', 155), ('But', 153), ('even', 150), ('Chapter', 150), ('other', 149), ('God', 147), ('were', 141), ('there', 140), ('He', 137), ('will', 133), ('

In [10]:
# now do word frequency using dataframes
df = spark.read.text("data/st-augustine-on-christian-doctrine.txt")
result_df = df.selectExpr("explode(split(value, ' ')) as word") \
    .groupBy("word").count().orderBy(desc("count"))

In [11]:
print(f"Word freq desc: {result_df.take(100)}")

[Stage 17:>                                                         (0 + 1) / 1]

Word freq desc: [Row(word='', count=18382), Row(word='the', count=4212), Row(word='of', count=2743), Row(word='to', count=2412), Row(word='and', count=1828), Row(word='in', count=1497), Row(word='is', count=1358), Row(word='that', count=1221), Row(word='a', count=917), Row(word='not', count=815), Row(word='it', count=794), Row(word='be', count=712), Row(word='as', count=661), Row(word='are', count=646), Row(word='which', count=614), Row(word='he', count=576), Row(word='by', count=558), Row(word='they', count=473), Row(word='for', count=461), Row(word='with', count=459), Row(word='we', count=455), Row(word='or', count=422), Row(word='who', count=413), Row(word='but', count=399), Row(word='this', count=368), Row(word='I', count=348), Row(word='when', count=333), Row(word='have', count=331), Row(word='And', count=324), Row(word='what', count=310), Row(word='from', count=309), Row(word='if', count=283), Row(word='For', count=281), Row(word='so', count=281), Row(word='his', count=278), Row(

                                                                                