In [2]:
import os

from pyspark.sql import SparkSession

# Spark Connect endpoint for this notebook. Export SPARK_REMOTE to point the
# notebook at a different server; when the variable is unset, the address the
# notebook was originally written against is used.
SPARK_REMOTE_URL = os.environ.get("SPARK_REMOTE", "sc://192.168.2.20:15002")

# Connect to the remote Spark Connect server, or reuse an existing session.
spark = SparkSession.builder.remote(SPARK_REMOTE_URL).getOrCreate()

In [8]:
# Load the 2015 flight summary from JSON; no schema is supplied to the reader.
df = (
    spark.read
    .format("json")
    .load("datasets/2015-summary.json")
)

# Display the schema of the loaded DataFrame.
df.schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', StringType(), True)])

In [11]:
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Hand-written schema: two string columns plus a long "count" column that
# also carries a small metadata dictionary. Every field is declared
# non-nullable.
explicitSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), False),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), False),
        StructField("count", LongType(), False, metadata={"hello": "world"}),
    ]
)

# Read the JSON file with the explicit schema instead of relying on the reader.
edf = (
    spark.read
    .format("json")
    .schema(explicitSchema)
    .load("datasets/2015-summary.json")
)

# NOTE(review): the output recorded for this cell reports nullable=True for
# every field even though the schema above asks for False -- confirm how the
# JSON reader treats the nullable flag before relying on it.
edf.schema


StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

In [24]:
from pyspark.sql.functions import col, column

# Column references built by name with col() and column().
cc = col("columnaChiquitita")
cg = column("columnaGrandotota")

# Print a by-name column reference and one obtained as a DataFrame attribute,
# one per line.
print(cc, edf.DEST_COUNTRY_NAME, sep="\n")

# Example of using both constructors inside a select:
# edf.select(col("DEST_COUNTRY_NAME"), column("ORIGIN_COUNTRY_NAME")).show(2)

Column<'columnaChiquitita'>
Column<'DEST_COUNTRY_NAME'>


In [None]:
from pyspark.sql.functions import col, expr

# Three spellings of "count minus five":
#   expr("count") - 5     -> expression from a SQL string, arithmetic in Python
#   expr("count - 5")     -> the whole expression as a SQL string
#   col("count") - 5      -> column reference, arithmetic in Python
# The fourth column is a compound boolean expression built from col().
edf.select(
    expr("count") - 5,
    expr("count - 5"),
    col("count") - 5,
    (((col("count") - 5) * 200) - 6) < col("count"),
).show(10)

# The compound expression on its own, left as the cell's final value.
(((col("count") - 5) * 200) - 6) < col("count")
# SQL-string form of the same expression:
# expr("(((count - 5) * 200) - 6) < count")



+-----------+-----------+-----------+-----------------------------------+
|(count - 5)|(count - 5)|(count - 5)|((((count - 5) * 200) - 6) < count)|
+-----------+-----------+-----------+-----------------------------------+
|         10|         10|         10|                              false|
|         -4|         -4|         -4|                               true|
|        339|        339|        339|                              false|
|         10|         10|         10|                              false|
|         57|         57|         57|                              false|
|         -4|         -4|         -4|                               true|
|         57|         57|         57|                              false|
|        583|        583|        583|                              false|
|         35|         35|         35|                              false|
|         -4|         -4|         -4|                               true|
+-----------+-----------+-----------+-----------------------------------+

Column<'None'>

In [None]:
from pyspark.sql import Row

# First record of the DataFrame.
print(edf.first())

# Build a Row positionally, then print the row followed by each of its four
# values in order.
myRow = Row("Hello", None, 1, False)
print(myRow, *myRow)

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)
<Row('Hello', None, 1, False)> Hello None 1 False


In [58]:
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Schema for a hand-built DataFrame: two nullable string columns and one
# non-nullable long column.
myManualSchema = StructType(
    [
        StructField("some", StringType(), True),
        StructField("col", StringType(), True),
        StructField("names", LongType(), False),
    ]
)

# A single row whose values line up positionally with the schema fields.
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], schema=myManualSchema)

myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|NULL|    1|
+-----+----+-----+



In [68]:
# Register the DataFrame as a temporary view named "edf" for spark.sql queries.
edf.createOrReplaceTempView("edf")

# Further SQL queries that can be tried against the view:
# spark.sql("SELECT * FROM edf").show(2)
# spark.sql("SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM edf").show(2)
# spark.sql("SELECT DEST_COUNTRY_NAME as destination, ORIGIN_COUNTRY_NAME as origin, count * 10 FROM edf").show(2)

# SQL form: spark.sql("SELECT DEST_COUNTRY_NAME FROM edf").show(2)
edf.select("DEST_COUNTRY_NAME").show(2)

# SQL form: spark.sql("SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME FROM edf").show(2)
edf.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

# The same column selected three times, referenced via expr(), col() and
# column() respectively.
edf.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME"),
).show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows

+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows



In [72]:
# Alias given inside the SQL expression string.
edf.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

# .alias() chained onto an expression that already carries an SQL alias; the
# output recorded for this cell shows the column headed "DEST_COUNTRY_NAME".
edf.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")).show(2)



+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [81]:
# selectExpr takes SQL expression strings directly.
edf.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

# SELECT *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry FROM edf LIMIT 2
edf.selectExpr(
    "*",  # all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry",
).show(2)

# SELECT avg(count), count(distinct(DEST_COUNTRY_NAME)) FROM edf LIMIT 2
edf.selectExpr(
    "avg(count)",
    "count(distinct(DEST_COUNTRY_NAME))",
).show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



In [None]:
from pyspark.sql.functions import lit

# Append a constant column named "One" to all existing columns.
# SELECT *, 1 as One FROM edf LIMIT 2
edf.select(
    expr("*"),
    lit(1).alias("One"),
).show(2)


+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows

