In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, StructType

In [2]:
spark = (
    SparkSession
    .builder
    .appName("MyApp")
    .getOrCreate()
)

In [5]:
spark

In [32]:
df = spark.read.csv("users.csv", header=True, inferSchema=True)

In [33]:
df.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [34]:
df.describe()

DataFrame[summary: string, user_id: string, name: string, age: string, city: string]

In [35]:
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



In [36]:
df.columns

['user_id', 'name', 'age', 'city']

In [37]:
#from pyspark.sql.types import StructField, StringType, IntegerType, StructType, LongType, TimestampType, DecimalType

data_schema = [
    StructField("user_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("age", StringType(), True),
    StructField("city", StringType(), True)
]

In [41]:
print(*[el for el in data_schema], sep='\n')

StructField('user_id', StringType(), True)
StructField('name', StringType(), True)
StructField('age', StringType(), True)
StructField('city', StringType(), True)


In [42]:
df['age']

Column<'age'>

In [44]:
type(df["age"])

pyspark.sql.column.Column

In [45]:
df.select("age")

DataFrame[age: int]

In [46]:
df.select("age").show()

+---+
|age|
+---+
| 25|
| 30|
| 22|
| 35|
| 28|
+---+



In [47]:
type(df.select("age").show())

+---+
|age|
+---+
| 25|
| 30|
| 22|
| 35|
| 28|
+---+



NoneType

In [49]:
final_struct = StructType(fields=data_schema)

In [51]:
df2 = spark.read.csv("users.csv", schema=final_struct)

In [52]:
df2.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|user_id| name|age|  city|
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [53]:
df2.describe()

DataFrame[summary: string, user_id: string, name: string, age: string, city: string]

In [54]:
# Получить два первых объекта
df.head(2)

[Row(user_id=1, name='Анна', age=25, city='Москва'),
 Row(user_id=2, name='Иван', age=30, city='СПБ')]

In [56]:
# ВЫбор объектов через lis object
df.select(['user_id', 'name']).show()

+-------+-----+
|user_id| name|
+-------+-----+
|      1| Анна|
|      2| Иван|
|      3|Мария|
|      4| Петр|
|      5|Елена|
+-------+-----+



In [58]:
# Добавление новой колонки "newage"
df.withColumn('newage', df['age']).show()

+-------+-----+---+------+------+
|user_id| name|age|  city|newage|
+-------+-----+---+------+------+
|      1| Анна| 25|Москва|    25|
|      2| Иван| 30|   СПБ|    30|
|      3|Мария| 22|Москва|    22|
|      4| Петр| 35|Казань|    35|
|      5|Елена| 28|   СПБ|    28|
+-------+-----+---+------+------+



In [59]:
df.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [60]:
# Переименование колонки

In [61]:
df.withColumnRenamed('age', 'возраст')

DataFrame[user_id: int, name: string, возрастхузраст: int, city: string]

In [62]:
df.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [63]:
df3 = df.withColumnRenamed('age', 'возраст')

In [64]:
df3.show()

+-------+-----+--------------+------+
|user_id| name|возрастхузраст|  city|
+-------+-----+--------------+------+
|      1| Анна|            25|Москва|
|      2| Иван|            30|   СПБ|
|      3|Мария|            22|Москва|
|      4| Петр|            35|Казань|
|      5|Елена|            28|   СПБ|
+-------+-----+--------------+------+



In [65]:
# withColumn Может принимать не только колонки, но и функции
# Над добавленной колонкой можно производить действия
df.withColumn('doubleage', df['age'] * 2).show()

+-------+-----+---+------+---------+
|user_id| name|age|  city|doubleage|
+-------+-----+---+------+---------+
|      1| Анна| 25|Москва|       50|
|      2| Иван| 30|   СПБ|       60|
|      3|Мария| 22|Москва|       44|
|      4| Петр| 35|Казань|       70|
|      5|Елена| 28|   СПБ|       56|
+-------+-----+---+------+---------+



In [66]:
df.withColumn('add_one_age', df['age'] + 1).show()

+-------+-----+---+------+-----------+
|user_id| name|age|  city|add_one_age|
+-------+-----+---+------+-----------+
|      1| Анна| 25|Москва|         26|
|      2| Иван| 30|   СПБ|         31|
|      3|Мария| 22|Москва|         23|
|      4| Петр| 35|Казань|         36|
|      5|Елена| 28|   СПБ|         29|
+-------+-----+---+------+-----------+



In [67]:
df.withColumn('half_age', df['age'] / 2).show()

+-------+-----+---+------+--------+
|user_id| name|age|  city|half_age|
+-------+-----+---+------+--------+
|      1| Анна| 25|Москва|    12.5|
|      2| Иван| 30|   СПБ|    15.0|
|      3|Мария| 22|Москва|    11.0|
|      4| Петр| 35|Казань|    17.5|
|      5|Елена| 28|   СПБ|    14.0|
+-------+-----+---+------+--------+



In [68]:
df.withColumn('half_age', df["age"] / 2)

DataFrame[user_id: int, name: string, age: int, city: string, half_age: double]

In [70]:
# Using SQL

In [86]:
df.createOrReplaceTempView("users")

In [87]:
sql_result = spark.sql("SELECT * FROM users")

In [88]:
sql_result.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [90]:
sql_result

DataFrame[user_id: int, name: string, age: int, city: string]

In [92]:
spark.sql("SELECT user_id, name, age FROM users WHERE age = 22").show()

+-------+-----+---+
|user_id| name|age|
+-------+-----+---+
|      3|Мария| 22|
+-------+-----+---+

