In [13]:
from pyspark.sql import SparkSession

In [14]:
spark = (
    SparkSession
    .builder
    .appName("MyApplication")
    .getOrCreate()
)

In [18]:
df = spark.read.csv("users.csv", header=True)

In [19]:
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)



In [20]:
df.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [21]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType


In [25]:
schema = StructType([
    StructField(name="user_id", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="age", dataType=IntegerType(), nullable=True),
    StructField(name="city", dataType=StringType(), nullable=True)
])

In [26]:
df2 = spark.read.csv("users.csv", header=True, schema=schema)

In [27]:
df2.show()


+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [29]:
df2.describe()

DataFrame[summary: string, user_id: string, name: string, age: string, city: string]

In [31]:
df2.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



In [35]:
df3 = spark.read.csv("users.csv", header=True,  inferSchema=True)

In [36]:
df3.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



In [37]:
df.show()

+-------+-----+---+------+
|user_id| name|age|  city|
+-------+-----+---+------+
|      1| Анна| 25|Москва|
|      2| Иван| 30|   СПБ|
|      3|Мария| 22|Москва|
|      4| Петр| 35|Казань|
|      5|Елена| 28|   СПБ|
+-------+-----+---+------+



In [38]:
%%bash
head -10 users.csv

user_id,name,age,city
1,Анна,25,Москва
2,Иван,30,СПБ
3,Мария,22,Москва
4,Петр,35,Казань
5,Елена,28,СПБ

In [47]:
json_path = 'data/browser_events.jsonl'
df = spark.read.json(json_path, multiLine=False)

In [41]:
df.show()

+----------------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|browser_language|browser_name|  browser_user_agent|            click_id|            event_id|     event_timestamp|event_type|
+----------------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|          sat_IN|      Chrome|Mozilla/5.0 (Linu...|811320f1-3bc2-42b...|8cca1c7d-b0cc-473...|2022-11-28 20:51:...|  pageview|
|          tcy_IN|     Firefox|Mozilla/5.0 (Wind...|dd5a2b23-3357-4bb...|692733df-429d-420...|2022-11-28 20:51:...|  pageview|
|          sat_IN|      Chrome|Mozilla/5.0 (Linu...|811320f1-3bc2-42b...|1dc17a5c-3a3b-47f...|2022-11-28 20:51:...|  pageview|
|          mni_IN|      Chrome|Mozilla/5.0 (Linu...|7ab8087d-bfd4-4cb...|55318f64-86e5-4a2...|2022-11-28 20:51:...|  pageview|
|           ca_FR|      Chrome|Mozilla/5.0 (Linu...|58cdfc1e-85e0-465...|90462d5d-26b7-486...|2022-11-28 20:51:

In [48]:
df.printSchema()

root
 |-- browser_language: string (nullable = true)
 |-- browser_name: string (nullable = true)
 |-- browser_user_agent: string (nullable = true)
 |-- click_id: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- event_timestamp: string (nullable = true)
 |-- event_type: string (nullable = true)



In [49]:
%%bash
head -20 'data/browser_events.jsonl'

{"event_id": "8cca1c7d-b0cc-4738-be92-c644101e3fff", "event_timestamp": "2022-11-28 20:51:05.627882", "event_type": "pageview", "click_id": "811320f1-3bc2-42b9-a841-5a1e5a812f2d", "browser_name": "Chrome", "browser_user_agent": "Mozilla/5.0 (Linux; Android 2.3.5) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/32.0.833.0 Safari/531.2", "browser_language": "sat_IN"}
{"event_id": "692733df-429d-4208-a1d5-b1594895fa3c", "event_timestamp": "2022-11-28 20:51:10.914882", "event_type": "pageview", "click_id": "dd5a2b23-3357-4bba-857a-1afdb45f9144", "browser_name": "Firefox", "browser_user_agent": "Mozilla/5.0 (Windows NT 5.0; iw-IL; rv:1.9.1.20) Gecko/2012-11-11 04:47:57 Firefox/3.6.11", "browser_language": "tcy_IN"}
{"event_id": "1dc17a5c-3a3b-47fc-8b9f-9961d42cb56f", "event_timestamp": "2022-11-28 20:51:14.060882", "event_type": "pageview", "click_id": "811320f1-3bc2-42b9-a841-5a1e5a812f2d", "browser_name": "Chrome", "browser_user_agent": "Mozilla/5.0 (Linux; Android 2.3.5) AppleWebKit/531.2 (