In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=77a288869c0a7da2dde371144c717e5e02a54111520eff22de496d9ba8356508
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


Importing the packages

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
import pyspark.sql.functions as func

Creating the SparkSession

In [5]:
# Obtain the notebook's SparkSession: getOrCreate() reuses an already-running
# session if there is one, otherwise it starts a new one named "FirstApp".
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

Defining the schema for the DataFrame

In [9]:
# Explicit schema for the CSV: four columns in file order, each declared
# nullable (third StructField argument is True).
myschema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("userID", IntegerType),
        ("name", StringType),
        ("age", IntegerType),
        ("friends", IntegerType),
    )
])

In [13]:
# Display the schema object to confirm the field names, types and nullability.
myschema

StructType([StructField('userID', IntegerType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('friends', IntegerType(), True)])

Creating a DataFrame from a CSV file

In [12]:
# Read the CSV into a DataFrame using the explicit schema instead of inference.
# No "header" option is set here, so the reader's default header handling applies.
people = (
    spark.read
    .format("csv")
    .schema(myschema)
    .option("path", "/content/fakeFriends.csv")
    .load()
)

In [15]:
# Show the DataFrame's column names and types; as the cell's last expression
# it is displayed without needing print().
people

DataFrame[userID: int, name: string, age: int, friends: int]


Applying the transformations (select, filter, add timestamp, sort)

In [16]:
# Build the result set: take the four source columns, keep rows with age < 30,
# add an `insert_ts` column holding the current timestamp, and sort by userID.
output = (
    people
    .select("userID", "name", "age", "friends")
    .filter(people.age < 30)
    .withColumn("insert_ts", func.current_timestamp())
    .orderBy("userID")
)

In [18]:
# Count the rows in `output`, i.e. the rows that passed the age < 30 filter.
output.count()

112

Creating a Temp View

In [22]:
# Register `output` as a temporary view so it can be queried through spark.sql().
# NOTE(review): the view is named "people" but it exposes the filtered `output`
# rows, not the full `people` DataFrame — the shared name is easy to misread.
output.createOrReplaceTempView("people")

In [23]:
# Query the "people" temp view with SQL and print a preview of the result.
spark.sql(
    "select name, age,friends, insert_ts from people"
).show()

+--------+---+-------+--------------------+
|    name|age|friends|           insert_ts|
+--------+---+-------+--------------------+
|Jean-Luc| 26|      2|2023-05-26 16:56:...|
|    Hugh| 27|    181|2023-05-26 16:56:...|
|  Weyoun| 22|    323|2023-05-26 16:56:...|
|   Miles| 19|    268|2023-05-26 16:56:...|
|  Julian| 25|      1|2023-05-26 16:56:...|
|     Ben| 21|    445|2023-05-26 16:56:...|
|  Julian| 22|    100|2023-05-26 16:56:...|
|     Nog| 26|    281|2023-05-26 16:56:...|
| Beverly| 27|    305|2023-05-26 16:56:...|
|    Morn| 25|     96|2023-05-26 16:56:...|
|   Brunt| 24|     49|2023-05-26 16:56:...|
|     Nog| 20|      1|2023-05-26 16:56:...|
| Beverly| 19|    269|2023-05-26 16:56:...|
|   Brunt| 19|      5|2023-05-26 16:56:...|
|  Geordi| 20|    100|2023-05-26 16:56:...|
|  Geordi| 21|    477|2023-05-26 16:56:...|
|  Kasidy| 22|    179|2023-05-26 16:56:...|
|   Brunt| 20|    384|2023-05-26 16:56:...|
|     Ben| 28|    311|2023-05-26 16:56:...|
|    Worf| 24|    492|2023-05-26