In [48]:
from pyspark.sql import SparkSession

In [49]:
# Create the Spark session for this notebook (or reuse one that already
# exists), registered under the application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [50]:
# Echo the session object so the notebook displays it as this cell's output.
spark

In [51]:
# Read test1.csv with no reader options set. As the table printed below this
# shows, the columns are then named _c0.._c3 and the CSV's header line
# ("Name", "age", ...) is treated as an ordinary data row.
df_spark = spark.read.csv("test1.csv")

In [52]:
# Print the DataFrame contents as a text table.
df_spark.show()

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [53]:
# Re-read the file, this time telling the reader that the first line holds
# the column names, then print the result.
df_spark = (
    spark.read
    .option("header", "true")
    .csv("test1.csv")
)
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [54]:
# Check what kind of object the reader returned (a PySpark DataFrame).
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [55]:
# Fetch the first three rows as a list of Row objects. Every value is still a
# string at this point (e.g. age='31').
df_spark.head(3)

[Row(Name='Krish', age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', age='29', Experience='4', Salary='20000')]

In [56]:
# Print the column names and types. All four columns are reported as string
# here; compare with the schema printed after enabling inferSchema.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [57]:
# inferSchema=true makes Spark infer each column's type from the data instead
# of reading every column as a string (here age, Experience and Salary become
# integer, while Name stays string). Both reader settings are passed through
# .option() so they are configured the same way as in the header-only read.
df_spark = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("test1.csv")
)
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [58]:
# List the column names.
df_spark.columns

['Name', 'age', 'Experience', 'Salary']

In [59]:
# List (column name, type name) pairs for every column.
df_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [60]:
# Project a single column; select() returns a new DataFrame, which is then
# printed.
df_spark.select("Name").show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [61]:
# Project several columns at once by passing each name as its own argument.
df_spark.select("Name", "age").show()

+---------+---+
|     Name|age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
|   Harsha| 21|
|  Shubham| 23|
+---------+---+



In [62]:
# Summary statistics (count, mean, stddev, min, max) for every column. For the
# string column Name, mean and stddev come back as null.
df_spark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [63]:
# Add a boolean column that is true for rows whose age is greater than 25,
# then print the widened DataFrame.
df_spark = df_spark.withColumn(
    "Over 25 years old",
    df_spark["age"] > 25,
)
df_spark.show()

+---------+---+----------+------+-----------------+
|     Name|age|Experience|Salary|Over 25 years old|
+---------+---+----------+------+-----------------+
|    Krish| 31|        10| 30000|             true|
|Sudhanshu| 30|         8| 25000|             true|
|    Sunny| 29|         4| 20000|             true|
|     Paul| 24|         3| 20000|            false|
|   Harsha| 21|         1| 15000|            false|
|  Shubham| 23|         2| 18000|            false|
+---------+---+----------+------+-----------------+



In [65]:
# Drop the "Over 25 years old" column again. drop() returns a new DataFrame,
# so the result is assigned back to df_spark before printing.
df_spark = df_spark.drop("Over 25 years old")
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [66]:
# Rename the "age" column to "Age"; the other columns keep their names.
df_spark = df_spark.withColumnRenamed("age", "Age")
df_spark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

