## Intro to DataFrames with PySpark

In [24]:
from pyspark.sql import SparkSession

In [25]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [26]:
# Display the session object to confirm it was created.
spark

In [44]:
# Read the dataset: the first CSV line supplies the column names and
# inferSchema lets Spark pick column types instead of reading everything as strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('files/test2.csv', inferSchema=True)
)


In [45]:
# Print the loaded rows as a text table; missing CSV values appear as NULL.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [46]:
# Check which class the reader returned.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [47]:
# Fetch the first three rows as a list of Row objects.
df_pyspark.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [48]:
# Project the DataFrame down to two columns and print the result.
df_pyspark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
|   Mahesh|      NULL|
|     NULL|        10|
|     NULL|      NULL|
+---------+----------+



In [49]:
# List each column's name together with its inferred type.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [50]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df_pyspark.describe().show()

+-------+------+------------------+------------------+-----------------+
|summary|  Name|               age|        Experience|           Salary|
+-------+------+------------------+------------------+-----------------+
|  count|     7|                 8|                 7|                8|
|   mean|  NULL|              28.5| 5.428571428571429|          25750.0|
| stddev|  NULL|5.3718844791323335|3.8234863173611093|9361.776388210581|
|    min|Harsha|                21|                 1|            15000|
|    max| Sunny|                36|                10|            40000|
+-------+------+------------------+------------------+-----------------+



### Adding columns to a DataFrame

In [51]:
# Add a derived column: each person's experience two years from now.
# Rows whose Experience is NULL get NULL in the new column as well.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 year',
    df_pyspark['Experience'] + 2,
)

In [52]:
# Print the DataFrame again to see the newly added column.
df_pyspark.show()

+---------+----+----------+------+-----------------------+
|     Name| age|Experience|Salary|Experience After 2 year|
+---------+----+----------+------+-----------------------+
|    Krish|  31|        10| 30000|                     12|
|Sudhanshu|  30|         8| 25000|                     10|
|    Sunny|  29|         4| 20000|                      6|
|     Paul|  24|         3| 20000|                      5|
|   Harsha|  21|         1| 15000|                      3|
|  Shubham|  23|         2| 18000|                      4|
|   Mahesh|NULL|      NULL| 40000|                   NULL|
|     NULL|  34|        10| 38000|                     12|
|     NULL|  36|      NULL|  NULL|                   NULL|
+---------+----+----------+------+-----------------------+



In [53]:
# Shut down the current Spark session; the next section creates a new one.
spark.stop()

### Dropping columns and rows

In [54]:
from pyspark.sql import SparkSession

# The previous session was stopped, so build a new one for this section.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [55]:
# Reload the raw CSV with the new session (header row as column names, types inferred).
# This replaces the earlier df_pyspark, so the derived column added above is gone.
df_pyspark = spark.read.csv(
    'files/test2.csv',
    header=True,
    inferSchema=True,
)

### Drop the columns

In [56]:
# Show the data without the Name column. The result is not assigned back,
# so df_pyspark itself still contains Name in the cells that follow.
df_pyspark.drop('Name').show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|NULL|      NULL| 40000|
|  34|        10| 38000|
|  36|      NULL|  NULL|
+----+----------+------+



In [57]:
# how='any': drop a row if at least one of its columns is null.
df_pyspark.na.drop(how='any').show()

# thresh=2: keep only rows that have at least 2 non-null values.
# When thresh is given it overrides `how`, so `how` is left out here.
df_pyspark.na.drop(thresh=2).show()

# how='all': drop a row only if every column in it is null.
df_pyspark.na.drop(how='all').show()

# subset=['age']: only the 'age' column is checked, so rows with a null age are dropped.
df_pyspark.na.drop(how='any', subset=['age']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 

In [58]:
# Null check restricted to the 'age' column: rows with a null age are removed,
# while nulls in the other columns are tolerated.
df_pyspark.dropna(how='any', subset=['age']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



### Filling the missing values

In [70]:
# Replace nulls column by column: a placeholder label for the string column
# and 0 for each of the integer columns.
df_filled = df_pyspark.na.fill(
    {'Name': 'Missing values', 'age': 0, 'Experience': 0, 'Salary': 0}
)

df_filled.show()


+--------------+---+----------+------+
|          Name|age|Experience|Salary|
+--------------+---+----------+------+
|         Krish| 31|        10| 30000|
|     Sudhanshu| 30|         8| 25000|
|         Sunny| 29|         4| 20000|
|          Paul| 24|         3| 20000|
|        Harsha| 21|         1| 15000|
|       Shubham| 23|         2| 18000|
|        Mahesh|  0|         0| 40000|
|Missing values| 34|        10| 38000|
|Missing values| 36|         0|     0|
+--------------+---+----------+------+



### Imputing missing values with the mean

In [75]:
from pyspark.ml.feature import Imputer

# Mean imputation for the three numeric columns. Each result is written to a
# new "<column>_imputed" column, so the original columns stay untouched.
imputer = Imputer(
    strategy='mean',
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=[f"{name}_imputed" for name in ['age', 'Experience', 'Salary']],
)

# fit() learns the per-column means; transform() appends the imputed columns.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         28|                 5|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+---------+----+----------+-

### Filter Operations

In [79]:
# Read the dataset used for the filter examples (header row as column names,
# column types inferred) and print it.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('files/test1.csv', inferSchema=True)
)
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [87]:
# People whose salary is less than or equal to 20000.
# The condition is given as a SQL expression string.
df_pyspark.where('Salary<=20000').show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [88]:
# Same salary condition, but keep only the Name and age columns of the matching rows.
df_pyspark.where('Salary<=20000').select('Name', 'age').show()

+-------+---+
|   Name|age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [95]:
# Both conditions must hold: salary at most 20000 AND age at most 23.
# Each comparison needs its own parentheses because & binds tighter than <=.
df_pyspark.filter(
    (df_pyspark['Salary'] <= 20000)
    & (df_pyspark['age'] <= 23)
).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [96]:
# Either condition may hold (logical OR between two column comparisons).
# NOTE(review): every salary <= 15000 is also <= 20000, so the second condition
# never adds rows and the result equals the single-condition filter above.
# Confirm which bounds were intended for this OR example.
df_pyspark.filter(
    (df_pyspark['Salary'] <= 20000)
    | (df_pyspark['Salary'] <= 15000)
).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



### Aggregate and GroupBy

In [98]:
# Load the per-department salary data (header row as column names, types inferred)
# and print it.
df_pyspark = spark.read.csv(
    'files/test3.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [101]:
### Groupby

# Number of rows recorded for each name
df_pyspark.groupBy('Name').count().show()

# Sum of the numeric columns (here: salary) for each name -- a total, not a maximum
df_pyspark.groupBy('Name').sum().show()

+---------+-----+
|     Name|count|
+---------+-----+
|Sudhanshu|    3|
|    Sunny|    2|
|    Krish|    3|
|   Mahesh|    2|
+---------+-----+



In [102]:
### Aggregate

# Total expenditure on salaries across all rows
# NOTE(review): the column is named 'salary' in this file but is referenced as 'Salary';
# the output header shows this resolves, presumably case-insensitively -- confirm.
df_pyspark.agg({'Salary': 'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



In [103]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()