## Topics:
- Dataframe
- Reading the data from CSV
- Checking the datatypes of the columns in the Dataframe and also Infer schema with the inferSchema' argument in ...read.csv(...) / .option(...).csv(...) method.
- Select columns and indexing
- Check describe option similar to pandas
- Adding new columns
- Dropping columns
- Renaming columns

In [138]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkOperation').getOrCreate()
spark

In [139]:
df = spark.read.csv('data/DND_20250708.csv', header=True, inferSchema=True)

In [140]:
# Print Schema of the dataframe
df.printSchema() # Gives a tree structure of the dataframe

root
 |-- NUMBER: string (nullable = true)
 |-- TE: integer (nullable = true)



In [141]:
# list all columns in the dataframe, outputs a python list with string values.
df.columns


['NUMBER', 'TE']

In [142]:
# Show dataframe and rows
df.show(5)  # Show the first 5 rows

df.head(5)  # Get the first 5 rows as a list of Row objects

df.count()  # Count the number of rows in the DataFrame

df.tail(5)  # Get the last 5 rows as a list of Row objects

df.describe().show()  # Get summary statistics for numeric columns


+--------+----+
|  NUMBER|  TE|
+--------+----+
|54283248|  89|
|54283299|3234|
|54283313|  43|
|54283321|  23|
|54283332| 232|
+--------+----+
only showing top 5 rows

+-------+-------------------+------------------+
|summary|             NUMBER|                TE|
+-------+-------------------+------------------+
|  count|              55570|                 5|
|   mean|5.700295175958753E7|             724.2|
| stddev|  4352739.084227369|1405.3959228630201|
|    min|            5713508|                23|
|    max|             NUMBER|              3234|
+-------+-------------------+------------------+



In [143]:
# SELECTING COLUMNS:

# df.columnName #  Works only if column name is a valid Python identifier (e.g. no spaces, special characters).
# df['columnName'] # This is more robust and supports any column name (even with spaces or special chars).

from pyspark.sql.functions import col
# df.select(col("NUMBER"), col('NUMBER'))# col(): Preferred when chaining expressions like transformations, filters, or with withColumn().
# df.select('NUMBER', 'NUMBER') # to return a DataFrame with one or more columns

'''
# df.columns[0] # using the index to access the first column, from the .columns list; 
# df.select(df.columns[0])
'''

df.select(
    (col('NUMBER') / 1000).alias('PHONE_NUMBER'),
    col('TE').alias('TE')
    ).show(10)  # Show the first 3 rows of the 'NUMBER' column with an alias



+------------+----+
|PHONE_NUMBER|  TE|
+------------+----+
|   54283.248|  89|
|   54283.299|3234|
|   54283.313|  43|
|   54283.321|  23|
|   54283.332| 232|
|   54283.333|NULL|
|   54283.335|NULL|
|   54283.354|NULL|
|   54283.355|NULL|
|   54283.367|NULL|
+------------+----+
only showing top 10 rows



In [None]:
print(df.dtypes, "\n") # Get the data types of each column in the DataFrame as a list of tuples [(column_name, data_type), ...]
print(df.schema, "\n")  # Get the schema of the DataFrame, which includes column names and data types
df.printSchema()  # Print the schema of the DataFrame in a tree format

[('NUMBER', 'string'), ('TE', 'int')] 

StructType([StructField('NUMBER', StringType(), True), StructField('TE', IntegerType(), True)]) 

root
 |-- NUMBER: string (nullable = true)
 |-- TE: integer (nullable = true)



In [132]:
# Adding a new column to the DataFrame
from pyspark.sql.functions import lit, concat
d = df.withColumn(colName='Generated', col=concat(lit('230'), col("NUMBER").cast('string'))) # Add a new column 'Generated' by concatenating '230' with the 'NUMBER' column

d.schema  # Show the DataFrame with the new column
d.show(2)

+--------+----+-----------+
|  NUMBER|  TE|  Generated|
+--------+----+-----------+
|54283248|  89|23054283248|
|54283299|3234|23054283299|
+--------+----+-----------+
only showing top 2 rows



In [136]:
# Dropping a column from the DataFrame

dropped_df = d.drop('Generated', 'Te')  # Drop duplicates based on the 'Generated Column'
dropped_df.show(2)  # Show the first 5 rows of the DataFrame after dropping duplicates

# Renaming a column in the DataFrame

renamed_df = dropped_df.withColumnRenamed(existing='NUMBER', new='PHONE_NUMBER')  # Rename the 'NUMBER' column to 'PHONE_NUMBER'
renamed_df.show(2)  # Show the first 5 rows of the DataFrame after renaming the column

+--------+
|  NUMBER|
+--------+
|54283248|
|54283299|
+--------+
only showing top 2 rows

+------------+
|PHONE_NUMBER|
+------------+
|    54283248|
|    54283299|
+------------+
only showing top 2 rows

