## Topics:
- Dataframe
- Reading the data from CSV
- Checking the data types of the columns in the DataFrame, and inferring the schema with the `inferSchema` argument of `spark.read.csv(...)` / `spark.read.option(...).csv(...)`.
- Select columns and indexing
- Summary statistics with `describe()`, similar to pandas
- Adding new columns
- Dropping columns
- Renaming columns

In [1]:
from pyspark.sql import SparkSession


# Create the SparkSession for this notebook, or reuse the one that already
# exists (getOrCreate returns the existing session when there is one).
spark = (
    SparkSession.builder
    .appName('SparkOperation')
    .getOrCreate()
)

# Last expression of the cell: display the session object.
spark

In [2]:
# Load the CSV into a DataFrame.
#   header=True      -> take the column names from the first line of the file.
#   inferSchema=True -> schema inference: Spark takes a best guess at what the
#                       type of each column should be, based on the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./data/testdata.csv')
)

In [3]:
# Print the schema of the DataFrame: one line per column with its name,
# data type and nullability.
df.printSchema() # Rendered as a tree rooted at `root`

root
 |-- MSISDN: integer (nullable = true)
 |-- NID: string (nullable = true)
 |-- NID_LEN: integer (nullable = true)
 |-- CUSTOMERIDNAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- GROSS_DATE: string (nullable = true)
 |-- CUSTOMER_TYPE: string (nullable = true)
 |-- USERTYPE: string (nullable = true)
 |-- SUBSCRIBERCAT: string (nullable = true)
 |-- SUBSCRIBERSUBCAT: string (nullable = true)
 |-- SEGMENT: string (nullable = true)
 |-- POSTPAID_TARIFF: string (nullable = true)
 |-- POSTPAID_MAINPRODUCT: string (nullable = true)
 |-- POSTPAID_SUBPRODUCT: string (nullable = true)
 |-- NATIONALITY: string (nullable = true)
 |-- AON_DAYS: integer (nullable = true)
 |-- AON_MONTH: double (nullable = true)
 |-- AON_YEAR: double (nullable = true)
 |-- ALTERNATE_MOB_NUM: string (nullable = true)
 |-- FAVOURITE_LOCATION: string (nullable = true)
 |-- DAYS_INACT  BAL_1: integer (nullable = true)
 |-- BAL_2: double (nullable = true)
 |-- BAL_3: double (nullable = true)
 

In [4]:
# List all column names in the DataFrame; `columns` is a plain Python list of strings.
df.columns

['MSISDN',
 'NID',
 'NID_LEN',
 'CUSTOMERIDNAME',
 'EMAIL',
 'GROSS_DATE',
 'CUSTOMER_TYPE',
 'USERTYPE',
 'SUBSCRIBERCAT',
 'SUBSCRIBERSUBCAT',
 'SEGMENT',
 'POSTPAID_TARIFF',
 'POSTPAID_MAINPRODUCT',
 'POSTPAID_SUBPRODUCT',
 'NATIONALITY',
 'AON_DAYS',
 'AON_MONTH',
 'AON_YEAR',
 'ALTERNATE_MOB_NUM',
 'FAVOURITE_LOCATION',
 'DAYS_INACT  BAL_1',
 'BAL_2',
 'BAL_3',
 'DATA_USERS',
 'PACK_USERS',
 'PACK_90D',
 'PACK_30D',
 'DAILY_PACK_90D',
 'DAILY_PACK_30D',
 'WEEKLY_PACK_90D',
 'WEEKLY_PACK_30D',
 'MONTHLY_PACK_90D',
 'MONTHLY_PACK_30D',
 'RECHARGE_VAL_30D',
 'RECHARGE_VAL_D1',
 'RECHARGE_VAL_D2',
 'RECHARGE_VAL_D3',
 'AIRTIME_USG YOUTH_PACK',
 'TOTAL_ARPU',
 'ONNET_REVENUE',
 'XNET_REVENUE',
 'ONNET_TOTAL_MINS_90D',
 'ONNET_TOTAL_MINS_30D',
 'XNET_TOTAL_MINS_90D',
 'XNET_TOTAL_MINS_30D',
 'VOICE_INCOMING_ARPU SMS_ARPU',
 'DATA_USG_GB_90D',
 'DATA_USG_GB_30D',
 'DATA_PAYG_USG_GB_90D',
 'DATA_PAYG_USG_GB_30D',
 'TOTAL_DATA_ARPU_90D',
 'TOTAL_DATA_ARPU_30D',
 'TOTAL_DATA_PAYG_ARPU_90D',

In [8]:
# Preview the DataFrame: print the first 2 rows as a text table.
df.show(n=2)

# Other ways to inspect the data (uncomment to try):
# df.head(5)            # first 5 rows as a list of Row objects
# df.tail(5)            # last 5 rows as a list of Row objects
# df.count()            # number of rows in the DataFrame
# df.describe().show()  # summary statistics (count, mean, stddev, min, max)


+--------+--------------+-------+--------------------+--------------------+----------+-------------+--------+-------------+----------------+-------+---------------+--------------------+-------------------+-----------+--------+---------+--------+-----------------+------------------+-----------------+------+------+----------+----------+--------+--------+--------------+--------------+---------------+---------------+----------------+----------------+----------------+---------------+---------------+---------------+----------------------+----------+-------------+------------+--------------------+--------------------+-------------------+-------------------+----------------------------+---------------+---------------+--------------------+--------------------+-------------------+-------------------+------------------------+------------------------+--------+------------+----------------+----------------+----------------+-----------+-----------+-----------+------------+------------+------------+-

In [10]:
# SELECTING COLUMNS
#
# Referencing a single column:
#   df.MSISDN      # attribute access: works only if the column name is a valid
#                  # Python identifier (no spaces or special characters)
#   df['MSISDN']   # item access: more robust, supports any column name,
#                  # e.g. 'DAYS_INACT  BAL_1'
#
# Getting a DataFrame back with one or more columns:
#   df.select('MSISDN', 'NID')            # by column name
#   df.select(col('MSISDN'), col('NID'))  # via col(): preferred when chaining
#                                         # expressions (transformations, filters,
#                                         # withColumn)
#   df.select(df.columns[0])              # by position, using the df.columns list

from pyspark.sql.functions import col

# Select a computed column: apply an arithmetic expression to MSISDN and give
# the result a readable name with alias(). The division by 1000 is only there
# to demonstrate an expression; it has no business meaning.
df.select(
    (col('MSISDN') / 1000).alias('PHONE_NUMBER'),
).show(10)  # print the first 10 rows of the aliased column



+------------+
|PHONE_NUMBER|
+------------+
|   54924.133|
|   54846.497|
|   57369.115|
|   57113.437|
|   58468.805|
|   57228.141|
|   57119.226|
|   57118.074|
|   54881.304|
|   57320.678|
+------------+
only showing top 10 rows



In [11]:
# Two programmatic views of the column types, each followed by a blank line:
#   df.dtypes -> list of (column_name, data_type) tuples
#   df.schema -> the schema object holding column names and data types
for _type_info in (df.dtypes, df.schema):
    print(_type_info, "\n")

# The same information printed in tree format.
df.printSchema()

[('MSISDN', 'int'), ('NID', 'string'), ('NID_LEN', 'int'), ('CUSTOMERIDNAME', 'string'), ('EMAIL', 'string'), ('GROSS_DATE', 'string'), ('CUSTOMER_TYPE', 'string'), ('USERTYPE', 'string'), ('SUBSCRIBERCAT', 'string'), ('SUBSCRIBERSUBCAT', 'string'), ('SEGMENT', 'string'), ('POSTPAID_TARIFF', 'string'), ('POSTPAID_MAINPRODUCT', 'string'), ('POSTPAID_SUBPRODUCT', 'string'), ('NATIONALITY', 'string'), ('AON_DAYS', 'int'), ('AON_MONTH', 'double'), ('AON_YEAR', 'double'), ('ALTERNATE_MOB_NUM', 'string'), ('FAVOURITE_LOCATION', 'string'), ('DAYS_INACT  BAL_1', 'int'), ('BAL_2', 'double'), ('BAL_3', 'double'), ('DATA_USERS', 'double'), ('PACK_USERS', 'string'), ('PACK_90D', 'string'), ('PACK_30D', 'int'), ('DAILY_PACK_90D', 'int'), ('DAILY_PACK_30D', 'int'), ('WEEKLY_PACK_90D', 'int'), ('WEEKLY_PACK_30D', 'int'), ('MONTHLY_PACK_90D', 'int'), ('MONTHLY_PACK_30D', 'int'), ('RECHARGE_VAL_30D', 'int'), ('RECHARGE_VAL_D1', 'double'), ('RECHARGE_VAL_D2', 'double'), ('RECHARGE_VAL_D3', 'double'), ('

In [13]:
# ADDING A NEW COLUMN
# `col` is imported in this cell too, so the cell runs on its own without
# relying on an import made in a different cell.
from pyspark.sql.functions import col, concat, lit

# withColumn returns a new DataFrame with an extra string column 'Phone':
# the literal prefix '230' concatenated with MSISDN cast to string.
d = df.withColumn('Phone', concat(lit('230'), col('MSISDN').cast('string')))

d.show(2)  # print the first 2 rows, now including the 'Phone' column

+--------+--------------+-------+--------------------+--------------------+----------+-------------+--------+-------------+----------------+-------+---------------+--------------------+-------------------+-----------+--------+---------+--------+-----------------+------------------+-----------------+------+------+----------+----------+--------+--------+--------------+--------------+---------------+---------------+----------------+----------------+----------------+---------------+---------------+---------------+----------------------+----------+-------------+------------+--------------------+--------------------+-------------------+-------------------+----------------------------+---------------+---------------+--------------------+--------------------+-------------------+-------------------+------------------------+------------------------+--------+------------+----------------+----------------+----------------+-----------+-----------+-----------+------------+------------+------------+-

In [15]:
# DROPPING A COLUMN
# drop() returns a new DataFrame without the 'Phone' column.
dropped_df = d.drop('Phone')
dropped_df.show(n=2)  # first 2 rows after the column has been dropped

# RENAMING A COLUMN
# Rename the 'MSISDN' column to 'PHONE_NUMBER'.
renamed_df = dropped_df.withColumnRenamed('MSISDN', 'PHONE_NUMBER')
renamed_df.show(n=2)  # first 2 rows after the rename

+--------+--------------+-------+--------------------+--------------------+----------+-------------+--------+-------------+----------------+-------+---------------+--------------------+-------------------+-----------+--------+---------+--------+-----------------+------------------+-----------------+------+------+----------+----------+--------+--------+--------------+--------------+---------------+---------------+----------------+----------------+----------------+---------------+---------------+---------------+----------------------+----------+-------------+------------+--------------------+--------------------+-------------------+-------------------+----------------------------+---------------+---------------+--------------------+--------------------+-------------------+-------------------+------------------------+------------------------+--------+------------+----------------+----------------+----------------+-----------+-----------+-----------+------------+------------+------------+-