In [None]:
# Create a quick DataFrame
# Make the year uniform (four digits)
# Drop the duplicates

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

if __name__ == '__main__':

    # Start (or reuse) a Spark session named 'mis transformations' with the
    # master set to local[*].
    session_builder = SparkSession.builder.appName('mis transformations')
    spark = session_builder.master('local[*]').getOrCreate()

    # Sample records: (FirstName, LastName, Age, City, Year, Day, Month).
    # Some years are two-digit on purpose; they are normalized further below.
    data = [
        ('John', 'Doe', 28, 'New York', 2006, 12, 12),
        ('Jane', 'Smith', 34, 'Los Angeles', 2010, 15, 12),
        ('Sam', 'Brown', 22, 'Chicago', 84, 20, 12),
        ('Emily', 'Davis', 29, 'Houston', 96, 25, 12),
        ('Michael', 'Johnson', 31, 'Phoenix', 98, 31, 12),
        ('Jane', 'Smith', 34, 'Los Angeles', 2010, 30, 12),
    ]

    # Explicit schema so every column gets its intended type up front
    # instead of relying on type inference.
    schema = (
        StructType()
        .add('FirstName', StringType())
        .add('LastName', StringType())
        .add('Age', IntegerType())
        .add('City', StringType())
        .add('Year', IntegerType())
        .add('Day', IntegerType())
        .add('Month', IntegerType())
    )

    spark_df = spark.createDataFrame(data, schema=schema)

    # Rectify the year: a two-digit year is assumed to belong to the 1900s,
    # so 1900 is added to it; any other year is kept as it is.
    year_is_two_digits = length(col('Year')) == 2
    rectified_year = when(year_is_two_digits, col('Year') + 1900).otherwise(col('Year'))

    date_rectified_df = spark_df.withColumn('Year', rectified_year)

    # Add a unique id as the first column and keep only the identity columns
    # (Day and Month are not selected).
    # NOTE: monotonically_increasing_id() produces unique, increasing ids,
    # but they are not guaranteed to be consecutive.

    unique_id_df = date_rectified_df.select(
        monotonically_increasing_id().alias('UniqueID'),
        'FirstName',
        'LastName',
        'Age',
        'City',
        'Year',
    )

    # Demonstrating a type-casting pitfall.
    # Type casting matters when data is inherited from files with issues.
    # Defining the schema together with the DataFrame (as done for spark_df)
    # avoids it; here no schema is given and Year arrives as a string.

    data2 = [
        ('John', 'Doe', 28, 'New York', '2006'),
        ('Jane', 'Smith', 34, 'Los Angeles', '2010'),
        ('Sam', 'Brown', 22, 'Chicago', '84'),
        ('Emily', 'Davis', 29, 'Houston', '96'),
        ('Michael', 'Johnson', 31, 'Phoenix', '98'),
        ('Jane', 'Smith', 34, 'Los Angeles', '2010'),
    ]

    # Deliberately no cast: adding 1900 to the string column is what
    # produces the '1984.0'-style values shown in the table below.
    uncast_year = when(
        length(col('Year')) == 2, col('Year') + 1900
    ).otherwise(col('Year'))

    spark_df2 = spark.createDataFrame(
        data2, ['FirstName', 'LastName', 'Age', 'City', 'Year']
    ).withColumn('Year', uncast_year)
#+---------+--------+---+-----------+------+
#|FirstName|LastName|Age|       City|  Year|
#+---------+--------+---+-----------+------+
#|     John|     Doe| 28|   New York|  2006|
#|     Jane|   Smith| 34|Los Angeles|  2010|
#|      Sam|   Brown| 22|    Chicago|1984.0|
#|    Emily|   Davis| 29|    Houston|1996.0|
#|  Michael| Johnson| 31|    Phoenix|1998.0|
#|     Jane|   Smith| 34|Los Angeles|  2010|
#+---------+--------+---+-----------+------+
    # This happened because of the incorrect datatype combined with Spark's automatic type promotion.
    # Year is a string; when we performed the arithmetic operation, the Spark SQL engine promoted it to a floating-point number, and after the operation the column was converted back to a string without the decimals being removed.
    # The solution is to cast explicitly.

# Two ways:
    # First approach: cast within the transformation so Spark doesn't automatically promote or demote the datatype

    # Approach 1: cast inside the transformation.
    # BOTH branches of when/otherwise are cast to integer. If only the
    # arithmetic branch were cast, the otherwise branch would still be a
    # string and Spark would coerce the whole column to a common
    # non-integer type, so Year would not end up as an integer column
    # even though show() prints no decimals.
    spark_df2 = spark.createDataFrame(
        data2,
        ['FirstName','LastName','Age','City','Year']
        ).withColumn(
            'Year',
            when (
                length(col('Year')) == 2,
                col('Year').cast(IntegerType()) + 1900
            ).otherwise(
                col('Year').cast(IntegerType())
            )
        )

# +---------+--------+---+-----------+----+
# |FirstName|LastName|Age|       City|Year|
# +---------+--------+---+-----------+----+
# |     John|     Doe| 28|   New York|2006|
# |     Jane|   Smith| 34|Los Angeles|2010|
# |      Sam|   Brown| 22|    Chicago|1984|
# |    Emily|   Davis| 29|    Houston|1996|
# |  Michael| Johnson| 31|    Phoenix|1998|
# |     Jane|   Smith| 34|Los Angeles|2010|
# +---------+--------+---+-----------+----+
    # You don't see the decimals anymore
    # Recommended approach

    # Second approach: fix the schema by casting every column right after
    # the DataFrame is defined, then apply the year transformation.
    target_types = (
        ('FirstName', StringType()),
        ('LastName', StringType()),
        ('Age', IntegerType()),
        ('City', StringType()),
        ('Year', IntegerType()),
    )

    spark_df2 = spark.createDataFrame(
        data2,
        [column_name for column_name, _ in target_types]
    )
    # Cast the columns one by one, in the same order as they are declared.
    for column_name, column_type in target_types:
        spark_df2 = spark_df2.withColumn(
            column_name, col(column_name).cast(column_type)
        )

    # Year is an integer by now, so adding 1900 keeps it an integer.
    spark_df2 = spark_df2.withColumn(
        'Year',
        when(length(col('Year')) == 2, col('Year') + 1900).otherwise(col('Year'))
    )

# +---------+--------+---+-----------+----+
# |FirstName|LastName|Age|       City|Year|
# +---------+--------+---+-----------+----+
# |     John|     Doe| 28|   New York|2006|
# |     Jane|   Smith| 34|Los Angeles|2010|
# |      Sam|   Brown| 22|    Chicago|1984|
# |    Emily|   Davis| 29|    Houston|1996|
# |  Michael| Johnson| 31|    Phoenix|1998|
# |     Jane|   Smith| 34|Los Angeles|2010|
# +---------+--------+---+-----------+----+

    # Adding and removing columns
    # Add a DateOfBirth column by joining Year, Month and Day with '-' and
    # parsing the resulting text as a date.
    # The result is assigned and shown: a bare withColumn(...) expression
    # only builds a new DataFrame, which would otherwise be discarded
    # without producing any output.
    # NOTE(review): concat_ws does not zero-pad, so a month or day below 10
    # would yield e.g. '2006-1-5'; confirm 'yyyy-MM-dd' parses that on the
    # Spark version in use before feeding such data.

    date_of_birth_df = date_rectified_df.withColumn(
        'DateOfBirth',
        to_date(
            concat_ws(
                '-',
                col('Year'),
                col('Month'),
                col('Day')
            ),
            'yyyy-MM-dd'
        )
    )
    date_of_birth_df.show()

# Create a quick DataFrame
# Make the year uniform (four digits)
# Drop the duplicates

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

if __name__ == '__main__':

    # Start (or reuse) a Spark session named 'mis transformations' with the
    # master set to local[*].
    spark = SparkSession.builder.appName('mis transformations').master('local[*]').getOrCreate()

    # Sample records: (FirstName, LastName, Age, City, Year, Day, Month).
    # The two 'Jane Smith' rows are identical, which gives the duplicate
    # removal further below something to drop.
    data = [
        ('John', 'Doe', 28, 'New York', 2006, 12, 12),
        ('Jane', 'Smith', 34, 'Los Angeles', 2010, 30, 12),
        ('Sam', 'Brown', 22, 'Chicago', 84, 20, 12),
        ('Emily', 'Davis', 29, 'Houston', 96, 25, 12),
        ('Michael', 'Johnson', 31, 'Phoenix', 98, 31, 12),
        ('Jane', 'Smith', 34, 'Los Angeles', 2010, 30, 12),
    ]

    # Explicit schema so every column gets its intended type up front
    # instead of relying on type inference.
    schema = (
        StructType()
        .add('FirstName', StringType())
        .add('LastName', StringType())
        .add('Age', IntegerType())
        .add('City', StringType())
        .add('Year', IntegerType())
        .add('Day', IntegerType())
        .add('Month', IntegerType())
    )

    spark_df = spark.createDataFrame(data, schema=schema)

    # Rectify the year: a two-digit year is assumed to belong to the 1900s,
    # so 1900 is added to it; any other year is kept as it is.
    two_digit_year = length(col('Year')) == 2

    date_rectified_df = spark_df.withColumn(
        'Year',
        when(two_digit_year, col('Year') + 1900).otherwise(col('Year')),
    )

    # Add a unique id as the first column and keep only the identity columns
    # (Day and Month are not selected).
    # NOTE: monotonically_increasing_id() produces unique, increasing ids,
    # but they are not guaranteed to be consecutive.

    kept_columns = ['FirstName', 'LastName', 'Age', 'City', 'Year']
    unique_id_df = date_rectified_df.select(
        monotonically_increasing_id().alias('UniqueID'),
        *kept_columns
    )

    # Demonstrating a type-casting pitfall.
    # Type casting matters when data is inherited from files with issues.
    # Defining the schema together with the DataFrame (as done for spark_df)
    # avoids it; here no schema is given and Year arrives as a string.

    data2 = [
        ('John', 'Doe', 28, 'New York', '2006'),
        ('Jane', 'Smith', 34, 'Los Angeles', '2010'),
        ('Sam', 'Brown', 22, 'Chicago', '84'),
        ('Emily', 'Davis', 29, 'Houston', '96'),
        ('Michael', 'Johnson', 31, 'Phoenix', '98'),
        ('Jane', 'Smith', 34, 'Los Angeles', '2010'),
    ]

    # Deliberately no cast: adding 1900 to the string column is what
    # produces the '1984.0'-style values shown in the table below.
    raw_df = spark.createDataFrame(
        data2, ['FirstName', 'LastName', 'Age', 'City', 'Year']
    )
    spark_df2 = raw_df.withColumn(
        'Year',
        when(length(col('Year')) == 2, col('Year') + 1900).otherwise(col('Year')),
    )
#+---------+--------+---+-----------+------+
#|FirstName|LastName|Age|       City|  Year|
#+---------+--------+---+-----------+------+
#|     John|     Doe| 28|   New York|  2006|
#|     Jane|   Smith| 34|Los Angeles|  2010|
#|      Sam|   Brown| 22|    Chicago|1984.0|
#|    Emily|   Davis| 29|    Houston|1996.0|
#|  Michael| Johnson| 31|    Phoenix|1998.0|
#|     Jane|   Smith| 34|Los Angeles|  2010|
#+---------+--------+---+-----------+------+
    # This happened because of the incorrect datatype combined with Spark's automatic type promotion.
    # Year is a string; when we performed the arithmetic operation, the Spark SQL engine promoted it to a floating-point number, and after the operation the column was converted back to a string without the decimals being removed.
    # The solution is to cast explicitly.

# Two ways:
    # First approach: cast within the transformation so Spark doesn't automatically promote or demote the datatype

    # Approach 1: cast inside the transformation.
    # BOTH branches of when/otherwise are cast to integer. If only the
    # arithmetic branch were cast, the otherwise branch would still be a
    # string and Spark would coerce the whole column to a common
    # non-integer type, so Year would not end up as an integer column
    # even though show() prints no decimals.
    spark_df2 = spark.createDataFrame(
        data2,
        ['FirstName','LastName','Age','City','Year']
        ).withColumn(
            'Year',
            when (
                length(col('Year')) == 2,
                col('Year').cast(IntegerType()) + 1900
            ).otherwise(
                col('Year').cast(IntegerType())
            )
        )

# +---------+--------+---+-----------+----+
# |FirstName|LastName|Age|       City|Year|
# +---------+--------+---+-----------+----+
# |     John|     Doe| 28|   New York|2006|
# |     Jane|   Smith| 34|Los Angeles|2010|
# |      Sam|   Brown| 22|    Chicago|1984|
# |    Emily|   Davis| 29|    Houston|1996|
# |  Michael| Johnson| 31|    Phoenix|1998|
# |     Jane|   Smith| 34|Los Angeles|2010|
# +---------+--------+---+-----------+----+
    # You don't see the decimals anymore
    # Recommended approach

    # Second approach: fix the schema by casting every column right after
    # the DataFrame is defined, then apply the year transformation.
    target_types = (
        ('FirstName', StringType()),
        ('LastName', StringType()),
        ('Age', IntegerType()),
        ('City', StringType()),
        ('Year', IntegerType()),
    )

    spark_df2 = spark.createDataFrame(
        data2,
        [column_name for column_name, _ in target_types]
    )
    # Cast the columns one by one, in the same order as they are declared.
    for column_name, column_type in target_types:
        spark_df2 = spark_df2.withColumn(
            column_name, col(column_name).cast(column_type)
        )

    # Year is an integer by now, so adding 1900 keeps it an integer.
    spark_df2 = spark_df2.withColumn(
        'Year',
        when(length(col('Year')) == 2, col('Year') + 1900).otherwise(col('Year'))
    )

# +---------+--------+---+-----------+----+
# |FirstName|LastName|Age|       City|Year|
# +---------+--------+---+-----------+----+
# |     John|     Doe| 28|   New York|2006|
# |     Jane|   Smith| 34|Los Angeles|2010|
# |      Sam|   Brown| 22|    Chicago|1984|
# |    Emily|   Davis| 29|    Houston|1996|
# |  Michael| Johnson| 31|    Phoenix|1998|
# |     Jane|   Smith| 34|Los Angeles|2010|
# +---------+--------+---+-----------+----+

    # Adding and removing columns, and dropping duplicates.
    # Build DateOfBirth by joining Year, Month and Day with '-' and parsing
    # the resulting text as a date.
    date_of_birth = to_date(
        concat_ws('-', col('Year'), col('Month'), col('Day')),
        'yyyy-MM-dd',
    )
    with_birth_date_df = date_rectified_df.withColumn('DateOfBirth', date_of_birth)

    # The separate date parts are no longer needed once the date exists.
    without_parts_df = with_birth_date_df.drop('Year', 'Month', 'Day')

    # Rows sharing both FirstName and DateOfBirth are treated as duplicates.
    deduplicated_df = without_parts_df.dropDuplicates(['FirstName', 'DateOfBirth'])

    # Show the result ordered by first name, then by date of birth.
    deduplicated_df.sort(
        [col('FirstName').asc(), col('DateOfBirth').asc()]
    ).show()

# Concatenated transformation result
# +---------+--------+---+-----------+----+---+-----+-----------+
# |FirstName|LastName|Age|       City|Year|Day|Month|DateOfBirth|
# +---------+--------+---+-----------+----+---+-----+-----------+
# |     John|     Doe| 28|   New York|2006| 12|   12| 2006-12-12|
# |     Jane|   Smith| 34|Los Angeles|2010| 15|   12| 2010-12-15|
# |      Sam|   Brown| 22|    Chicago|1984| 20|   12| 1984-12-20|
# |    Emily|   Davis| 29|    Houston|1996| 25|   12| 1996-12-25|
# |  Michael| Johnson| 31|    Phoenix|1998| 31|   12| 1998-12-31|
# |     Jane|   Smith| 34|Los Angeles|2010| 30|   12| 2010-12-30|
# +---------+--------+---+-----------+----+---+-----+-----------+

#Drop Column and Duplicate results
# +---------+--------+---+-----------+-----------+
# |FirstName|LastName|Age|       City|DateOfBirth|
# +---------+--------+---+-----------+-----------+
# |    Emily|   Davis| 29|    Houston| 1996-12-25|
# |     Jane|   Smith| 34|Los Angeles| 2010-12-30|
# |     John|     Doe| 28|   New York| 2006-12-12|
# |  Michael| Johnson| 31|    Phoenix| 1998-12-31|
# |      Sam|   Brown| 22|    Chicago| 1984-12-20|
# +---------+--------+---+-----------+-----------+

# Sorted by first name and date of birth
# +---------+--------+---+-----------+-----------+
# |FirstName|LastName|Age|       City|DateOfBirth|
# +---------+--------+---+-----------+-----------+
# |    Emily|   Davis| 29|    Houston| 1996-12-25|
# |     Jane|   Smith| 34|Los Angeles| 2010-12-30|
# |     John|     Doe| 28|   New York| 2006-12-12|
# |  Michael| Johnson| 31|    Phoenix| 1998-12-31|
# |      Sam|   Brown| 22|    Chicago| 1984-12-20|
# +---------+--------+---+-----------+-----------+


+---------+--------+---+-----------+-----------+
|FirstName|LastName|Age|       City|DateOfBirth|
+---------+--------+---+-----------+-----------+
|    Emily|   Davis| 29|    Houston| 1996-12-25|
|     Jane|   Smith| 34|Los Angeles| 2010-12-30|
|     John|     Doe| 28|   New York| 2006-12-12|
|  Michael| Johnson| 31|    Phoenix| 1998-12-31|
|      Sam|   Brown| 22|    Chicago| 1984-12-20|
+---------+--------+---+-----------+-----------+

