In [0]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain a Spark session running locally on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('ProjectFirst')
    .getOrCreate()
)

# One tuple per person: (firstname, middlename, lastname, id, gender, salary).
# Missing name parts are represented by empty strings, not None.
data = [
    ("James", "", "William", "36636", "M", 3000),
    ("Michael", "Smith", "", "40288", "M", 4000),
    ("Robert", "", "Dawson", "42114", "M", 4000),
    ("Maria", "", "Jones", "39192", "F", 4000),
]

# Explicit schema: five nullable string columns (the id is kept as a string)
# followed by a nullable integer salary column.
schema = StructType(
    [
        StructField(name, StringType(), True)
        for name in ("firstname", "middlename", "lastname", "id", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

df = spark.createDataFrame(data=data, schema=schema)

# Print the column types, then every row without truncating cell contents.
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |William |36636|M     |3000  |
|Michael  |Smith     |        |40288|M     |4000  |
|Robert   |          |Dawson  |42114|M     |4000  |
|Maria    |          |Jones   |39192|F     |4000  |
+---------+----------+--------+-----+------+------+



In [0]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Obtain a Spark session running locally on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('ProjectSecond')
    .getOrCreate()
)

# (employee_name, department, salary) rows. The ("James", "Sales", 3000)
# row appears twice on purpose so the data contains an exact duplicate.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Dogu", "Sales", 4100),
]

# Only column names are supplied, so Spark infers the column types itself.
column = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=data, schema=column)

# Print the inferred schema, then every row without truncating cell contents.
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Dogu         |Sales     |4100  |
+-------------+----------+------+



In [0]:
# distinct(): keep one copy of every row that is identical across all columns.
distinctDF = df.distinct()
print(f"Distinct Count: {distinctDF.count()}")
distinctDF.show(truncate=False)

# dropDuplicates() with no arguments: compares all columns as well.
df2 = df.dropDuplicates()
print(f"Distinct Count: {df2.count()}")
df2.show(truncate=False)

# dropDuplicates([...]): rows are duplicates when only the listed columns
# (department and salary) match; other columns are not compared.
dropDisDF = df.dropDuplicates(["department", "salary"])
print(f"Distinct Count: {dropDisDF.count()}")
dropDisDF.show(truncate=False)

Distinct Count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|Jen          |Finance   |3900  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|Jeff         |Marketing |3000  |
|Dogu         |Sales     |4100  |
+-------------+----------+------+

Distinct Count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|Jen          |Finance   |3900  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|Jeff         |Marketing |3000  |
|Dogu         |Sales     |4100  |
+-------------+----------+------+

Distinct Count: 8
+-------------+----------+------+
|employee_name|department|

In [0]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain a Spark session running locally on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('ProjectFirst')
    .getOrCreate()
)

# One tuple per person: (firstname, middlename, lastname, id, gender, salary).
data = [
    ("James", "", "William", "36636", "M", 3000),
    ("Michael", "Smith", "", "40288", "M", 4000),
    ("Robert", "", "Dawson", "42114", "M", 4000),
    ("Maria", "", "Jones", "39192", "F", 4000),
]

# Explicit schema: nullable string columns for the names, id and gender,
# and a nullable integer column for the salary.
schema = StructType(
    [
        StructField(name, StringType(), True)
        for name in ("firstname", "middlename", "lastname", "id", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

# Bind the person DataFrame to `df`.
df = spark.createDataFrame(data=data, schema=schema)

# Print the column types, then every row without truncating cell contents.
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |William |36636|M     |3000  |
|Michael  |Smith     |        |40288|M     |4000  |
|Robert   |          |Dawson  |42114|M     |4000  |
|Maria    |          |Jones   |39192|F     |4000  |
+---------+----------+--------+-----+------+------+



In [0]:
# Convert the Spark DataFrame bound to `df` into a pandas DataFrame.
# NOTE(review): toPandas() brings the whole result into local memory, so this
# is presumably only intended for small frames like the one used here - confirm.
PandasDF = df.toPandas()
# Print the pandas DataFrame's plain-text representation.
print(PandasDF)

  firstname middlename lastname     id gender  salary
0     James             William  36636      M    3000
1   Michael      Smith           40288      M    4000
2    Robert              Dawson  42114      M    4000
3     Maria               Jones  39192      F    4000


In [0]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# (Product, Amount, Country) rows. ("Orange", 2000, "USA") is listed twice.
data = [
    ("Banana", 1000, "USA"),
    ("Carrots", 1500, "USA"),
    ("Beans", 1600, "USA"),
    ("Orange", 2000, "USA"),
    ("Orange", 2000, "USA"),
    ("Banana", 4000, "China"),
    ("Carrots", 1200, "China"),
    ("Beans", 1500, "China"),
    ("Orange", 4000, "China"),
    ("Banana", 2000, "Canada"),
    ("Carrots", 2000, "Canada"),
    ("Beans", 2000, "Mexico"),
]

columns = ['Product', 'Amount', 'Country']

# No session is created in this cell; `spark` must already exist in the kernel.
# Only column names are supplied, so Spark infers the column types itself.
df = spark.createDataFrame(data=data, schema=columns)

# Print the inferred schema, then every row without truncating cell contents.
df.printSchema()
df.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |4000  |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [0]:
# Reshape to one row per Product and one column per distinct Country value,
# where each cell holds the summed Amount for that Product/Country pair.
pivotDF = (
    df.groupBy("Product")
    .pivot("Country")
    .sum("Amount")
)

# Print the resulting column types, then every row without truncation.
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |4000 |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+



In [0]:
from pyspark.sql import SparkSession

# Obtain a Spark session running locally on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('ProjectThird')
    .getOrCreate()
)

# Load the "advertising" table from its Delta location.
# The previous version also passed `header` and `inferschema` (and `header`
# again as a load() keyword). Those are CSV reader options; a Delta table is
# read with the schema stored alongside the data, so they are dropped here to
# avoid suggesting that the schema is being inferred from a header row.
df = spark.read.format('delta').load("/user/hive/warehouse/advertising")

# Preview the first five rows, then print the column types.
df.show(5)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [0]:
# RDD creation: parallelize a local list into an RDD, then convert it to a DataFrame

from pyspark.sql import SparkSession

# Obtain a Spark session running locally on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('ProjectRDDCreation')
    .getOrCreate()
)

# Distribute three local tuples as an RDD through the session's SparkContext,
# then turn that RDD into a DataFrame with the given column names.
df = (
    spark.sparkContext
    .parallelize([
        (1, 2, 3, 'a b c'),
        (4, 5, 6, 'd e f'),
        (7, 8, 9, 'g h i'),
    ])
    .toDF(['col1', 'col2', 'col3', 'col4'])
)

# Display the rows using show()'s default formatting.
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [0]:
#Transformations & Actions