In [2]:
import pandas as pd

!pip install pyspark

from pyspark.sql import SparkSession
# Create a SparkSession (without a specified name)
spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True) #for simple calls and better display


Collecting pyspark
  Downloading pyspark-3.5.4.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.4-py2.py3-none-any.whl size=317849765 sha256=637049e986747eafb73f130b715dcf2c1198c83c08408793d14221e382b4cd50
  Stored in directory: /root/.cache/pip/wheels/d9/1c/98/31e395a42d1735d18d42124971ecbbade844b50bb9845b6f4a
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.4


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/31 17:18:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Understanding some aspects of Pandas groupby

In [3]:
# testing pandas groupby functionality

import pandas as pd
technologies   = ({
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas","Hadoop","Spark","Python","NA"],
    'Fee' :[22000,25000,23000,24000,26000,25000,25000,22000,1500],
    'Duration':['30days','50days','55days','40days','60days','35days','30days','50days','40days'],
    'Discount':[1000,2300,1000,1200,2500,None,1400,1600,0]
          })
df = pd.DataFrame(technologies)
print("Create DataFrame:\n", df)


Create DataFrame:
    Courses    Fee Duration  Discount
0    Spark  22000   30days    1000.0
1  PySpark  25000   50days    2300.0
2   Hadoop  23000   55days    1000.0
3   Python  24000   40days    1200.0
4   Pandas  26000   60days    2500.0
5   Hadoop  25000   35days       NaN
6    Spark  25000   30days    1400.0
7   Python  22000   50days    1600.0
8       NA   1500   40days       0.0


In [4]:
df.groupby("Courses").sum().reset_index() # here the sum is applied on the all the remaining cols

Unnamed: 0,Courses,Fee,Duration,Discount
0,Hadoop,48000,55days35days,1000.0
1,,1500,40days,0.0
2,Pandas,26000,60days,2500.0
3,PySpark,25000,50days,2300.0
4,Python,46000,40days50days,2800.0
5,Spark,47000,30days30days,2400.0


In [5]:
df.groupby("Courses")["Fee"].sum().reset_index() # here the sum is only applied on the column mentioned in []

Unnamed: 0,Courses,Fee
0,Hadoop,48000
1,,1500
2,Pandas,26000
3,PySpark,25000
4,Python,46000
5,Spark,47000


In [6]:
df.groupby("Courses")[["Fee","Discount"]].sum().reset_index() # here the sum is only applied on the columns mentioned in []

Unnamed: 0,Courses,Fee,Discount
0,Hadoop,48000,1000.0
1,,1500,0.0
2,Pandas,26000,2500.0
3,PySpark,25000,2300.0
4,Python,46000,2800.0
5,Spark,47000,2400.0


In [7]:
df.groupby("Courses").agg({"Fee":"sum"}).reset_index() 
# note the use of dict's {} in the agg function

Unnamed: 0,Courses,Fee
0,Hadoop,48000
1,,1500
2,Pandas,26000
3,PySpark,25000
4,Python,46000
5,Spark,47000


In [8]:
df.groupby("Courses").agg(cust_col_name=("Fee","sum")).reset_index() 
# note the use of () in the agg function to set the name

Unnamed: 0,Courses,cust_col_name
0,Hadoop,48000
1,,1500
2,Pandas,26000
3,PySpark,25000
4,Python,46000
5,Spark,47000


# Problem 11

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Initialize Spark session
spark = SparkSession.builder.appName("EmployeeBonus").getOrCreate()

# Define schema for Employee DataFrame
employee_schema = StructType([
    StructField("empId", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("supervisor", IntegerType(), True),
    StructField("salary", IntegerType(), True)
])

# Sample data for Employee
employee_data = [
    (3, "Brad", None, 4000),
    (1, "John", 3, 1000),
    (2, "Dan", 3, 2000),
    (4, "Thomas", 3, 4000)
]

# Create Employee DataFrame
employee_df = spark.createDataFrame(employee_data, schema=employee_schema)

bonus_schema = StructType([
    StructField("empId", IntegerType(), True),
    StructField("bonus", IntegerType(), True)
])

# Sample data for Bonus
bonus_data = [
    (2, 500),
    (4, 2000)
]

# Create Bonus DataFrame
bonus_df = spark.createDataFrame(bonus_data, schema=bonus_schema)
bonus_df.show()
employee_df.show()

25/01/31 17:18:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+-----+-----+
|empId|bonus|
+-----+-----+
|    2|  500|
|    4| 2000|
+-----+-----+

+-----+------+----------+------+
|empId|  name|supervisor|salary|
+-----+------+----------+------+
|    3|  Brad|      NULL|  4000|
|    1|  John|         3|  1000|
|    2|   Dan|         3|  2000|
|    4|Thomas|         3|  4000|
+-----+------+----------+------+



## PySpark

In [10]:
from pyspark.sql.functions import col
employee_df.join(bonus_df,bonus_df.empId == employee_df.empId,"left").filter( (col("bonus") < 1000) | (col("bonus").isNull()))

                                                                                

empId,name,supervisor,salary,empId.1,bonus
3,Brad,,4000,,
1,John,3.0,1000,,
2,Dan,3.0,2000,2.0,500.0


## Pandas

In [11]:
import pandas as pd

pd_emp = employee_df.toPandas()
pd_bonus = bonus_df.toPandas()

pd_res = pd_emp.merge(pd_bonus,on="empId",how="left")
pd_res[ (pd_res["bonus"]<1000) | (pd_res["bonus"].isna()) ]

                                                                                

Unnamed: 0,empId,name,supervisor,salary,bonus
0,3,Brad,,4000,
1,1,John,3.0,1000,
2,2,Dan,3.0,2000,500.0


# Problem 12
Each student from the Students table takes every course from the Subjects table.
Each row of this table indicates that a student with ID student_id attended the exam of subject_name. 
Write a solution to find the number of times each student attended each exam.
Return the result table ordered by student_id and subject_name.

The result table should contain all students and all subjects.

In [12]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("Students Example").getOrCreate()

# Create the Students DataFrame
students_data = [
    (1, "Alice"),
    (2, "Bob"),
    (13, "John"),
    (6, "Alex")
]

students_columns = ["student_id", "student_name"]
st_df = spark.createDataFrame(students_data, schema=students_columns)

# Create the Subjects DataFrame
subjects_data = [
    ("Math",),
    ("Physics",),
    ("Programming",)
]

subjects_columns = ["subject_name"]
sub_df = spark.createDataFrame(subjects_data, schema=subjects_columns)

# Create the Examinations DataFrame
examinations_data = [
    (1, "Math"),
    (1, "Physics"),
    (1, "Programming"),
    (2, "Programming"),
    (1, "Physics"),
    (1, "Math"),
    (13, "Math"),
    (13, "Programming"),
    (13, "Physics"),
    (2, "Math"),
    (1, "Math")
]

examinations_columns = ["student_id", "subject_name"]
exam_df = spark.createDataFrame(examinations_data, schema=examinations_columns)


st_df
sub_df
exam_df

25/01/31 17:18:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


student_id,subject_name
1,Math
1,Physics
1,Programming
2,Programming
1,Physics
1,Math
13,Math
13,Programming
13,Physics
2,Math


#### TESTING MYSELF after long time, since I don't completely remember all the topics

In [13]:
from pyspark.sql.functions import count
# st_df
# sub_df
# exam_df
res = st_df.join(sub_df,how="cross")
st_sub_cross = res.orderBy("student_id","subject_name")
st_sub_cross.show()

# Below both give the same result
# exam_grp =exam_df.groupBy("student_id","subject_name").count()
exam_grp =exam_df.groupBy("student_id","subject_name").agg(count(col("*")).alias("cnt"))

exam_grp.show()

# the below is wrong because col() does not support dot notation without alias
# res = st_sub_cross.join(exam_grp,col("exam_grp.student_id")==col("st_sub_cross.student_id"),how='left')

# the below is RIGHT and works because col() supports dot notation WITH alias
# res = st_sub_cross.alias('st_sub_cross').join(exam_grp.alias('exam_grp'),col("exam_grp.student_id")==col("st_sub_cross.student_id"),how='left')

# the below works fine without any issue, but we won't be able to have access to ambiguous column names using this method
# res = st_sub_cross.join(exam_grp,\
#                         (exam_grp.student_id==st_sub_cross.student_id) & ( exam_grp.subject_name==st_sub_cross.subject_name ),\
#                         how='left')

# The below won't work as we can't access ambiguous column names present in both sides of a join using the table name .(dot) prefix
# If we want to access the ambiguous column names, then we HAVE to alias them before hand
# # res = res.selectExpr("student_id as st_sub_student_id","exam_grp.student_id")


res = st_sub_cross.alias('st_sub_cross').join(exam_grp.alias('exam_grp'),\
                        (col("exam_grp.student_id")==col("st_sub_cross.student_id")) & ( col("exam_grp.subject_name")==col("st_sub_cross.subject_name") ),\
                        how='left')

# in this below command we can specifically see that using alias gives us special access to ambiguous column names
res.selectExpr("st_sub_cross.*","COALESCE(cnt, 0) AS cnt_1").show()

                                                                                

+----------+------------+------------+
|student_id|student_name|subject_name|
+----------+------------+------------+
|         1|       Alice|        Math|
|         1|       Alice|     Physics|
|         1|       Alice| Programming|
|         2|         Bob|        Math|
|         2|         Bob|     Physics|
|         2|         Bob| Programming|
|         6|        Alex|        Math|
|         6|        Alex|     Physics|
|         6|        Alex| Programming|
|        13|        John|        Math|
|        13|        John|     Physics|
|        13|        John| Programming|
+----------+------------+------------+

+----------+------------+---+
|student_id|subject_name|cnt|
+----------+------------+---+
|         1|     Physics|  2|
|         1|        Math|  3|
|         1| Programming|  1|
|         2| Programming|  1|
|        13|        Math|  1|
|        13| Programming|  1|
|         2|        Math|  1|
|        13|     Physics|  1|
+----------+------------+---+



                                                                                

+----------+------------+------------+-----+
|student_id|student_name|subject_name|cnt_1|
+----------+------------+------------+-----+
|         1|       Alice|        Math|    3|
|         1|       Alice|     Physics|    2|
|         1|       Alice| Programming|    1|
|         2|         Bob|        Math|    1|
|         2|         Bob|     Physics|    0|
|         2|         Bob| Programming|    1|
|        13|        John|        Math|    1|
|        13|        John|     Physics|    1|
|        13|        John| Programming|    1|
|         6|        Alex|        Math|    0|
|         6|        Alex|     Physics|    0|
|         6|        Alex| Programming|    0|
+----------+------------+------------+-----+



## PySpark

In [14]:
from pyspark.sql.functions import when,sum

st_sub = st_df.crossJoin(sub_df)
st_sub.orderBy("student_id","subject_name")

df = st_sub.alias('a').join( exam_df.alias('b'), (exam_df.student_id == st_sub.student_id ) & ( exam_df.subject_name == st_sub.subject_name ),"left")\
    .drop(exam_df.student_id)

df_2 = df.select("*", when(col("b.subject_name").isNull(), 0).otherwise(1).alias("cnt"))
df_2.show()
df_3 = df.withColumn("cnt",when(col("b.subject_name").isNull(),0).otherwise(1))
df_3.show()
df_3.groupBy(col("student_id"),col("a.subject_name")).agg(sum("cnt")).orderBy("student_id","subject_name")

                                                                                

+----------+------------+------------+------------+---+
|student_id|student_name|subject_name|subject_name|cnt|
+----------+------------+------------+------------+---+
|         1|       Alice|        Math|        Math|  1|
|         1|       Alice|        Math|        Math|  1|
|         1|       Alice|        Math|        Math|  1|
|         1|       Alice|     Physics|     Physics|  1|
|         1|       Alice|     Physics|     Physics|  1|
|         1|       Alice| Programming| Programming|  1|
|         2|         Bob|        Math|        Math|  1|
|         2|         Bob|     Physics|        NULL|  0|
|         2|         Bob| Programming| Programming|  1|
|        13|        John|        Math|        Math|  1|
|        13|        John|     Physics|     Physics|  1|
|        13|        John| Programming| Programming|  1|
|         6|        Alex|        Math|        NULL|  0|
|         6|        Alex|     Physics|        NULL|  0|
|         6|        Alex| Programming|        NU

                                                                                

+----------+------------+------------+------------+---+
|student_id|student_name|subject_name|subject_name|cnt|
+----------+------------+------------+------------+---+
|         1|       Alice|        Math|        Math|  1|
|         1|       Alice|        Math|        Math|  1|
|         1|       Alice|        Math|        Math|  1|
|         1|       Alice|     Physics|     Physics|  1|
|         1|       Alice|     Physics|     Physics|  1|
|         1|       Alice| Programming| Programming|  1|
|         2|         Bob|        Math|        Math|  1|
|         2|         Bob|     Physics|        NULL|  0|
|         2|         Bob| Programming| Programming|  1|
|        13|        John|        Math|        Math|  1|
|        13|        John|     Physics|     Physics|  1|
|        13|        John| Programming| Programming|  1|
|         6|        Alex|        Math|        NULL|  0|
|         6|        Alex|     Physics|        NULL|  0|
|         6|        Alex| Programming|        NU

                                                                                

student_id,subject_name,sum(cnt)
1,Math,3
1,Physics,2
1,Programming,1
2,Math,1
2,Physics,0
2,Programming,1
6,Math,0
6,Physics,0
6,Programming,0
13,Math,1


In [15]:
from pyspark.sql.functions import col, count

st_sub = st_df.crossJoin(sub_df).orderBy("student_id", "subject_name")

df = st_sub.alias('a').join(exam_df.alias('b'),
                            (col("a.student_id") == col("b.student_id")) & (col("a.subject_name") == col("b.subject_name")),
                            "left")

result = df.groupBy("a.student_id", "a.subject_name").agg(count(col("b.subject_name")).alias("attendance_count")).orderBy("student_id", "subject_name")
result

                                                                                

student_id,subject_name,attendance_count
1,Math,3
1,Physics,2
1,Programming,1
2,Math,1
2,Physics,0
2,Programming,1
6,Math,0
6,Physics,0
6,Programming,0
13,Math,1


## Pandas

In [16]:
st_pdf = st_df.toPandas()
sub_pdf = sub_df.toPandas()
exam_pdf = exam_df.toPandas()

st_sub_pdf = st_pdf.merge(sub_pdf,how="cross")

exam_pdf.rename(columns={"student_id":"ex_student_id","subject_name":"ex_subject_name"},inplace=True)

pdf = pd.merge(st_sub_pdf,exam_pdf, 
                       left_on=["student_id", "subject_name"], 
                       right_on=["ex_student_id", "ex_subject_name"], 
                       how="left")

pdf.groupby(["student_id","student_name","subject_name"]).agg({"ex_subject_name":"count"}).rename(columns={"ex_subject_name":"attended_exams"}).reset_index()

Unnamed: 0,student_id,student_name,subject_name,attended_exams
0,1,Alice,Math,3
1,1,Alice,Physics,2
2,1,Alice,Programming,1
3,2,Bob,Math,1
4,2,Bob,Physics,0
5,2,Bob,Programming,1
6,6,Alex,Math,0
7,6,Alex,Physics,0
8,6,Alex,Programming,0
9,13,John,Math,1


# Problem 13
Write a solution to find managers with at least five direct reports.

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Create Spark session
spark = SparkSession.builder \
    .appName("Employee Manager Example") \
    .getOrCreate()

# Define the schema for the Employee table
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("managerId", IntegerType(), True)
])

# Create the data for the Employee table
data = [
    (101, "John", "A", None),
    (102, "Dan", "A", 101),
    (103, "James", "A", 101),
    (104, "Amy", "A", 101),
    (105, "Anne", "A", 101),
    (106, "Ron", "B", 101)
]

# Create DataFrame using the schema and data
employee_df = spark.createDataFrame(data, schema)

# Show the DataFrame (to see if it's created correctly)
employee_df.show()


25/01/31 17:19:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+---+-----+----------+---------+
| id| name|department|managerId|
+---+-----+----------+---------+
|101| John|         A|     NULL|
|102|  Dan|         A|      101|
|103|James|         A|      101|
|104|  Amy|         A|      101|
|105| Anne|         A|      101|
|106|  Ron|         B|      101|
+---+-----+----------+---------+



## PySpark

In [18]:
df = employee_df
df_gp = df.groupby("managerId").count().filter("count >= 5")
df.alias('a').join(df_gp.alias('b'),col("b.managerId") == col("a.id"),"inner").select("name")

                                                                                

name
John


## Pandas

In [19]:
pdf = employee_df.toPandas()
pdf_gp = pdf.groupby("managerId").size().reset_index(name="cnt") # this will create the new column with the given name
# .rename(columns={"0":"cnt"})
pdf_gp = pdf_gp[pdf_gp["cnt"]>=5]
print(pdf_gp.columns)

pdf.merge(pdf_gp,left_on="id",right_on="managerId",how='inner')["name"].to_frame()

Index(['managerId', 'cnt'], dtype='object')


Unnamed: 0,name
0,John


# Problem 14

The confirmation rate of a user is the number of 'confirmed' messages divided by the total number of requested confirmation messages. The confirmation rate of a user that did not request any confirmation messages is 0. Round the confirmation rate to two decimal places.

Write a solution to find the confirmation rate of each user.

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, StringType
from datetime import datetime

# Create a Spark session
spark = SparkSession.builder \
    .appName("Signup Confirmations") \
    .getOrCreate()

# Schema for the Signups table
signups_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("time_stamp", TimestampType(), True)
])

# Data for the Signups table (converting strings to datetime objects)
signups_data = [
    (3, datetime(2020, 3, 21, 10, 16, 13)),
    (7, datetime(2020, 1, 4, 13, 57, 59)),
    (2, datetime(2020, 7, 29, 23, 9, 44)),
    (6, datetime(2020, 12, 9, 10, 39, 37))
]

# Create Signups DataFrame
signups_df = spark.createDataFrame(signups_data, schema=signups_schema)

# Schema for the Confirmations table
confirmations_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("time_stamp", TimestampType(), True),
    StructField("action", StringType(), True)  # ENUM types can be treated as strings
])

# Data for the Confirmations table (converting strings to datetime objects)
confirmations_data = [
    (3, datetime(2021, 1, 6, 3, 30, 46), "timeout"),
    (3, datetime(2021, 7, 14, 14, 0, 0), "timeout"),
    (7, datetime(2021, 6, 12, 11, 57, 29), "confirmed"),
    (7, datetime(2021, 6, 13, 12, 58, 28), "confirmed"),
    (7, datetime(2021, 6, 14, 13, 59, 27), "confirmed"),
    (2, datetime(2021, 1, 22, 0, 0, 0), "confirmed"),
    (2, datetime(2021, 2, 28, 23, 59, 59), "timeout")
]

# Create Confirmations DataFrame
confirmations_df = spark.createDataFrame(confirmations_data, schema=confirmations_schema)

# Show the data for Signups and Confirmations
signups_df.show(truncate=False)
confirmations_df.show(truncate=False)


25/01/31 17:19:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-------+-------------------+
|user_id|time_stamp         |
+-------+-------------------+
|3      |2020-03-21 10:16:13|
|7      |2020-01-04 13:57:59|
|2      |2020-07-29 23:09:44|
|6      |2020-12-09 10:39:37|
+-------+-------------------+

+-------+-------------------+---------+
|user_id|time_stamp         |action   |
+-------+-------------------+---------+
|3      |2021-01-06 03:30:46|timeout  |
|3      |2021-07-14 14:00:00|timeout  |
|7      |2021-06-12 11:57:29|confirmed|
|7      |2021-06-13 12:58:28|confirmed|
|7      |2021-06-14 13:59:27|confirmed|
|2      |2021-01-22 00:00:00|confirmed|
|2      |2021-02-28 23:59:59|timeout  |
+-------+-------------------+---------+



In [21]:
from pyspark.sql.functions import expr,when,round,format_number
dfs = signups_df
dfc = confirmations_df
# dfc.show()

dfc_cnf = dfc.filter("action = 'confirmed'").groupby("user_id").agg(count('*').alias("cnt"))
dfc_cnf.show()

dfc_cnf_percent = dfc.alias("dfc").groupby("user_id").agg(count("*").alias("cnt_total"))\
    .join(dfc_cnf.alias("dfc_cnf"),col("dfc_cnf.user_id") == col("dfc.user_id"),"left")\
    .select("dfc.user_id",expr("COALESCE(cnt/cnt_total,0)").alias("conf_percent"))

dfc_cnf_percent.show()

res = dfs.alias("dfs").join(dfc_cnf_percent.alias("dfc_cnf_percent"),col("dfs.user_id")==col("dfc_cnf_percent.user_id"),"left")
res = res.select("dfs.user_id",round(when(col("conf_percent").isNull(),0).otherwise(col("conf_percent")),2).alias("cnf_percent"))
res.show()

# to force the dataframe to show 2 decimals we can use the format_number function in pyspark
res = res.select("dfs.user_id",format_number(col("cnf_percent"),2).alias("cnf_percent_forced_2_decimals"))
res.show()

+-------+---+
|user_id|cnt|
+-------+---+
|      7|  3|
|      2|  1|
+-------+---+



                                                                                

+-------+------------+
|user_id|conf_percent|
+-------+------------+
|      3|         0.0|
|      7|         1.0|
|      2|         0.5|
+-------+------------+



                                                                                

+-------+-----------+
|user_id|cnf_percent|
+-------+-----------+
|      2|        0.5|
|      3|        0.0|
|      6|        0.0|
|      7|        1.0|
+-------+-----------+



                                                                                

+-------+-----------------------------+
|user_id|cnf_percent_forced_2_decimals|
+-------+-----------------------------+
|      2|                         0.50|
|      3|                         0.00|
|      6|                         0.00|
|      7|                         1.00|
+-------+-----------------------------+



## PySpark

In [22]:
from pyspark.sql.functions import max
s_df = signups_df
c_df = confirmations_df

df_cnfrm = c_df.filter("action == 'confirmed'").groupBy("user_id").agg(count("user_id").alias("cnt_cnfrm"))

s_df.alias('a').join(c_df.alias('b'),col("a.user_id") == col("b.user_id"),"left")\
    .join(df_cnfrm.alias('c'),col("c.user_id")==col("a.user_id"),"left")\
    .groupBy("a.user_id").agg(count("*").alias("cnt_all"),max("cnt_cnfrm").alias("cnt_cnfrm"))\
    .fillna(0,subset=['cnt_cnfrm'])\
    .select("user_id",col("cnt_cnfrm")/col("cnt_all"))

                                                                                

user_id,(cnt_cnfrm / cnt_all)
3,0.0
7,1.0
2,0.5
6,0.0


## Pandas

In [23]:
signups = signups_df.toPandas()
confirmations = confirmations_df.toPandas()

df_total = confirmations.groupby('user_id')['action'].count().reset_index()
df_conf = confirmations[confirmations.action =='confirmed'].groupby('user_id')['action'].count().reset_index()
df = signups.merge(df_total, how = 'left').merge(df_conf , how = 'left', on = 'user_id')
df

df['confirmation_rate'] =  ((df.action_y)/ (df.action_x)).round(2)   
df.loc[:,["user_id","confirmation_rate"]].fillna(0)

Unnamed: 0,user_id,confirmation_rate
0,3,0.0
1,7,1.0
2,2,0.5
3,6,0.0


# Problem 15

Write a solution to report the movies with an odd-numbered ID and a description that is not "boring".
Return the result table ordered by rating in descending order.

In [24]:
from pyspark.sql.types import FloatType
# Define the schema for the Cinema table
cinema_schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("movie", StringType(), nullable=False),
    StructField("description", StringType(), nullable=False),
    StructField("rating", FloatType(), nullable=False)
])

# Create the data for the Cinema table
cinema_data = [
    (1, "War", "great 3D", 8.9),
    (2, "Science", "fiction", 8.5),
    (3, "irish", "boring", 6.2),
    (4, "Ice song", "Fantacy", 8.6),
    (5, "House card", "Interesting", 9.1)
]

# Create the DataFrame
cinema_df = spark.createDataFrame(data=cinema_data, schema=cinema_schema)

# Show the DataFrame
cinema_df.show()

+---+----------+-----------+------+
| id|     movie|description|rating|
+---+----------+-----------+------+
|  1|       War|   great 3D|   8.9|
|  2|   Science|    fiction|   8.5|
|  3|     irish|     boring|   6.2|
|  4|  Ice song|    Fantacy|   8.6|
|  5|House card|Interesting|   9.1|
+---+----------+-----------+------+



In [25]:
## Pyspark

from pyspark.sql.functions import like

# here the use of ~ is giving the inverse affect of LIKE and acting like "NOT LIKE"
dfc = cinema_df
dfc = dfc.filter(\
    (col("id")%2==1) & ( ~col("description").like("%bori%") ) \
).orderBy("rating",ascending=False)
dfc.show()

+---+----------+-----------+------+
| id|     movie|description|rating|
+---+----------+-----------+------+
|  5|House card|Interesting|   9.1|
|  1|       War|   great 3D|   8.9|
+---+----------+-----------+------+



# Problem 16

Write a solution to find the average selling price for each product. average_price should be rounded to 2 decimal places. If a product does not have any sold units, its average selling price is assumed to be 0.

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from datetime import datetime

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Create DataFrames") \
    .getOrCreate()

# Define schema for Prices table
prices_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("start_date", DateType(), True),
    StructField("end_date", DateType(), True),
    StructField("price", IntegerType(), True)
])

# Create Prices DataFrame
prices_data = [
    (1, datetime.strptime('2019-02-17', '%Y-%m-%d'), datetime.strptime('2019-02-28', '%Y-%m-%d'), 5),
    (1, datetime.strptime('2019-03-01', '%Y-%m-%d'), datetime.strptime('2019-03-22', '%Y-%m-%d'), 20),
    (2, datetime.strptime('2019-02-01', '%Y-%m-%d'), datetime.strptime('2019-02-20', '%Y-%m-%d'), 15),
    (2, datetime.strptime('2019-02-21', '%Y-%m-%d'), datetime.strptime('2019-03-31', '%Y-%m-%d'), 30),
    (3, datetime.strptime('2019-02-21', '%Y-%m-%d'), datetime.strptime('2019-03-31', '%Y-%m-%d'), 55)
]

prices_df = spark.createDataFrame(prices_data, schema=prices_schema)

# Define schema for UnitsSold table
units_sold_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("purchase_date", DateType(), True),
    StructField("units", IntegerType(), True)
])

# Create UnitsSold DataFrame
units_sold_data = [
    (1, datetime.strptime('2019-02-25', '%Y-%m-%d'), 100),
    (1, datetime.strptime('2019-03-01', '%Y-%m-%d'), 15),
    # (1, datetime.strptime('2019-03-20', '%Y-%m-%d'), 10),
    (2, datetime.strptime('2019-02-10', '%Y-%m-%d'), 200),
    (2, datetime.strptime('2019-03-22', '%Y-%m-%d'), 30)
]

units_sold_df = spark.createDataFrame(units_sold_data, schema=units_sold_schema)

# Show DataFrames
prices_df.show()
units_sold_df.show()

25/01/31 17:19:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----------+----------+----------+-----+
|product_id|start_date|  end_date|price|
+----------+----------+----------+-----+
|         1|2019-02-17|2019-02-28|    5|
|         1|2019-03-01|2019-03-22|   20|
|         2|2019-02-01|2019-02-20|   15|
|         2|2019-02-21|2019-03-31|   30|
|         3|2019-02-21|2019-03-31|   55|
+----------+----------+----------+-----+

+----------+-------------+-----+
|product_id|purchase_date|units|
+----------+-------------+-----+
|         1|   2019-02-25|  100|
|         1|   2019-03-01|   15|
|         2|   2019-02-10|  200|
|         2|   2019-03-22|   30|
+----------+-------------+-----+



In [27]:
dfu = units_sold_df
dfp = prices_df

res = dfp.alias("dfp").join(dfu.alias('dfu'),\
                            ( col("dfu.product_id")==col("dfp.product_id") ) &\
                            ( col("dfu.purchase_date") >= col("dfp.start_date") ) &\
                            ( col("dfu.purchase_date") <= col("dfp.end_date") )\
                            ,"outer")
res = res.withColumn("sales_price",col("units")*col("price")).groupby(col("dfp.product_id")).agg(sum(col("sales_price")).alias("sum_sales_price"),sum(col("units")).alias("sum_units"))
res = res.select("product_id",expr("FORMAT_NUMBER (ROUND ( coalesce(sum_sales_price/sum_units,0) , 2),2) as avg_sales_price")).orderBy("product_id")
res.show()

+----------+---------------+
|product_id|avg_sales_price|
+----------+---------------+
|         1|           6.96|
|         2|          16.96|
|         3|           0.00|
+----------+---------------+



# Problem 17
Write an SQL query that reports the average experience years of all the employees for each project, rounded to 2 digits.

In [28]:

# Define schema for Project table
project_schema = StructType([
    StructField("project_id", IntegerType(), True),
    StructField("employee_id", IntegerType(), True)
])

# Create Project DataFrame
project_data = [
    (1, 1),
    (1, 2),
    (1, 3),
    (2, 1),
    (2, 4)
]

project_df = spark.createDataFrame(project_data, schema=project_schema)

# Define schema for Employee table
employee_schema = StructType([
    StructField("employee_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("experience_years", IntegerType(), True)
])

# Create Employee DataFrame
employee_data = [
    (1, "Khaled", 3),
    (2, "Ali", 2),
    (3, "John", 1),
    (4, "Doe", 2)
]

employee_df = spark.createDataFrame(employee_data, schema=employee_schema)

# Show DataFrames
project_df.show()
employee_df.show()

+----------+-----------+
|project_id|employee_id|
+----------+-----------+
|         1|          1|
|         1|          2|
|         1|          3|
|         2|          1|
|         2|          4|
+----------+-----------+

+-----------+------+----------------+
|employee_id|  name|experience_years|
+-----------+------+----------------+
|          1|Khaled|               3|
|          2|   Ali|               2|
|          3|  John|               1|
|          4|   Doe|               2|
+-----------+------+----------------+



In [29]:
from pyspark.sql.functions import avg
dfp = project_df
dfe = employee_df

dfp.alias("dfp").join(dfe.alias("dfe"),col("dfe.employee_id")==col("dfp.employee_id"),"inner")\
    .groupby(col("dfp.project_id")).agg(avg(col("dfe.experience_years"))).show()



+----------+-------------------------+
|project_id|avg(dfe.experience_years)|
+----------+-------------------------+
|         1|                      2.0|
|         2|                      2.5|
+----------+-------------------------+



                                                                                

# Problem 18

Write a solution to find the percentage of the users registered in each contest rounded to two decimals.

Return the result table ordered by percentage in descending order. In case of a tie, order it by contest_id in ascending order.

In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


# Define schema for Users table
users_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("user_name", StringType(), True)
])

# Create Users DataFrame
users_data = [
    (6, "Alice"),
    (2, "Bob"),
    (7, "Alex")
]

users_df = spark.createDataFrame(users_data, schema=users_schema)

# Define schema for Register table
register_schema = StructType([
    StructField("contest_id", IntegerType(), True),
    StructField("user_id", IntegerType(), True)
])

# Create Register DataFrame
register_data = [
    (215, 6),
    (215, 6),
    (209, 2),
    (208, 2),
    (210, 6),
    (208, 6),
    (209, 7),
    (209, 6),
    (215, 7),
    (208, 7),
    (210, 2),
    (207, 2),
    (210, 7)
]

register_df = spark.createDataFrame(register_data, schema=register_schema)

# Show DataFrames
users_df.show()
register_df.show()

+-------+---------+
|user_id|user_name|
+-------+---------+
|      6|    Alice|
|      2|      Bob|
|      7|     Alex|
+-------+---------+

+----------+-------+
|contest_id|user_id|
+----------+-------+
|       215|      6|
|       215|      6|
|       209|      2|
|       208|      2|
|       210|      6|
|       208|      6|
|       209|      7|
|       209|      6|
|       215|      7|
|       208|      7|
|       210|      2|
|       207|      2|
|       210|      7|
+----------+-------+



In [31]:
from pyspark.sql.functions import count

dfu = users_df
dfr = register_df

# We are removing any duplicate registrations
dfr = dfr.distinct()

res_r = dfr.groupBy("contest_id").agg(count("user_id").alias("count_users"))
res_r.show()
res_u = dfu.agg(count(col("*")).alias("count_total_users"))
res_u.show()

res_final = res_r.alias("res_r").join(res_u.alias("res_u"),how="cross")
res_final.show()
res_final = res_final.select("contest_id",expr("ROUND((count_users/count_total_users)*100,2) as user_percent"))
res_final = res_final.orderBy(col("user_percent").desc(),col("contest_id").asc())
res_final.show()

+----------+-----------+
|contest_id|count_users|
+----------+-----------+
|       210|          3|
|       209|          3|
|       207|          1|
|       215|          2|
|       208|          3|
+----------+-----------+

+-----------------+
|count_total_users|
+-----------------+
|                3|
+-----------------+



                                                                                

+----------+-----------+-----------------+
|contest_id|count_users|count_total_users|
+----------+-----------+-----------------+
|       210|          3|                3|
|       209|          3|                3|
|       207|          1|                3|
|       215|          2|                3|
|       208|          3|                3|
+----------+-----------+-----------------+





+----------+------------+
|contest_id|user_percent|
+----------+------------+
|       208|       100.0|
|       209|       100.0|
|       210|       100.0|
|       215|       66.67|
|       207|       33.33|
+----------+------------+



                                                                                

# Problem 19

Write an SQL query to find for each month and country, the number of transactions and their total amount, the number of approved transactions and their total amount.

In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Create DataFrames") \
    .getOrCreate()

# Define schema for Transactions table
transactions_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("country", StringType(), True),
    StructField("state", StringType(), True),
    StructField("amount", IntegerType(), True),
    StructField("trans_date", DateType(), True)
])

# Sample data for Transactions table
transactions_data = [
    (121, "US", "approved", 1000, datetime.strptime("2018-12-18", "%Y-%m-%d")),
    (122, "US", "declined", 2000, datetime.strptime("2018-12-19", "%Y-%m-%d")),
    (123, "US", "approved", 2000, datetime.strptime("2019-01-01", "%Y-%m-%d")),
    (124, "DE", "approved", 2000, datetime.strptime("2019-01-07", "%Y-%m-%d")),
    (125, "DE", "declined", 2000, datetime.strptime("2020-01-07", "%Y-%m-%d"))
]

# Create Transactions DataFrame
transactions_df = spark.createDataFrame(transactions_data, schema=transactions_schema)

# Show the Transactions DataFrame
transactions_df.show()

+---+-------+--------+------+----------+
| id|country|   state|amount|trans_date|
+---+-------+--------+------+----------+
|121|     US|approved|  1000|2018-12-18|
|122|     US|declined|  2000|2018-12-19|
|123|     US|approved|  2000|2019-01-01|
|124|     DE|approved|  2000|2019-01-07|
|125|     DE|declined|  2000|2020-01-07|
+---+-------+--------+------+----------+



In [56]:
from pyspark.sql.functions import month, year

dft = transactions_df
dft = dft.withColumn("date_month",month(col("trans_date"))).withColumn("date_year",year(col("trans_date")))
dft_apr = dft.filter(col("state")=='approved').groupBy("date_month","date_year","country").agg(count("*").alias("count_apr"),sum("amount").alias("sum_amount_apr"))
dft_apr.show()

dft_total = dft.groupBy("date_month","date_year","country").agg(count("*").alias("count_total"),sum("amount").alias("sum_amount_total"))
dft_total.show()

res = dft_total.alias("dft_total").join(dft_apr.alias("dft_apr"),\
                                           ( col("dft_total.country")==col("dft_apr.country") ) &\
                                            ( col("dft_total.date_year")==col("dft_apr.date_year") ) &\
                                            ( col("dft_total.date_month")==col("dft_apr.date_month") ), \
                                        "left"
                                       )
res.show()

res = res.select("dft_total.*",expr("coalesce(dft_apr.count_apr,0) as count_approved"),expr("coalesce(dft_apr.sum_amount_apr,0) as sum_amount_approved"))
res.show()

+----------+---------+-------+---------+--------------+
|date_month|date_year|country|count_apr|sum_amount_apr|
+----------+---------+-------+---------+--------------+
|        12|     2018|     US|        1|          1000|
|         1|     2019|     US|        1|          2000|
|         1|     2019|     DE|        1|          2000|
+----------+---------+-------+---------+--------------+

+----------+---------+-------+-----------+----------------+
|date_month|date_year|country|count_total|sum_amount_total|
+----------+---------+-------+-----------+----------------+
|        12|     2018|     US|          2|            3000|
|         1|     2019|     US|          1|            2000|
|         1|     2020|     DE|          1|            2000|
|         1|     2019|     DE|          1|            2000|
+----------+---------+-------+-----------+----------------+



                                                                                

+----------+---------+-------+-----------+----------------+----------+---------+-------+---------+--------------+
|date_month|date_year|country|count_total|sum_amount_total|date_month|date_year|country|count_apr|sum_amount_apr|
+----------+---------+-------+-----------+----------------+----------+---------+-------+---------+--------------+
|        12|     2018|     US|          2|            3000|        12|     2018|     US|        1|          1000|
|         1|     2019|     US|          1|            2000|         1|     2019|     US|        1|          2000|
|         1|     2020|     DE|          1|            2000|      NULL|     NULL|   NULL|     NULL|          NULL|
|         1|     2019|     DE|          1|            2000|         1|     2019|     DE|        1|          2000|
+----------+---------+-------+-----------+----------------+----------+---------+-------+---------+--------------+

+----------+---------+-------+-----------+----------------+--------------+-------------

# Problem 20

If the customer's preferred delivery date is the same as the order date, then the order is called immediate; otherwise, it is called scheduled.

The first order of a customer is the order with the earliest order date that the customer made. It is guaranteed that a customer has precisely one first order.

Write a solution to find the percentage of immediate orders in the first orders of all customers, rounded to 2 decimal places.

In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime


# Define schema for Delivery table
delivery_schema = StructType([
    StructField("delivery_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("order_date", DateType(), True),
    StructField("customer_pref_delivery_date", DateType(), True)
])

# Sample data for Delivery table
delivery_data = [
    (1, 1, datetime.strptime("2019-08-01", "%Y-%m-%d"), datetime.strptime("2019-08-02", "%Y-%m-%d")),
    (2, 2, datetime.strptime("2019-08-02", "%Y-%m-%d"), datetime.strptime("2019-08-02", "%Y-%m-%d")),
    (3, 1, datetime.strptime("2019-08-11", "%Y-%m-%d"), datetime.strptime("2019-08-12", "%Y-%m-%d")),
    (4, 3, datetime.strptime("2019-08-24", "%Y-%m-%d"), datetime.strptime("2019-08-24", "%Y-%m-%d")),
    (5, 3, datetime.strptime("2019-08-21", "%Y-%m-%d"), datetime.strptime("2019-08-22", "%Y-%m-%d")),
    (6, 2, datetime.strptime("2019-08-11", "%Y-%m-%d"), datetime.strptime("2019-08-13", "%Y-%m-%d")),
    (7, 4, datetime.strptime("2019-08-09", "%Y-%m-%d"), datetime.strptime("2019-08-09", "%Y-%m-%d"))
]

# Create Delivery DataFrame
delivery_df = spark.createDataFrame(delivery_data, schema=delivery_schema)

# Show the Delivery DataFrame
delivery_df.show()

+-----------+-----------+----------+---------------------------+
|delivery_id|customer_id|order_date|customer_pref_delivery_date|
+-----------+-----------+----------+---------------------------+
|          1|          1|2019-08-01|                 2019-08-02|
|          2|          2|2019-08-02|                 2019-08-02|
|          3|          1|2019-08-11|                 2019-08-12|
|          4|          3|2019-08-24|                 2019-08-24|
|          5|          3|2019-08-21|                 2019-08-22|
|          6|          2|2019-08-11|                 2019-08-13|
|          7|          4|2019-08-09|                 2019-08-09|
+-----------+-----------+----------+---------------------------+



In [77]:
from pyspark.sql.functions import when,rank
from pyspark.sql.functions import lit
from pyspark.sql.window import Window

dfd = delivery_df
dfd = dfd.withColumn("is_Immediate",when( col("order_date") == col("customer_pref_delivery_date"),lit(1) ).otherwise(lit(0)) )
dfd.show()

rank_window_spec = Window.partitionBy("customer_id").orderBy("order_date")

dfd = dfd.withColumn("rank_order_by_emp",rank().over(rank_window_spec))
dfd.show()

dfd = dfd.filter(expr("rank_order_by_emp=1"))
dfd.show()

dfd = dfd.select(expr("format_number(sum(is_Immediate)/count(*),2)").alias("immediate_percentage "))
dfd.show()

+-----------+-----------+----------+---------------------------+------------+
|delivery_id|customer_id|order_date|customer_pref_delivery_date|is_Immediate|
+-----------+-----------+----------+---------------------------+------------+
|          1|          1|2019-08-01|                 2019-08-02|           0|
|          2|          2|2019-08-02|                 2019-08-02|           1|
|          3|          1|2019-08-11|                 2019-08-12|           0|
|          4|          3|2019-08-24|                 2019-08-24|           1|
|          5|          3|2019-08-21|                 2019-08-22|           0|
|          6|          2|2019-08-11|                 2019-08-13|           0|
|          7|          4|2019-08-09|                 2019-08-09|           1|
+-----------+-----------+----------+---------------------------+------------+

+-----------+-----------+----------+---------------------------+------------+-----------------+
|delivery_id|customer_id|order_date|customer_