In [10]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('database').getOrCreate()

In [11]:
from pyspark.sql.functions import (col,expr,count,countDistinct,datediff,to_date,date_add,year,month,lag,lead,rank,max,min,round,
        sum,when,lit,desc,coalesce,abs,greatest,least,array,array_sort,substring, explode,collect_list,array_intersect,
        unix_timestamp,rank,dense_rank,least,greatest,row_number,array_join,expr,trim,lower,array,sort_array,
        array_distinct,size,initcap,length,date_format,to_timestamp,concat,regexp_extract,length,regexp_replace,

                                  )
from pyspark.sql.types import (StructField,StructType,
                    IntegerType,StringType,DateType,TimestampType )
from pyspark.sql import Window
from pyspark.sql import Row
#rlike,contains,array_join,collect_list,substring,array_size,cast,stack(to unpivot),LATERAL VIEW 

 #### 1 Combine Two Tables 

In [16]:
person_df = (spark.read
             .option('header',True)
             .option('inferSchema',True)
             .format('csv')
             .load('../../data/database/1_Person.csv'))

In [17]:
address_df = (spark.read
             .option('header',True)
             .option('inferSchema',True)
             .format('csv')
             .load('../../data/database/1a_Address.csv'))

In [19]:
person_df.printSchema()

root
 |-- personId: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- firstName: string (nullable = true)



In [20]:
address_df.printSchema()

root
 |-- addressId: integer (nullable = true)
 |-- personId: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)



In [33]:
(person_df.join(address_df,person_df.personId == address_df.personId,'left')
.select(col('firstName'),col('lastName'),col('city'),col('state')).show() )


+---------+--------+-------------+--------+
|firstName|lastName|         city|   state|
+---------+--------+-------------+--------+
|    Allen|    Wang|         null|    null|
|      Bob|   Alice|New York City|New York|
+---------+--------+-------------+--------+



In [118]:
person_df.createOrReplaceTempView('persondf')
address_df.createOrReplaceTempView('addressdf')

In [121]:
spark.sql('select firstName,lastName,city,state  \
          from persondf left join addressdf on persondf.personId  == addressdf.personId').show() 

+---------+--------+-------------+--------+
|firstName|lastName|         city|   state|
+---------+--------+-------------+--------+
|    Allen|    Wang|         null|    null|
|      Bob|   Alice|New York City|New York|
+---------+--------+-------------+--------+



#### 2 Employees Earning More Than Their Managers 

In [18]:
employee_df = (spark.read
              .option('header',True)
              .option('inferSchema',True)
              .format('csv')
              .load('../../data/database/2_Employee.csv'))

In [65]:
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- managerId: integer (nullable = true)



In [98]:
emp_df = employee_df.filter(col('managerId').isNotNull() )

In [106]:
mgr_df = ( employee_df.filter(col('managerId').isNull() )
          .select(col('id').alias('mgrid'),col('salary').alias('mgrsalary')) )

In [110]:
(
emp_df.join(mgr_df,mgr_df.mgrid == emp_df.managerId,'inner')
    .filter(col('salary') > col('mgrsalary')).select('name').show()
)

+----+
|name|
+----+
| Joe|
+----+



In [122]:
employee_df.createOrReplaceTempView('employeedf')

In [142]:
spark.sql('with mgr as (select id,salary from employeedf where managerId is null),\
                emp as (select id,name,salary,managerId from employeedf where managerId is not null)\
                select emp.name from emp join mgr on emp.managerId == mgr.id where emp.salary > mgr.salary').show()

+----+
|name|
+----+
| Joe|
+----+



#### 3 Duplicate Emails E

In [19]:
person_df = (spark.read
             .option('header',True)
             .option('inferSchema',True)
             .format('csv')
             .load('../../data/database/3_person.csv') )

In [5]:
person_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- email: string (nullable = true)



In [30]:
person_df.groupby(col('email')).count().filter(col('count') >=2).select(col('email')).show()

+-------+
|  email|
+-------+
|a@b.com|
+-------+



In [36]:
person_df.createOrReplaceTempView('person_df')

spark.sql('select email from person_df group by email having count(email) >= 2').show()

+-------+
|  email|
+-------+
|a@b.com|
+-------+



#### 4 Customers Who Never Order E

In [20]:
customer_df = (spark.read
               .option('header',True)
               .option('inferSchema',True)
               .format('csv')
               .load('../../data/database/4_customers.csv') )

In [21]:
orders_df = (spark.read
            .option('header',True)
            .option('inferSchema',True)
            .format('csv')
            .load('../../data/database/4a_orders.csv'))

In [53]:
customer_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [54]:
orders_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- customerId: integer (nullable = true)



In [56]:
customer_df.join(orders_df,customer_df.id == orders_df.customerId,'leftanti').select('name').show()

+-----+
| name|
+-----+
|Henry|
|  Max|
+-----+



In [74]:
customer_df.createOrReplaceTempView('customerdf')
orders_df.createOrReplaceTempView('ordersdf')

spark.sql('select name from customerdf left anti join ordersdf on customerdf.id == ordersdf.customerId ').show()

+-----+
| name|
+-----+
|Henry|
|  Max|
+-----+



#### 5 Delete Duplicate Emails E

In [22]:
person_df = ( spark.read
            .option('header',True)
            .option('inferSchema',True)
            .format('csv')
            .load('../../data/database/5_person.csv'))

In [77]:
person_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- email: string (nullable = true)



In [85]:
person_df.dropDuplicates(['email']).show()

+---+----------------+
| id|           email|
+---+----------------+
|  1|john@example.com|
|  2| bob@example.com|
+---+----------------+



In [None]:
SET SQL_SAFE_UPDATES = 0;
with one as (select id,email, row_number() over (partition by email order by id) as rnumber from db.person)
DELETE FROM db.person where id in (select id from one where rnumber >= 2);

#### 6 Rising Temperature E

In [23]:
weather_schema = StructType([
        StructField('id',IntegerType()),
        StructField('recordDate',DateType()),
        StructField('temperature',IntegerType())
        ])

In [24]:
weather_df = (spark.read
              .option('header',True)
              .schema(weather_schema)
              .format('csv')
              .load('../../data/database/6_weather.csv'))

In [167]:
weather_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- recordDate: date (nullable = true)
 |-- temperature: integer (nullable = true)



In [168]:

weather_df.withColumn('recordDate',to_date(col('recordDate'),'MM-dd-yyyy'))

DataFrame[id: int, recordDate: date, temperature: int]

In [160]:
window_spec = Window.orderBy(weather_df.recordDate)

In [166]:
weather_df.withColumn('nextday',lag(col('temperature'),1).over(window_spec).alias('lags'))\
    .filter(col('temperature') > col('nextday')).select('id').show()

+---+
| id|
+---+
|  2|
|  4|
+---+



24/08/12 15:57:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [170]:
weather_df.createOrReplaceTempView('weatherdf')

In [198]:
spark.sql("with nextday as (select recordDate as date,lag(temperature) over(order by recordDate) as temp from weatherdf) \
                    select id from weatherdf join nextday on recordDate = date \
                      where temperature > temp \
          " ).show()

+---+
| id|
+---+
|  2|
|  4|
+---+



24/08/12 16:31:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [202]:
spark.sql('select id from weatherdf,\
    (select recordDate as date, lag(temperature) over(order by recordDate) as temp from weatherdf) \
          where date = recordDate and temperature > temp ').show()

+---+
| id|
+---+
|  2|
|  4|
+---+



24/08/12 16:36:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


#### 7 Game Play Analysis E

In [25]:
activity_schema = StructType([
                  StructField('player_id',IntegerType()),
                  StructField('device_id',IntegerType()),
                  StructField('event_date',DateType()),
                  StructField('games_played',IntegerType()),
                ])

In [26]:
activity_df = (spark.read
              .option('header',True)
              .schema(activity_schema)
              .format('csv')
              .load('../../data/database/7_activity.csv'))

In [220]:
activity_df.printSchema()

root
 |-- player_id: integer (nullable = true)
 |-- device_id: integer (nullable = true)
 |-- event_date: date (nullable = true)
 |-- games_played: integer (nullable = true)



In [227]:
window_spec = Window.partitionBy('player_id').orderBy('event_date')

In [234]:
activity_df.withColumn('cnt',rank().over(window_spec)).filter(col('cnt') == 1)\
        .select('player_id',col('event_date').alias('first_login')).orderBy('player_id').show()

+---------+-----------+
|player_id|first_login|
+---------+-----------+
|        1| 2016-03-01|
|        2| 2017-06-25|
|        3| 2016-03-02|
+---------+-----------+



In [255]:
activity_df.createOrReplaceTempView('activitydf')
spark.sql('with rnk as (select player_id,event_date as first_login,\
                        rank() over(PARTITION BY player_id ORDER BY event_date) as rnk from activitydf) \
                select player_id,first_login from rnk where rnk == 1 order by player_id ').show()

+---------+-----------+
|player_id|first_login|
+---------+-----------+
|        1| 2016-03-01|
|        2| 2017-06-25|
|        3| 2016-03-02|
+---------+-----------+



#### 8 Game Play Analysis II E

In [27]:
activity_schema = StructType([
                  StructField('player_id',IntegerType()),
                  StructField('device_id',IntegerType()),
                  StructField('event_date',DateType()),
                  StructField('games_played',IntegerType()),
                ])

In [28]:
activity_df = (spark.read
              .option('header',True)
              .schema(activity_schema)
              .format('csv')
              .load('../../data/database/8_activity.csv'))

In [258]:
activity_df.printSchema()

root
 |-- player_id: integer (nullable = true)
 |-- device_id: integer (nullable = true)
 |-- event_date: date (nullable = true)
 |-- games_played: integer (nullable = true)



In [262]:
window_spec = Window.partitionBy('player_id').orderBy('event_date')

In [265]:
activity_df.withColumn('rnk',rank().over(window_spec)).filter(col('rnk') ==1 )\
        .select('player_id','device_id').orderBy('player_id').show()

+---------+---------+
|player_id|device_id|
+---------+---------+
|        1|        2|
|        2|        3|
|        3|        1|
+---------+---------+



In [268]:
activity_df.createOrReplaceTempView('activitydf')
spark.sql('with rnk as (select player_id,device_id,\
                        rank() over(PARTITION BY player_id ORDER BY event_date) as rnk from activitydf) \
                select player_id,device_id from rnk where rnk == 1 order by player_id ').show()

+---------+---------+
|player_id|device_id|
+---------+---------+
|        1|        2|
|        2|        3|
|        3|        1|
+---------+---------+



#### 9 Employee Bonus E

In [29]:
employee_df = (spark.read
               .option('header',True)
               .option('inferSchema',True)
               .format('csv')
               .load('../../data/database/9_employee.csv') )

In [30]:
bonus_df = (spark.read
               .option('header',True)
               .option('inferSchema',True)
               .format('csv')
               .load('../../data/database/9a_bonus.csv') )

In [271]:
employee_df.printSchema()

root
 |-- empid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- supervisor: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [272]:
bonus_df.printSchema()

root
 |-- empid: integer (nullable = true)
 |-- bonus: integer (nullable = true)



In [279]:
employee_df.join(bonus_df,employee_df.empid == bonus_df.empid, 'left')\
        .filter( (col('bonus') <= 1000) | col('bonus').isNull() ).select('name','bonus').show()

+----+-----+
|name|bonus|
+----+-----+
|Brad| null|
|John| null|
| Dan|  500|
+----+-----+



In [284]:
employee_df.createOrReplaceTempView('employeedf')
bonus_df.createOrReplaceTempView('bonusdf')

spark.sql('select name,bonus from employeedf left join bonusdf on employeedf.empid == bonusdf.empid \
            where bonus <=1000 or bonus is null').show()

+----+-----+
|name|bonus|
+----+-----+
|Brad| null|
|John| null|
| Dan|  500|
+----+-----+



#### 10 Find Customer Referee E

In [31]:
customer_df = (spark.read
              .option('header',True)
              .option('inferSchema',True)
              .format('csv')
              .load('../../data/database/10_customer.csv'))

In [286]:
customer_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- referee_id: integer (nullable = true)



In [289]:
customer_df.filter( (col('referee_id') !=2) | (col('referee_id').isNull()) ).select('name').show()

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+



In [291]:
customer_df.createOrReplaceTempView('customerdf')
spark.sql('select name from customerdf where referee_id !=2 or referee_id is null').show()

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+



#### 11 Customer Placing the Largest Number of Orders E

In [32]:
order_df = (spark.read
           .option('header',True)
           .option('inferSchema',True)
           .format('csv')
           .load('../../data/database/11_orders.csv'))

In [293]:
order_df.printSchema()

root
 |-- order_number: integer (nullable = true)
 |-- customer_number: integer (nullable = true)



In [332]:
order_df.orderBy('order_number',ascending = False).select('customer_number').first()[0]

3

In [341]:
order_df.createOrReplaceTempView('orderdf')
spark.sql('select customer_number,order_number as mxorder from orderdf order by order_number desc limit 1').show()

+---------------+-------+
|customer_number|mxorder|
+---------------+-------+
|              3|      4|
+---------------+-------+



#### 12 Big Countries E

In [33]:
world_df = (spark.read
           .option('header',True)
           .option('inferSchema',True)
           .format('csv')
           .load('../../data/database/12_world.csv'))

In [344]:
world_df.printSchema()  

root
 |-- name: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- dgp: long (nullable = true)



In [347]:
world_df.filter( (col('area') >= 3000000) | (col('population') >= 25000000 ) )\
            .select('name','population','area').show()

+-----------+----------+-------+
|       name|population|   area|
+-----------+----------+-------+
|Afghanistan|  25500100| 652230|
|    Algeria|  37100000|2381741|
+-----------+----------+-------+



In [350]:
world_df.createOrReplaceTempView('worlddf')
spark.sql('select name,population,area from worlddf where area >= 3000000 or population >= 25000000 ').show()

+-----------+----------+-------+
|       name|population|   area|
+-----------+----------+-------+
|Afghanistan|  25500100| 652230|
|    Algeria|  37100000|2381741|
+-----------+----------+-------+



#### 13 Classes More Than 5 Students E

In [34]:
courses_df = (spark.read
             .option('header',True)
             .option('inferSchema',True)
             .format('csv')
             .load('../../data/database/13_courses.csv'))

In [354]:
courses_df.printSchema()

root
 |-- student: string (nullable = true)
 |-- class: string (nullable = true)



In [357]:
courses_df.groupBy('class').agg(count('class').alias('cnt')).filter(col('cnt') >=5).select('class').show()

+-----+
|class|
+-----+
| Math|
+-----+



In [359]:
courses_df.createOrReplaceTempView('coursesdf')
spark.sql('select class from coursesdf group by class having count("class") >= 5').show()

+-----+
|class|
+-----+
| Math|
+-----+



#### 14 Friend Requests I: Overall Acceptance Rate E

In [35]:
friendsrequest_schema = StructType([
                        StructField('sender_id',IntegerType()),
                        StructField('send_to_id',IntegerType()),
                        StructField('request_date',DateType()),
                     ])

In [36]:
requestaccepted_schema = StructType([
                         StructField('requester_id',IntegerType()),
                         StructField('accepter_id',IntegerType()),
                         StructField('accept_date',DateType()),
                       ])

In [37]:
friendsrequest_df = (spark.read
                    .option('header',True)
                    .schema(friendsrequest_schema)
                    .format('csv')
                    .load('../../data/database/14_friendrequest.csv'))

In [38]:
requestaccepted_df = (spark.read
                    .option('header',True)
                    .schema(requestaccepted_schema)
                    .format('csv')
                    .load('../../data/database/14a_requestaccepted.csv'))

In [382]:
friendsrequest_df.printSchema()

root
 |-- sender_id: integer (nullable = true)
 |-- send_to_id: integer (nullable = true)
 |-- request_date: date (nullable = true)



In [383]:
requestaccepted_df.printSchema()

root
 |-- requester_id: integer (nullable = true)
 |-- accepter_id: integer (nullable = true)
 |-- accept_date: date (nullable = true)



In [440]:
requestaccepted_df.select(round(countDistinct('requester_id','accepter_id')/count('requester_id'),2)\
                    .alias('acceptance_ratio')).show()

+----------------+
|acceptance_ratio|
+----------------+
|             0.8|
+----------------+



In [427]:
requestaccepted_df.createOrReplaceTempView('requestaccepteddf')
spark.sql('select round(count(distinct requester_id,accepter_id)/count(requester_id),2) \
            as acceptance_rate from requestaccepteddf').show()

+---------------+
|acceptance_rate|
+---------------+
|            0.8|
+---------------+



#### 15 Consecutive Available Seats E

In [39]:
cinema_df = (spark.read
            .option('header',True)
            .option('inferSchema',True)
            .format('csv')
            .load('../../data/database/15_cinema.csv'))

In [68]:
cinema_df.printSchema()

root
 |-- seat_id: integer (nullable = true)
 |-- free: integer (nullable = true)



In [69]:
window_spec = Window.orderBy('seat_id')

In [70]:
cinema_df.withColumn('rowsdiff',sum(when(lag('free').over(window_spec) != col('free'),1) \
                    .otherwise(0)).over(window_spec)).filter( (col('free') != 0) & (col('rowsdiff') != 0) )\
                   .select('seat_id').show()

+-------+
|seat_id|
+-------+
|      3|
|      4|
|      5|
+-------+



24/08/13 11:07:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/13 11:07:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [90]:
cinema_df.createOrReplaceTempView('cinemadf')
spark.sql('with diff as  (select seat_id,free,sum( case when (lag(free) over( order by seat_id)) != free then 1 \
                                                   else 0 end) over(order by seat_id) as diff from cinemadf order by seat_id) \
                  select seat_id from diff where free !=0 and diff !=0 \
                            ').show()


+-------+
|seat_id|
+-------+
|      3|
|      4|
|      5|
+-------+



24/08/13 11:37:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/13 11:37:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


#### 16 Sales Person E

In [40]:
salesperson_schema = StructType([
                     StructField('sales_id',IntegerType()),
                     StructField('name',StringType()),
                     StructField('salary',IntegerType()),
                     StructField('commission_rate',IntegerType()),
                     StructField('hire_date',DateType())
                    ])

In [41]:
order_schema = StructType([
                     StructField('order_id',IntegerType()),
                     StructField('order_date',DateType()),
                     StructField('com_id',IntegerType()),
                     StructField('sales_id',IntegerType()),
                     StructField('amount',IntegerType())
                    ])

In [42]:
salesperson_df = (spark.read
                 .option('header',True)
                 .schema(salesperson_schema) 
                 .format('csv')
                 .load('../../data/database/16_salesperson.csv')
                 .withColumn('hire_date',to_date('hire_date','MM-dd-yyyy')))

In [45]:
company_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/16a_company.csv'))

In [46]:
order_df = (spark.read
                 .option('header',True)
                 .schema(order_schema) 
                 .format('csv')
                 .load('../../data/database/16b_order.csv')
                 .withColumn('order_date',to_date('order_date','MM-dd-yyyy')))

In [104]:
salesperson_df.printSchema()

root
 |-- sales_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- commission_rate: integer (nullable = true)
 |-- hire_date: date (nullable = true)



In [105]:
company_df.printSchema()

root
 |-- com_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)



In [106]:
order_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- com_id: integer (nullable = true)
 |-- sales_id: integer (nullable = true)
 |-- amount: integer (nullable = true)



In [147]:
ordf = order_df.join(company_df,company_df.com_id == order_df.com_id,'inner').filter(col('name') == 'RED')\
    .select('sales_id')
salesperson_df.join(ordf,salesperson_df.sales_id == ordf.sales_id,'leftanti').select('name').show()

+----+
|name|
+----+
| Amy|
|Mark|
|Alex|
+----+



In [161]:
salesperson_df.createOrReplaceTempView('salespersondf')
company_df.createOrReplaceTempView('companydf')
order_df.createOrReplaceTempView('orderdf')

spark.sql('with one as (select sales_id from orderdf  \
                          join companydf on companydf.com_id == orderdf.com_id where name == "RED") \
                 select name from salespersondf left anti join one on salespersondf.sales_id == one.sales_id \
         ').show()

+----+
|name|
+----+
| Amy|
|Mark|
|Alex|
+----+



In [162]:

spark.sql('select name from salespersondf where sales_id not in (select sales_id from orderdf  \
                          join companydf on companydf.com_id == orderdf.com_id where name == "RED")\
         ').show()

+----+
|name|
+----+
| Amy|
|Mark|
|Alex|
+----+



#### 17 Triangle Judgement E

In [47]:
triangle_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/17_triangle.csv'))

In [165]:
triangle_df.printSchema()

root
 |-- x: integer (nullable = true)
 |-- y: integer (nullable = true)
 |-- z: integer (nullable = true)



In [173]:
triangle_df.withColumn('triangle',when( (col('x')+col('y') > col('z')) & (col('y')+col('z') > col('x')) &\
                                    (col('x')+col('z') > col('y')),'Yes').otherwise('No')).show()

+---+---+---+--------+
|  x|  y|  z|triangle|
+---+---+---+--------+
| 13| 15| 30|      No|
| 10| 20| 15|     Yes|
+---+---+---+--------+



In [175]:
triangle_df.createOrReplaceTempView('triangledf')
spark.sql('select x,y,z,case when x+y>z and y+z>x and x+z>y then "Yes" \
                             else "No" end as traiangle from triangledf').show()

+---+---+---+---------+
|  x|  y|  z|traiangle|
+---+---+---+---------+
| 13| 15| 30|       No|
| 10| 20| 15|      Yes|
+---+---+---+---------+



#### 18 Shortest Distance in a Line E

In [48]:
point_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/18_point.csv'))

In [177]:
point_df.printSchema()

root
 |-- x: integer (nullable = true)



In [178]:
point_df.show()

+---+
|  x|
+---+
| -1|
|  0|
|  2|
+---+



In [180]:
window_spec =  Window.orderBy('x')

In [190]:
point_df.withColumn('xx',lag('x',1).over(window_spec)).select(expr('x-xx')\
                        .alias('xxx')).agg(min('xxx').alias('min_distancee')).show()

+-------------+
|min_distancee|
+-------------+
|            1|
+-------------+



24/08/13 15:37:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [198]:
point_df.createOrReplaceTempView('pointdf')
spark.sql('with one as (select x,lag(x) over(order by x) as xx from pointdf) \
                   select min(x - xx) as min_distance from one').show()

+------------+
|min_distance|
+------------+
|           1|
+------------+



24/08/13 15:42:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


#### 19  Biggest Single Number E

In [49]:
mynumber_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/19_mynumber.csv'))

In [211]:
mynumber_df.printSchema()

root
 |-- num: integer (nullable = true)



In [212]:
mynumber_df.groupBy('num').count().filter(col('count') == 1).agg(max(col('num'))\
                                        .alias('single_largest_number')).show()

+---------------------+
|single_largest_number|
+---------------------+
|                    6|
+---------------------+



In [217]:
mynumber_df.createOrReplaceTempView('mynumberdf')
spark.sql('select max(num) as single_largest_number from (select num from mynumberdf group by num having count(num)==1)').show()

+---------------------+
|single_largest_number|
+---------------------+
|                    6|
+---------------------+



#### 20 Not Boring Movies E

In [50]:
cinima_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/20_cinima.csv'))

In [219]:
cinima_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- movie: string (nullable = true)
 |-- description: string (nullable = true)
 |-- rating: double (nullable = true)



In [233]:
cinima_df.filter( (col('description') != lit('boring')) & (col('id')%2 == 1))\
                    .orderBy(desc('rating')).show()


+---+----------+-----------+------+
| id|     movie|description|rating|
+---+----------+-----------+------+
|  5|House card|Interesting|   9.1|
|  1|       War|   great 3D|   8.9|
+---+----------+-----------+------+



In [239]:
cinima_df.createOrReplaceTempView('cinimadf')
spark.sql('select id,movie,description,rating from cinimadf \
              where description != "boring" and id%2 == 1 order by rating desc').show()

+---+----------+-----------+------+
| id|     movie|description|rating|
+---+----------+-----------+------+
|  5|House card|Interesting|   9.1|
|  1|       War|   great 3D|   8.9|
+---+----------+-----------+------+



#### 21 Swap Salary E

In [51]:
salary_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/21_salary.csv'))

In [241]:
salary_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- salary: integer (nullable = true)



In [None]:
LOAD DATA  LOCAL INFILE '../../data/database/21_salary.csv'
INTO TABLE db.salary
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(id,name,sex,salary);


SET SQL_SAFE_UPDATES = 0;
UPDATE db.salary
SET sex = 
    CASE 
        WHEN sex = 'm' THEN 'f'
        WHEN sex = 'f' THEN 'm'
    END;

#### 22 Actors and Directors Who Cooperated At Least Three Times E

In [53]:
actordirector_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/22_actordirector.csv'))

In [244]:
actordirector_df.printSchema()

root
 |-- actor_id: integer (nullable = true)
 |-- director_id: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



In [249]:
actordirector_df.groupby('director_id','actor_id').count()\
                .filter(col('count') >=3 ).select('director_id','actor_id').show()

+-----------+--------+
|director_id|actor_id|
+-----------+--------+
|          1|       1|
+-----------+--------+



In [257]:
actordirector_df.createOrReplaceTempView('actordirectordf')
spark.sql('select director_id,actor_id from  actordirectordf \
                group by director_id,actor_id having count(actor_id) >= 3').show()

+-----------+--------+
|director_id|actor_id|
+-----------+--------+
|          1|       1|
+-----------+--------+



#### 23 Product Sales Analysis I E

In [54]:
sales_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/23_sales.csv'))

In [55]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/23_product.csv'))

In [260]:
sales_df.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)



In [261]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)



In [263]:
sales_df.join(product_df,sales_df.product_id == product_df.product_id,'inner')\
                .select('product_name','year','price').show()

+------------+----+-----+
|product_name|year|price|
+------------+----+-----+
|       Nokia|2008| 5000|
|       Nokia|2009| 5000|
|       Apple|2011| 9000|
+------------+----+-----+



In [265]:
product_df.createOrReplaceTempView('productdf')
sales_df.createOrReplaceTempView('salesdf')

spark.sql('select product_name,year,price from salesdf \
               join productdf on productdf.product_id == salesdf.product_id').show()

+------------+----+-----+
|product_name|year|price|
+------------+----+-----+
|       Nokia|2008| 5000|
|       Nokia|2009| 5000|
|       Apple|2011| 9000|
+------------+----+-----+



#### 24 54 Product Sales Analysis II E

In [56]:
sales_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/24_sales.csv'))

In [57]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/24_product.csv'))

In [268]:
sales_df.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)



In [269]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)



In [270]:
sales_df.groupby('product_id').agg(sum(col('quantity')).alias('total_quandity')).show()

+----------+--------------+
|product_id|total_quandity|
+----------+--------------+
|       100|            22|
|       200|            15|
+----------+--------------+



In [271]:
sales_df.createOrReplaceTempView('salesdf')

spark.sql('select product_id,sum(quantity) as total_quandity from salesdf group by product_id').show()

+----------+--------------+
|product_id|total_quandity|
+----------+--------------+
|       100|            22|
|       200|            15|
+----------+--------------+



#### 25 56 Project Employees I E

In [58]:
employee_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/25_employee.csv'))

In [59]:
project_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/25_project.csv')
                 .withColumnRenamed('employee_id','emp_id'))

In [286]:
employee_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- experience_years: integer (nullable = true)



In [287]:
project_df.printSchema()

root
 |-- project_id: integer (nullable = true)
 |-- emp_id: integer (nullable = true)



In [294]:
project_df.join(employee_df,project_df.emp_id == employee_df.employee_id, 'inner')\
        .groupby('project_id').agg((sum(col('experience_years'))/count(col('project_id'))).alias('average') ).show()

+----------+-------+
|project_id|average|
+----------+-------+
|         1|    2.0|
|         2|    2.5|
+----------+-------+



In [298]:
employee_df.createOrReplaceTempView('employeedf')
project_df.createOrReplaceTempView('projectdf')

spark.sql('select project_id,sum(experience_years)/count(project_id) as average from employeedf \
           join projectdf on projectdf.emp_id == employeedf.employee_id group by project_id').show()

+----------+-------+
|project_id|average|
+----------+-------+
|         1|    2.0|
|         2|    2.5|
+----------+-------+



#### 26 57 Project Employees II E

In [60]:
employee_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/26_employee.csv'))

In [61]:
project_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/26_project.csv')
                 .withColumnRenamed('employee_id','emp_id'))

In [302]:
employee_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- experience_years: integer (nullable = true)



In [303]:
project_df.printSchema()

root
 |-- project_id: integer (nullable = true)
 |-- emp_id: integer (nullable = true)



In [340]:
mx_df = project_df.groupby('project_id').agg(count(col('emp_id')).alias('emp')).collect()[0][1]
project_df.groupby('project_id').agg(count(col('emp_id')).alias('emp'))\
                .filter(col('emp') == mx_df).select('project_id').show()

+----------+
|project_id|
+----------+
|         1|
+----------+



In [356]:
project_df.createOrReplaceTempView('projectdf')
spark.sql('select project_id from (select project_id,count(emp_id) as cnt \
          from projectdf group by project_id order by cnt desc limit 1)').show()

+----------+
|project_id|
+----------+
|         1|
+----------+



#### 27 59 Sales Analysis I E

In [62]:
sales_schema = StructType([
                     StructField('seller_id',IntegerType()),
                     StructField('product_id',IntegerType()),
                     StructField('buyer_id',IntegerType()),
                     StructField('sale_date',DateType()),
                     StructField('quantity',IntegerType()),
                     StructField('price',IntegerType()),
                    ])

In [65]:
sales_df = (spark.read
                 .option('header',True)
                 .schema(sales_schema)
                 .format('csv')
                 .load('../../data/database/27_sales.csv')
                 .withColumn('sale_date',to_date('sale_date','MM-dd-yyyy')))

In [66]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/27_product.csv'))

In [364]:
sales_df.printSchema()

root
 |-- seller_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- buyer_id: integer (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)



In [365]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- unit_price: integer (nullable = true)



In [391]:
ms_df = sales_df.groupby('seller_id').agg(sum(col('price')).alias('mx')).select(max('mx').alias('mx')).collect()[0][0]
sales_df.groupby('seller_id').agg(sum(col('price')).alias('mx')).filter(col('mx') == ms_df).select('seller_id').show()

+---------+
|seller_id|
+---------+
|        1|
|        3|
+---------+



In [409]:
result_df = sales_df.groupBy("seller_id") \
                   .agg(sum("price").alias("total_price")) \
                   .orderBy(desc("total_price")) \
                   .limit(1)
result_df.select('seller_id').show()

+---------+
|seller_id|
+---------+
|        1|
+---------+



In [419]:
spark.sql('with one as (select sum(price) as price from salesdf group by seller_id order by price desc limit 1),\
                two as (select seller_id,sum(price) as price from salesdf group by seller_id)\
                    select seller_id from two where price in (select * from one) \
                 ').show()

+---------+
|seller_id|
+---------+
|        1|
|        3|
+---------+



#### 28 60 Sales Analysis II E

In [67]:
sales_schema = StructType([
                     StructField('seller_id',IntegerType()),
                     StructField('product_id',IntegerType()),
                     StructField('buyer_id',IntegerType()),
                     StructField('sale_date',DateType()),
                     StructField('quantity',IntegerType()),
                     StructField('price',IntegerType()),
                    ])

In [68]:
sales_df = (spark.read
                 .option('header',True)
                 .schema(sales_schema)
                 .format('csv')
                 .load('../../data/database/28_sales.csv')
                 .withColumn('sale_date',to_date('sale_date','MM-dd-yyyy')))

In [69]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/28_product.csv'))

In [423]:
sales_df.printSchema()

root
 |-- seller_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- buyer_id: integer (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)



In [424]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- unit_price: integer (nullable = true)



In [444]:
sales = sales_df.join(product_df,sales_df.product_id == product_df.product_id,'inner')\
            .select('buyer_id','product_name')

s8_df = sales.filter(col('product_name') == lit('S8'))
gp_buyer = sales.groupBy('buyer_id').agg(countDistinct(col('product_name')).alias('gp_buyer_product'))
s8_df.join(gp_buyer,'buyer_id').filter(gp_buyer.gp_buyer_product == 1).select('buyer_id').show()

+--------+
|buyer_id|
+--------+
|       1|
+--------+



In [469]:
sales_df.createOrReplaceTempView('salesdf')
product_df.createOrReplaceTempView('productdf')

spark.sql('with sales as (select buyer_id,product_name from salesdf \
                                 join productdf on salesdf.product_id  == productdf.product_id ), \
                   S  as (select buyer_id,product_name from sales where product_name == "S8"), \
        buyer_product as (select buyer_id,count(distinct product_name) as buyer_product from sales group by buyer_id), \
           final_join as (select S.buyer_id from S join buyer_product on S.buyer_id  == buyer_product.buyer_id \
                                 where buyer_product.buyer_product == 1) \
                                 select * from final_join \
                                 ').show()

+--------+
|buyer_id|
+--------+
|       1|
+--------+



#### 29 61 Sales Analysis III E

In [70]:
sales_schema = StructType([
                     StructField('seller_id',IntegerType()),
                     StructField('product_id',IntegerType()),
                     StructField('buyer_id',IntegerType()),
                     StructField('sale_date',DateType()),
                     StructField('quantity',IntegerType()),
                     StructField('price',IntegerType()),
                    ])

In [71]:
sales_df = (spark.read
                 .option('header',True)
                 .schema(sales_schema)
                 .format('csv')
                 .load('../../data/database/29_sales.csv')
                 .withColumn('sale_date',to_date('sale_date','MM-dd-yyyy')))

In [72]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True) 
                 .format('csv')
                 .load('../../data/database/29_product.csv')
                 .withColumnRenamed('product_id','productid'))

In [479]:
sales_df.printSchema()

root
 |-- seller_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- buyer_id: integer (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)



In [480]:
product_df.printSchema()

root
 |-- productid: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- unit_price: integer (nullable = true)



In [487]:
sales = sales_df.join(product_df,sales_df.product_id == product_df.productid,'inner')\
            .select('product_id','product_name','sale_date')

In [550]:
date_range = sales.filter( (col('sale_date') >= lit('2019-04-01')) | (col('sale_date') <= lit('2018-12-31') ) )\
    .select('product_id')
out = [ date_range.collect()[i][0] for i in range(0,len(date_range.collect()))]

In [553]:
sales.filter(~col('product_id').isin(out)).select('product_id','product_name').show()

+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|          S8|
+----------+------------+



In [562]:
sales_df.createOrReplaceTempView('salesdf')
product_df.createOrReplaceTempView('productdf')

spark.sql('with sales as (select product_id,product_name,sale_date from salesdf \
                                join productdf on salesdf.product_id == productdf.productid),\
           date_range as (select product_id from sales where sale_date >= "2019-04-01" or sale_date <= "2018-12-31")\
                          select product_id,product_name from sales where product_id not in (select * from date_range)\
                              ').show()

+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|          S8|
+----------+------------+



#### 30 66 Reported Posts E

In [73]:
action_schema = StructType([
                     StructField('user_id',IntegerType()),
                     StructField('post_id',IntegerType()),
                     StructField('action_date',DateType()),
                     StructField('action',StringType()),
                     StructField('extra',StringType()),
                    ])

In [74]:
action_df = (spark.read
                 .option('header',True)
                 .schema(action_schema)
                 .format('csv')
                 .load('../../data/database/30_action.csv')
                 .withColumn('action_date',to_date('action_date','MM-dd-yyyy')))

In [17]:
action_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- post_id: integer (nullable = true)
 |-- action_date: date (nullable = true)
 |-- action: string (nullable = true)
 |-- extra: string (nullable = true)



In [18]:
action_df.filter( (col('action_date') == lit('2019-07-04')) & (col('action') == lit('report')))\
        .groupBy('extra').agg(count(col('extra')).alias('report_count')).show()

+------+------------+
| extra|report_count|
+------+------------+
|  spam|           2|
|racism|           2|
+------+------------+



In [25]:
action_df.createOrReplaceTempView('actiondf')

spark.sql('select extra,count(extra) as report_couunt from actiondf \
                where action_date == "2019-07-04" and action == "report" \
                group by extra').show()

+------+-------------+
| extra|report_couunt|
+------+-------------+
|  spam|            2|
|racism|            2|
+------+-------------+



#### 31 70 User Activity for the Past 30 Days I E

In [188]:
activity_schema = StructType([
                     StructField('user_id',IntegerType()),
                     StructField('session_id',IntegerType()),
                     StructField('activity_date',DateType()),
                     StructField('activity_type',StringType()),
                    ])

In [189]:
activity_df = (spark.read
                 .option('header',True)
                 .schema(action_schema)
                 .format('csv')
                 .load('../../data/database/31_activity.csv')
                 .withColumn('action_date',to_date('action_date','MM-dd-yyyy')))

In [190]:
activity_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- post_id: integer (nullable = true)
 |-- action_date: date (nullable = true)
 |-- action: string (nullable = true)
 |-- extra: string (nullable = true)



In [None]:
activity_df.filter( (col('action_date') <= lit('2019-07-27')) & \
                       (col('action_date') >= date_add(lit('2019-07-27'), -30) ))\
                        .groupby('action_date').agg(countDistinct(col('user_id')).alias('active_users')).show()

In [None]:
activity_df.createOrReplaceTempView('activitydf')

spark.sql('select action_date, count(distinct user_id) active_users from activitydf \
                where action_date between date_add("2019-07-27",-30) and "2019-07-27" \
                 group by action_date').show()

#### 32 71 User Activity for the Past 30 Days II E

In [77]:
activity32_schema = StructType([
                     StructField('user_id',IntegerType()),
                     StructField('session_id',IntegerType()),
                     StructField('activity_date',DateType()),
                     StructField('activity_type',StringType()),
                    ])

In [78]:
activity32_df = (spark.read
                 .option('header',True)
                 .schema(activity32_schema)
                 .format('csv')
                 .load('../../data/database/32_activity.csv')
                 .withColumn('activity_date',to_date('activity_date','MM-dd-yyyy')))

In [101]:
activity32_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- session_id: integer (nullable = true)
 |-- activity_date: date (nullable = true)
 |-- activity_type: string (nullable = true)



In [130]:
activity32_df.filter( (col('activity_date') <= lit('2019-07-27')) & \
                       (col('activity_date') >= date_add(lit('2019-07-27'), -30) )) \
                      .filter(col('activity_type') == lit('open_session'))\
                      .groupBy('user_id').agg(count(col('user_id')).alias('cnt'))\
                      .select(round(sum(col('cnt'))/count('user_id'),2).alias('average_sessions_per_user')).show()
                        

+-------------------------+
|average_sessions_per_user|
+-------------------------+
|                     1.33|
+-------------------------+



In [136]:
activity32_df.createOrReplaceTempView('activity32df')

spark.sql('with one as (select user_id,count(user_id) as cnt from activity32df where activity_date between \
                        date_add("2019-07-27",-30) and "2019-07-27" and activity_type= "open_session" group by user_id) \
                    select round(sum(cnt)/count(user_id),2) as average_sessions_per_user from one \
            ').show()

+-------------------------+
|average_sessions_per_user|
+-------------------------+
|                     1.33|
+-------------------------+



#### 33 72  Article Views I E 

In [79]:
views_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/33_views.csv'))

In [6]:
views_df.printSchema()

root
 |-- article_id: integer (nullable = true)
 |-- author_id: integer (nullable = true)
 |-- viewer_id: integer (nullable = true)
 |-- view_date: string (nullable = true)



In [12]:
views_df.filter(col('author_id') == col('viewer_id') ).select(col('author_id').alias('id')).distinct().show()

+---+
| id|
+---+
|  4|
|  7|
+---+



In [40]:
views_df.createOrReplaceTempView('viewsdf')

spark.sql('''
           select distinct author_id from viewsdf where author_id == viewer_id
         ''').show()

+---------+
|author_id|
+---------+
|        4|
|        7|
+---------+



In [80]:
'''
Python
x = open('../../python/15_Maximumarray.txt','r')
y=x.readlines()
x.close()
z =[]
for i in y[0].split(' ')[0:]:
'''

x = open('../../data/database/33_views.csv','r')
y = x.readlines()
x.close()

lst = []
for i in y[1:]:
    if i.split(',')[1] == i.split(',')[2]:
        lst.append(i.split(',')[1])
print(set(lst))

{'4', '7'}


#### 34 73 Article Views II M

In [81]:
views_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/34_views.csv'))

In [49]:
views_df.printSchema()

root
 |-- article_id: integer (nullable = true)
 |-- author_id: integer (nullable = true)
 |-- viewer_id: integer (nullable = true)
 |-- view_date: string (nullable = true)



In [60]:
window_spec = Window.partitionBy('view_date','viewer_id').orderBy('article_id')
views_df.withColumn('rnk',rank().over(window_spec)).filter(col('rnk') == 2).select(col('viewer_id').alias('id')).show()

+---+
| id|
+---+
|  5|
|  6|
+---+



In [64]:
views_df.createOrReplaceTempView('viewsdf')

spark.sql('''
           select id from 
           (select viewer_id as id, rank() OVER(PARTITION BY view_date,viewer_id ORDER BY article_id) as rnk 
                from viewsdf) where rnk = 2
         ''').show()

+---+
| id|
+---+
|  5|
|  6|
+---+



#### 35 74 Market Analysis I M 

In [82]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/35_users.csv'))

In [83]:
orders_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/35_orders.csv'))

In [84]:
items_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/35_items.csv'))

In [69]:
users_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- join_date: string (nullable = true)
 |-- favorite_brand: string (nullable = true)



In [70]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- buyer_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)



In [71]:
items_df.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- item_brand: string (nullable = true)



In [89]:
orders_df.filter( year(to_date('order_date','yyyy-MM-dd')) == 2019)\
.groupBy('buyer_id').agg(count(col('buyer_id')).alias('orders_in_2019'))\
.join(users_df,orders_df.buyer_id == users_df.user_id,'right')\
.select('user_id','join_date',coalesce(col('orders_in_2019'),lit('0')).alias('orders_in_2019') ).show()


+-------+----------+--------------+
|user_id| join_date|orders_in_2019|
+-------+----------+--------------+
|      1|2018-01-01|             1|
|      2|2018-02-09|             2|
|      3|2018-01-19|             0|
|      4|2018-05-21|             0|
+-------+----------+--------------+



In [104]:
users_df.createOrReplaceTempView('usersdf')
orders_df.createOrReplaceTempView('ordersdf')

spark.sql('''
            select user_id,join_date,coalesce(orders_in_2019,0) as orders_in_2019 from usersdf left join
            (select buyer_id,count(buyer_id) as orders_in_2019 from ordersdf 
            where year(to_date(order_date,"yyyy-MM-dd")) == 2019 group by buyer_id) on user_id == buyer_id
         ''').show()

+-------+----------+--------------+
|user_id| join_date|orders_in_2019|
+-------+----------+--------------+
|      1|2018-01-01|             1|
|      2|2018-02-09|             2|
|      3|2018-01-19|             0|
|      4|2018-05-21|             0|
+-------+----------+--------------+



#### 36 77 mmediate Food Delivery I E 

In [85]:
delivery_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/36_delivery.csv'))

In [106]:
delivery_df.printSchema()

root
 |-- delivery_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_pref_delivery_date: string (nullable = true)



In [125]:
delivery_df.filter(col('order_date') == col('customer_pref_delivery_date'))\
.agg( (round(count(col('delivery_id'))/delivery_df.agg(count(col('delivery_id')))\
.collect()[0][0],4) *100).alias('immediate_percentage') ).show()

+--------------------+
|immediate_percentage|
+--------------------+
|               33.33|
+--------------------+



In [148]:
delivery_df.createOrReplaceTempView('deliverydf')

spark.sql('''
  with main as (select count(delivery_id) as equ_cnt from deliverydf 
                     where order_date == customer_pref_delivery_date),
        one as (select  count(delivery_id) as cnt from deliverydf)
                 select round(main.equ_cnt / one.cnt * 100,2) as immediate_percentage from main,one
         ''').show()

+--------------------+
|immediate_percentage|
+--------------------+
|               33.33|
+--------------------+



#### 37 79 Reformat Department Table E

In [86]:
department_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/37_department.csv'))

In [5]:
department_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- month: string (nullable = true)



In [81]:
pivot_df = department_df.groupBy("id").pivot("month").agg(max("revenue"))
pivot_df.withColumn('Apr',lit('null')).show()

+---+-----+----+----+----+
| id|  Feb| Jan| Mar| Apr|
+---+-----+----+----+----+
|  1| 7000|8000|6000|null|
|  3|10000|null|null|null|
|  2| null|9000|null|null|
+---+-----+----+----+----+



In [None]:
department_df.groupBy("id").pivot("month").agg(max("revenue"))

In [178]:
department_df.withColumn('jan',when( (col('month') == lit('Jan')),col('revenue') ) \
.when( (col('month') == lit('Feb')),col('revenue') ) \
.when( (col('month') == lit('Mar')),col('revenue') ) \
.when( (col('month') == lit('Apr')),col('revenue') ) 
.when( (col('month') == lit('Apr')),col('revenue') )                         )\
.select('id','month','revenue','jan').show()

+---+-----+-------+-----+
| id|month|revenue|  jan|
+---+-----+-------+-----+
|  1|  Jan|   8000| 8000|
|  2|  Jan|   9000| 9000|
|  3|  Feb|  10000|10000|
|  1|  Feb|   7000| 7000|
|  1|  Mar|   6000| 6000|
+---+-----+-------+-----+



In [145]:
jan_df = department_df.withColumn('Jan_rev',when( (col('month') == lit('Jan')),col('revenue') )) \
.groupBy(col('id')).agg(max(col('Jan_rev')).alias('Jan_rev'))\
.select('id','Jan_rev',lit('').alias('Feb_rev'),lit('').alias('Mar_rev'),lit('').alias('Apr_rev')
       ,lit('').alias('May_rev'),lit('').alias('Jun_rev'),lit('').alias('Jul_rev'),lit('').alias('Aug_rev')
       ,lit('').alias('Sep_rev'),lit('').alias('Oct_rev'),lit('').alias('Nov_rev'),lit('').alias('Dec_rev')   
       )

In [146]:
feb_df = department_df.withColumn('Feb_rev',when( (col('month') == lit('Feb')),col('revenue') )) \
.groupBy(col('id')).agg(max(col('Feb_rev')).alias('Feb_rev'))\
.select('id',lit('').alias('Jan_rev'),'Feb_rev',lit('').alias('Mar_rev'),lit('').alias('Apr_rev') 
       ,lit('').alias('May_rev'),lit('').alias('Jun_rev'),lit('').alias('Jul_rev'),lit('').alias('Aug_rev') 
       ,lit('').alias('Sep_rev'),lit('').alias('Oct_rev'),lit('').alias('Nov_rev'),lit('').alias('Dec_rev')  
       )

In [147]:
mar_df = department_df.withColumn('Mar_rev',when( (col('month') == lit('Mar')),col('revenue') )) \
.groupBy(col('id')).agg(max(col('Mar_rev')).alias('Mar_rev'))\
.select('id',lit('').alias('Jan_rev'),lit('').alias('Feb_rev'),'Mar_rev',lit('').alias('Apr_rev')
       ,lit('').alias('May_rev'),lit('').alias('Jun_rev'),lit('').alias('Jul_rev'),lit('').alias('Aug_rev')
       ,lit('').alias('Sep_rev'),lit('').alias('Oct_rev'),lit('').alias('Nov_rev'),lit('').alias('Dec_rev')    
       )

In [152]:
apr_df = department_df.withColumn('Apr_rev',when( (col('month') == lit('Apr')),col('revenue') )) \
.groupBy(col('id')).agg(max(col('Apr_rev')).alias('Apr_rev'))\
.select('id',lit('').alias('Jan_rev'),lit('').alias('Feb_rev'),lit('').alias('Mar_rev'),'Apr_rev'
       ,lit('').alias('May_rev'),lit('').alias('Jun_rev'),lit('').alias('Jul_rev'),lit('').alias('Aug_rev')
       ,lit('').alias('Sep_rev'),lit('').alias('Oct_rev'),lit('').alias('Nov_rev'),lit('').alias('Dec_rev') 
       )

In [153]:
jan_df\
.union(feb_df)\
.union(mar_df)\
.union(apr_df)\
.groupBy('id').agg(sum(col('Jan_rev')).cast('int').alias('Jan_rev'), \
                   sum(col('Feb_rev')).cast('int').alias('Feb_rev'), \
                   sum(col('Mar_rev')).cast('int').alias('Mar_rev'), \
                   sum(col('Apr_rev')).cast('int').alias('Apr_rev'), \
                   sum(col('May_rev')).cast('int').alias('May_rev'), \
                   sum(col('Jun_rev')).cast('int').alias('Jun_rev'), \
                   sum(col('Jul_rev')).cast('int').alias('Jul_rev'), \
                   sum(col('Aug_rev')).cast('int').alias('Aug_rev'), \
                   sum(col('Sep_rev')).cast('int').alias('Sep_rev'), \
                   sum(col('Oct_rev')).cast('int').alias('Oct_rev'), \
                   sum(col('Nov_rev')).cast('int').alias('Nov_rev'), \
                   sum(col('Dec_rev')).cast('int').alias('Dec_rev'), \
                  ).show()



                                                                                

+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| id|Jan_rev|Feb_rev|Mar_rev|Apr_rev|May_rev|Jun_rev|Jul_rev|Aug_rev|Sep_rev|Oct_rev|Nov_rev|Dec_rev|
+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  1|   8000|   7000|   6000|   null|   null|   null|   null|   null|   null|   null|   null|   null|
|  3|   null|  10000|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|
|  2|   9000|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|
+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+



In [162]:
department_df.withColumn('Jan_rev',when( (col('month') == lit('Jan')),col('revenue') ))\
.withColumn('Feb_rev',when( (col('month') == lit('Feb')),col('revenue') ))\
.withColumn('Mar_rev',when( (col('month') == lit('Mar')),col('revenue') ))\
.withColumn('Apr_rev',when( (col('month') == lit('Apr')),col('revenue') ))\
.select('id','Jan_rev','Feb_rev','Mar_rev','Apr_rev',lit('').alias('May_rev'),lit('').alias('Jun_rev'),lit('').alias('Jul_rev'),lit('').alias('Aug_rev')
       ,lit('').alias('Sep_rev'),lit('').alias('Oct_rev'),lit('').alias('Nov_rev'),lit('').alias('Dec_rev') 
       ).groupBy('id').agg(sum(col('Jan_rev')).cast('int').alias('Jan_rev'), \
                   sum(col('Feb_rev')).cast('int').alias('Feb_rev'), \
                   sum(col('Mar_rev')).cast('int').alias('Mar_rev'), \
                   sum(col('Apr_rev')).cast('int').alias('Apr_rev'), \
                   sum(col('May_rev')).cast('int').alias('May_rev'), \
                   sum(col('Jun_rev')).cast('int').alias('Jun_rev'), \
                   sum(col('Jul_rev')).cast('int').alias('Jul_rev'), \
                   sum(col('Aug_rev')).cast('int').alias('Aug_rev'), \
                   sum(col('Sep_rev')).cast('int').alias('Sep_rev'), \
                   sum(col('Oct_rev')).cast('int').alias('Oct_rev'), \
                   sum(col('Nov_rev')).cast('int').alias('Nov_rev'), \
                   sum(col('Dec_rev')).cast('int').alias('Dec_rev'), \
                  ).show()

+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| id|Jan_rev|Feb_rev|Mar_rev|Apr_rev|May_rev|Jun_rev|Jul_rev|Aug_rev|Sep_rev|Oct_rev|Nov_rev|Dec_rev|
+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  1|   8000|   7000|   6000|   null|   null|   null|   null|   null|   null|   null|   null|   null|
|  3|   null|  10000|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|
|  2|   9000|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|
+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+



In [154]:
department_df.createOrReplaceTempView('departmentdf')

spark.sql('''
          select id, Min(case when month == "Jan" then revenue end) Jan_rev,
                     Min(case when month == "Feb" then revenue end) Feb_rev,
                     Min(case when month == "Mar" then revenue end) Mar_rev,
                     Min(case when month == "Apr" then revenue end) Apr_rev, 
                     Min(case when month == "May" then revenue end) May_rev,
                     Min(case when month == "Jun" then revenue end) Jun_rev,
                     Min(case when month == "Jul" then revenue end) Jul_rev,
                     Min(case when month == "Aug" then revenue end) Aug_rev,
                     Min(case when month == "Sep" then revenue end) Sep_rev,
                     Min(case when month == "Oct" then revenue end) Oct_rev,
                     Min(case when month == "Nov" then revenue end) Nov_rev,
                     Min(case when month == "Dec" then revenue end) Dec_rev
                     from departmentdf group by id order by id
          ''').show()

+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| id|Jan_rev|Feb_rev|Mar_rev|Apr_rev|May_rev|Jun_rev|Jul_rev|Aug_rev|Sep_rev|Oct_rev|Nov_rev|Dec_rev|
+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  1|   8000|   7000|   6000|   null|   null|   null|   null|   null|   null|   null|   null|   null|
|  2|   9000|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|
|  3|   null|  10000|   null|   null|   null|   null|   null|   null|   null|   null|   null|   null|
+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+



#### 38 84 Queries Quality and Percentage E

In [87]:
queries_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/38_queries.csv'))

In [4]:
queries_df.printSchema()

root
 |-- query_name: string (nullable = true)
 |-- result: string (nullable = true)
 |-- position: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [40]:
queries_df.withColumn('poor_query_percentage',round(count(when(col('rating') < 3,col('rating') )).over(window_spec) 
                     / count('rating').over(window_spec),2 ))\
          .withColumn('quality', round(sum(col('rating')/col('position')).over(window_spec)
                      / count(col('query_name')).over(window_spec),2) ) \
.select('query_name','quality','poor_query_percentage').distinct().show()

+----------+-------+---------------------+
|query_name|quality|poor_query_percentage|
+----------+-------+---------------------+
|       Cat|   0.66|                 0.33|
|       Dog|    2.5|                 0.33|
+----------+-------+---------------------+



In [53]:
queries_df.createOrReplaceTempView('queriesdf')

spark.sql('''
          select query_name, round(sum(rating/position)/count(rating),2) as quality,
         round((count(case when rating < 3 then rating  end ) / count(rating)),2) as poor_query_percentage
          from queriesdf group by query_name
         ''').show()

+----------+-------+---------------------+
|query_name|quality|poor_query_percentage|
+----------+-------+---------------------+
|       Cat|   0.66|                 0.33|
|       Dog|    2.5|                 0.33|
+----------+-------+---------------------+



#### 39 87 Number of Comments per Post E

In [89]:
submissions_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/39_submissions.csv'))

In [55]:
submissions_df.printSchema()

root
 |-- sub_id: integer (nullable = true)
 |-- parent_id: integer (nullable = true)



In [88]:
x = submissions_df.withColumn('ids',when(col('parent_id').isNull(),col('sub_id'))).select('ids')
x.join(submissions_df,col('ids') == col('parent_id'),'left').distinct()\
.groupBy(col('ids')).agg(count(col('parent_id')).alias('number_of_comments')).filter(~col('ids').isNull())\
.select('ids','number_of_comments').orderBy('ids').show()

+---+------------------+
|ids|number_of_comments|
+---+------------------+
|  1|                 3|
|  2|                 2|
| 12|                 0|
+---+------------------+



In [119]:
submissions_df.createOrReplaceTempView('submissionsdf')

spark.sql('''
  with main as (select sub_id  as id from submissionsdf where parent_id is null)
               select id,count(parent_id) as number_of_comments from submissionsdf 
                     right join main on id == parent_id group by id order by id
         ''').show()

+---+------------------+
| id|number_of_comments|
+---+------------------+
|  1|                 8|
|  2|                 2|
| 12|                 0|
+---+------------------+



#### 40 88 Average Selling Price E

In [90]:
prices_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/40_prices.csv'))

In [91]:
unitsold_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/40_unitsold.csv'))

In [122]:
prices_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- price: integer (nullable = true)



In [123]:
unitsold_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- purchase_date: string (nullable = true)
 |-- units: integer (nullable = true)



In [158]:
prices_df.withColumnRenamed('product_id','prdid').join(unitsold_df,(col('prdid')== unitsold_df.product_id) & 
               (unitsold_df.purchase_date.between(prices_df.start_date,prices_df.end_date)),'inner')\
.select('prdid', (col('price') * col('units') ).alias('salsprice'),'units' ).groupBy('prdid')\
.agg(round(sum(col('salsprice'))/sum('units'),2 ).alias('average_price')).show()

+-----+-------------+
|prdid|average_price|
+-----+-------------+
|    1|         6.96|
|    2|        16.96|
+-----+-------------+



In [178]:
prices_df.createOrReplaceTempView('pricesdf')
unitsold_df.createOrReplaceTempView('unitsolddf')

spark.sql('''
 select pricesdf.product_id as product_id, round(sum(price * units)/sum(units),2) as average_price from pricesdf  
        join unitsolddf on unitsolddf.product_id == pricesdf.product_id 
        and purchase_date between start_date and end_date
        group by pricesdf.product_id
          ''').show()

+----------+-------------+
|product_id|average_price|
+----------+-------------+
|         1|         6.96|
|         2|        16.96|
+----------+-------------+



#### 41 91 Students and Examinations E

In [92]:
students_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/41_students.csv'))

In [93]:
subjects_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/41_subjects.csv'))

In [94]:
examinations_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/41_examinations.csv')
                 .withColumnRenamed('student_id','studentid')
                 .withColumnRenamed('subject_name','subjectname') )

In [234]:
students_df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- student_name: string (nullable = true)



In [235]:
subjects_df.printSchema()

root
 |-- subject_name: string (nullable = true)



In [236]:
examinations_df.printSchema()

root
 |-- studentid: integer (nullable = true)
 |-- subjectname: string (nullable = true)



In [249]:
students_df.join(subjects_df)\
           .join(examinations_df,(col('student_id') == col('studentid')) &
            (col('subject_name') == col('subjectname')) ,'left')\
            .groupBy('student_id','student_name','subject_name')\
            .agg(count(col('studentid')).alias('attended_exams')).orderBy('student_id').show()


+----------+------------+------------+--------------+
|student_id|student_name|subject_name|attended_exams|
+----------+------------+------------+--------------+
|         1|       Alice|        Math|             3|
|         1|       Alice| Programming|             1|
|         1|       Alice|     Physics|             2|
|         2|         Bob|     Physics|             0|
|         2|         Bob| Programming|             1|
|         2|         Bob|        Math|             1|
|         6|        Alex| Programming|             0|
|         6|        Alex|     Physics|             0|
|         6|        Alex|        Math|             0|
|        13|        John|        Math|             1|
|        13|        John| Programming|             1|
|        13|        John|     Physics|             1|
+----------+------------+------------+--------------+



In [256]:
students_df.createOrReplaceTempView('studentsdf')
subjects_df.createOrReplaceTempView('subjects_df')
examinations_df.createOrReplaceTempView('examinations_df')

spark.sql('''
          select student_id,student_name,subject_name,count(subjectname) as attended_exams from studentsdf 
          left join subjects_df 
          left join examinations_df on student_id = studentid and subject_name = subjectname
          group by student_id,student_name,subject_name order by student_id
        ''').show()

+----------+------------+------------+--------------+
|student_id|student_name|subject_name|attended_exams|
+----------+------------+------------+--------------+
|         1|       Alice| Programming|             1|
|         1|       Alice|     Physics|             2|
|         1|       Alice|        Math|             3|
|         2|         Bob|     Physics|             0|
|         2|         Bob| Programming|             1|
|         2|         Bob|        Math|             1|
|         6|        Alex|        Math|             0|
|         6|        Alex| Programming|             0|
|         6|        Alex|     Physics|             0|
|        13|        John|        Math|             1|
|        13|        John| Programming|             1|
|        13|        John|     Physics|             1|
+----------+------------+------------+--------------+



#### 42 93 Weather Type in Each Country E

In [95]:
countries_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/42_countries.csv'))

In [96]:
weather_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/42_weather.csv'))

In [295]:
countries_df.printSchema()

root
 |-- country_id: integer (nullable = true)
 |-- country_name: string (nullable = true)



In [296]:
weather_df.printSchema()

root
 |-- country_id: integer (nullable = true)
 |-- weather_state: integer (nullable = true)
 |-- day: string (nullable = true)



In [272]:
window_spec = Window.partitionBy('country_id')
weather_df.filter( (year(col('day')) == lit('2019')) & (month(col('day')) == lit('11'))  )\
.withColumn('weather_avg',round(sum('weather_state').over(window_spec)/count('country_id').over(window_spec),2) )\
.withColumn('weather_type',when( col('weather_avg') <= 15, lit('cold'))
                          .when( col('weather_avg') >= 25, lit('Hot')).otherwise('warm'))\
.join(countries_df, on = 'country_id')\
.select('country_name','weather_type').distinct().orderBy('weather_type').show()




+------------+------------+
|country_name|weather_type|
+------------+------------+
|        Peru|         Hot|
|     Morocco|         Hot|
|   Australia|        cold|
|         USA|        cold|
|       China|        warm|
+------------+------------+



                                                                                

In [310]:
countries_df.createOrReplaceTempView('countriesdf')
weather_df.createOrReplaceTempView('weatherdf')

spark.sql('''
  with main as (select country_id, round(sum(weather_state)/count(country_id),2) as weather_avg 
                     from weatherdf where year(day) = 2019 and month(day) = 11 group by country_id)
               select country_name, case when weather_avg <= 15 then "Cold"
                                          when weather_avg >= 25 then "Hot"
                                          else "Warm" end as weather_type
                      from main join countriesdf on main.country_id == countriesdf.country_id order by weather_type
         ''').show()

+------------+------------+
|country_name|weather_type|
+------------+------------+
|   Australia|        Cold|
|         USA|        Cold|
|     Morocco|         Hot|
|        Peru|         Hot|
|       China|        Warm|
+------------+------------+



#### 43 94 Find the Team Size E

In [97]:
employee_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/43_employee.csv'))

In [312]:
employee_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- team_id: integer (nullable = true)



In [314]:
employee_df.show()

+-----------+-------+
|employee_id|team_id|
+-----------+-------+
|          1|      8|
|          2|      8|
|          3|      8|
|          4|      7|
|          5|      9|
|          6|      9|
+-----------+-------+



In [317]:
window_spec = Window.partitionBy('team_id')
employee_df.withColumn('team_size',count('team_id').over(window_spec))\
.select('employee_id','team_size').orderBy('employee_id').show()

+-----------+---------+
|employee_id|team_size|
+-----------+---------+
|          1|        3|
|          2|        3|
|          3|        3|
|          4|        1|
|          5|        2|
|          6|        2|
+-----------+---------+



In [323]:
employee_df.createOrReplaceTempView('employeedf')

spark.sql('''
        select employee_id,count(team_id) over(partition by team_id) as  team_size from employeedf order by employee_id
        ''').show()

+-----------+---------+
|employee_id|team_size|
+-----------+---------+
|          1|        3|
|          2|        3|
|          3|        3|
|          4|        1|
|          5|        2|
|          6|        2|
+-----------+---------+



#### 44 97 Ads Performance E

In [98]:
ads_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/44_ads.csv'))

In [325]:
ads_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- action: string (nullable = true)



In [None]:
for ad_id = 1, ctr = (2/(2+1)) * 100 = 66.67
for ad_id = 2, ctr = (1/(1+2)) * 100 = 33.33

In [373]:
window_spec = Window.partitionBy('ad_id')
windowspec = Window.partitionBy('ad_id','action')

ads_df.filter(col('action') != lit('Ignored'))\
.withColumn('ctr',(when(col('action') == lit('Clicked'),count('action').over(windowspec) )
                 / when(col('action') != lit('Ignored'),count('action').over(window_spec)))) \
.filter(~col('ctr').isNull()).select('ad_id',(round(col('ctr')*100,2) ).alias('ctr') ).distinct().show()

+-----+-----+
|ad_id|  ctr|
+-----+-----+
|    1|66.67|
|    3| 50.0|
|    2|33.33|
+-----+-----+



In [434]:
ads_df.createOrReplaceTempView('adsdf')

spark.sql('''
 with main as(select distinct ad_id,action ,case when action == "Clicked" then count(action) over(partition by ad_id,action) end as up,
                 case when action != 'Ignored' then count(action) over(partition by ad_id) end as div from adsdf)
    select ad_id,round((up/div)*100,2) as ctr from main where up is not null
          ''').show()

+-----+-----+
|ad_id|  ctr|
+-----+-----+
|    1| 50.0|
|    3| 50.0|
|    2|33.33|
+-----+-----+



#### 45 98 List the Products Ordered in a Period E

In [99]:
products_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/45_products.csv'))

In [100]:
orders_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/45_orders.csv'))

In [447]:
products_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)



In [448]:
orders_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- unit: integer (nullable = true)



In [457]:
orders_df.filter( (year(col('order_date')) == lit('2020')) & (month(col('order_date')) == lit('02') )  ) \
.groupBy('product_id').agg(sum(col('unit')).alias('unit')).filter(col('unit') >= 100)\
.join(products_df, on = 'product_id').select('product_name','unit').show()

+------------------+----+
|      product_name|unit|
+------------------+----+
|Leetcode Solutions| 130|
|      Leetcode Kit| 100|
+------------------+----+



In [470]:
products_df.createOrReplaceTempView('productsdf')
orders_df.createOrReplaceTempView('ordersdf')

spark.sql('''
 with main as (select product_id,sum(unit) as unit from ordersdf where 
                      year(order_date) == "2020" and month(order_date) == "02" group by product_id)
               select product_name,unit from main join productsdf on productsdf.product_id == main.product_id
                      where unit >= 100
        ''').show()

+------------------+----+
|      product_name|unit|
+------------------+----+
|Leetcode Solutions| 130|
|      Leetcode Kit| 100|
+------------------+----+



#### 46 101 Students With Invalid Departments E

In [101]:
departments_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/46_departments.csv'))

In [102]:
students_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/46_students.csv'))

In [5]:
departments_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [6]:
students_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- deepartment_id: integer (nullable = true)



In [8]:
students_df.join(departments_df,students_df.deepartment_id == departments_df.id,'leftanti')\
.select('id','name').orderBy('id').show()

+---+-------+
| id|   name|
+---+-------+
|  2|   John|
|  3|  Steve|
|  4|Jasmine|
|  7| Daiana|
+---+-------+



In [14]:
departments_df.createOrReplaceTempView('departmentsdf')
students_df.createOrReplaceTempView('studentsdf')

spark.sql('''
             select studentsdf.id as id,studentsdf.name as name from studentsdf left anti join
                   departmentsdf on studentsdf.deepartment_id = departmentsdf.id order by id
         ''').show()

+---+-------+
| id|   name|
+---+-------+
|  2|   John|
|  3|  Steve|
|  4|Jasmine|
|  7| Daiana|
+---+-------+



#### 47 105 Replace Employee ID With The Unique Identifier E

In [103]:
employee_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/47_employee.csv'))

In [104]:
employees_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/47_employees.csv'))

In [17]:
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- unique_id: integer (nullable = true)



In [18]:
employees_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [20]:
employees_df.join(employee_df,employees_df.id == employee_df.id,'left' )\
.select('unique_id','name').show()

+---------+--------+
|unique_id|    name|
+---------+--------+
|     null|   Alice|
|     null|     Bob|
|        2|    Meir|
|        3| Winston|
|        1|Jonathan|
+---------+--------+



In [23]:
employee_df.createOrReplaceTempView('employeedf')
employees_df.createOrReplaceTempView('employeesdf')

spark.sql('''
          select unique_id,name from employeedf right outer join employeesdf on employeedf.id == employeesdf.id
         ''').show()

+---------+--------+
|unique_id|    name|
+---------+--------+
|     null|   Alice|
|     null|     Bob|
|        2|    Meir|
|        3| Winston|
|        1|Jonathan|
+---------+--------+



#### 48 109 Top Travellers E

In [105]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/48_users.csv'))

In [106]:
rides_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/48_rides.csv'))

In [27]:
users_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [28]:
rides_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- distance: integer (nullable = true)



In [35]:
users_df.join(rides_df,users_df.id == rides_df.user_id,'inner').select('name','distance').groupBy('name')\
.agg(sum(col('distance')).alias('distance')).orderBy(col('distance').desc(),col('name').asc()).show()

+--------+--------+
|    name|distance|
+--------+--------+
|   Elvis|     450|
|     Lee|     450|
|     Bob|     317|
|Jonathan|     312|
|    Alex|     222|
|   Alice|     120|
+--------+--------+



In [40]:
users_df.createOrReplaceTempView('usersdf')
rides_df.createOrReplaceTempView('ridesdf')

spark.sql('''
        select name,sum(distance) as distance from usersdf join ridesdf on usersdf.id ==  ridesdf.user_id
               group by name order by distance desc, name asc
         ''').show()

+--------+--------+
|    name|distance|
+--------+--------+
|   Elvis|     450|
|     Lee|     450|
|     Bob|     317|
|Jonathan|     312|
|    Alex|     222|
|   Alice|     120|
+--------+--------+



####  49 111 NPV Queries E

In [107]:
npv_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/49_npv.csv'))

In [108]:
queries_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/49_queries.csv'))

In [43]:
npv_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- npv: integer (nullable = true)



In [44]:
queries_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)



In [97]:
npv_df.join(queries_df, (npv_df.id == queries_df.id) & ( npv_df.year == queries_df.year),'right' )\
.select(queries_df.id ,queries_df.year,coalesce(col('npv'),lit('0')).alias('npv')).orderBy('id').show()

+---+----+---+
| id|year|npv|
+---+----+---+
|  1|2019|113|
|  2|2008|121|
|  3|2009| 12|
|  7|2019|  0|
|  7|2018|  0|
|  7|2020| 30|
| 13|2019| 40|
+---+----+---+



In [100]:
npv_df.createOrReplaceTempView('npvdf')
queries_df.createOrReplaceTempView('queriesdf')

spark.sql('''
          select queriesdf.id,queriesdf.year,coalesce(npv,0) as npv from queriesdf 
              left join npvdf on queriesdf.id == npvdf.id and queriesdf.year == npvdf.year
         ''').show()

+---+----+---+
| id|year|npv|
+---+----+---+
|  1|2019|113|
|  2|2008|121|
|  3|2009| 12|
|  7|2018|  0|
|  7|2019|  0|
|  7|2020| 30|
| 13|2019| 40|
+---+----+---+



#### 50 112 Create a Session Bar Chart E

In [109]:
session_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/50_session.csv'))

In [102]:
session_df.printSchema()

root
 |-- session_id: integer (nullable = true)
 |-- duration: integer (nullable = true)



In [150]:
missing_bin_df = spark.createDataFrame([Row(bin='10-15', total=0, row='3')])

window_spec = Window.partitionBy(col('row'))
session_df.withColumn('bin',(when(col('duration').between(0,300),lit('0-5'))
                            .when(col('duration').between(300,600),lit('5-10'))  
                            .when(col('duration').between(600,900),lit('10-15'))
                            .when(col('duration') > 900,lit('15 or above') )             
                            ).otherwise(0))\
.withColumn('row',(when(col('duration').between(0,300),lit('1'))
                            .when(col('duration').between(300,600),lit('2'))  
                            .when(col('duration').between(600,900),lit('3'))
                            .when(col('duration') > 900,lit('4') )             
                            ).otherwise(0))\
.withColumn('total',count('session_id').over(window_spec))\
.select('bin','total','row').distinct().union(missing_bin_df).select('bin','total').orderBy('row').show()




+-----------+-----+
|        bin|total|
+-----------+-----+
|        0-5|    3|
|       5-10|    1|
|      10-15|    0|
|15 or above|    1|
+-----------+-----+



In [136]:
missing_bin_df.show()

[Stage 267:>                                                        (0 + 1) / 1]

+----+-----+---+
| bin|total|row|
+----+-----+---+
|5-10|    0|  2|
+----+-----+---+



                                                                                

In [188]:
session_df.createOrReplaceTempView('sessiondf')

spark.sql('''
 with main as (select bin,count("session_id") as total from 
              (select case when duration between 0   and 300 then "0-5"
                      when duration between 301 and 600 then "5-10"
                      when duration between 601 and 900 then "10-15"
                      when duration >= 901 then "15 or above" end as bin,
                      session_id from sessiondf) group by bin),
       one as (select bin,total from main union(select "10-15" as bin, "0" as total)),
       two as (select bin,total,case when bin =  "0-5"   then "0"
                                     when bin =  "5-10"  then "1"
                                     when bin = "10-15"  then "2"
                                     when bin = "15 or above" then "3" 
                                     end as row from one)
               select bin,total from two order by row
         ''').show()

+-----------+-----+
|        bin|total|
+-----------+-----+
|        0-5|    3|
|       5-10|    1|
|      10-15|    0|
|15 or above|    1|
+-----------+-----+



In [155]:
spark.sql('''
          select case when duration not between 0   and 300 then "0-5"
                      when duration not between 301 and 600 then "5-10"
                      when duration not between 601 and 900 then "10-15"
                      else "0" end as bin,
                 case when duration not between 0   and 300 then "0"
                      when duration not between 301 and 600 then "1"
                      when duration not between 601 and 900 then "2"
                      else "0" end as row,  
                    session_id from sessiondf
         ''').show()

+----+---+----------+
| bin|row|session_id|
+----+---+----------+
|5-10|  1|         1|
|5-10|  1|         2|
|5-10|  1|         3|
| 0-5|  0|         4|
| 0-5|  0|         5|
+----+---+----------+



In [163]:
spark.sql('''
          
          select "10-15" as bin,"3" as row, "0" as session_id     
                    
         ''').show()

+-----+---+----------+
|  bin|row|session_id|
+-----+---+----------+
|10-15|  3|         0|
+-----+---+----------+



#### 51 119 Group Sold Products By The Date E

In [110]:
activities_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/51_activities.csv'))

In [190]:
activities_df.printSchema()

root
 |-- sell_date: string (nullable = true)
 |-- product: string (nullable = true)



In [191]:
activities_df.show()

+----------+----------+
| sell_date|   product|
+----------+----------+
|2020-05-30| Headphone|
|2020-06-01|    Pencil|
|2020-06-02|      Mask|
|2020-05-30|Basketball|
|2020-06-01|     Bible|
|2020-06-02|      Mask|
|2020-05-30|   T-Shirt|
+----------+----------+



In [237]:
window_spec = Window.partitionBy('sell_date')

activities_df.distinct().withColumn('num_sold',count('product').over(window_spec)) \
.withColumn('product',array_join(collect_list(col('product')).over(window_spec),','))\
.select('sell_date','num_sold','product').distinct().orderBy(col('num_sold').desc()).show(truncate=False)


+----------+--------+----------------------------+
|sell_date |num_sold|product                     |
+----------+--------+----------------------------+
|2020-05-30|3       |Headphone,Basketball,T-Shirt|
|2020-06-01|2       |Bible,Pencil                |
|2020-06-02|1       |Mask                        |
+----------+--------+----------------------------+





In [277]:
activities_df.createOrReplaceTempView('activitiesdf')

spark.sql('''
 with main as (select distinct sell_date,count(distinct product) as num_sold,product 
                    from activitiesdf group by sell_date,product)
               select distinct sell_date,sum(num_sold ) over(partition by sell_date) as num_sold,
                   array_join (collect_list(product) over(partition by sell_date),",") as products from main
        ''').show(truncate = False)

+----------+--------+----------------------------+
|sell_date |num_sold|products                    |
+----------+--------+----------------------------+
|2020-05-30|3       |Headphone,T-Shirt,Basketball|
|2020-06-01|2       |Pencil,Bible                |
|2020-06-02|1       |Mask                        |
+----------+--------+----------------------------+



#### 52 120 Friendly Movies Streamed Last Month E

In [111]:
tvprogram_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/52_tvprogram.csv'))

In [112]:
content_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/52_content.csv'))

In [299]:
tvprogram_df.printSchema()

root
 |-- program_date: string (nullable = true)
 |-- content_id: integer (nullable = true)
 |-- channel: string (nullable = true)



In [300]:
content_df.printSchema()

root
 |-- content_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- Kids_content: string (nullable = true)
 |-- content_type: string (nullable = true)



In [306]:
tvprogram_df.filter((year(col('program_date')) == lit('2020')) & (month(col('program_date')) == lit('06')))\
.join(content_df,(tvprogram_df.content_id == content_df.content_id) &(content_df.Kids_content == 'Y'),'inner' )\
.select('title').show()

+-------+
|  title|
+-------+
|Aladdin|
+-------+



In [295]:
tvprogram_df.createOrReplaceTempView('tvprogramdf')
content_df.createOrReplaceTempView('contentdf')

spark.sql('''
         select title from contentdf join 
         (select  content_id from tvprogramdf where year(program_date) == "2020" and month(program_date) == "06") as x
         on contentdf.content_id = x.content_id and contentdf.Kids_content == "Y"
         ''').show()

+-------+
|  title|
+-------+
|Aladdin|
+-------+



#### 53 122 Customer Order Frequency E

In [113]:
customers_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/53_customers.csv'))

In [114]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/53_product.csv'))

In [115]:
orders_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/53_orders.csv'))

In [311]:
customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- country: string (nullable = true)



In [312]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- price: integer (nullable = true)



In [313]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- quantity: integer (nullable = true)



In [344]:
window_spec = Window.partitionBy('customer_id','order_date')
window_rkspec = Window.partitionBy('customer_id').orderBy(col('gpprice').desc())

orders_df.filter( (year(col('order_date')) == lit('2020')) & (month(col('order_date')).isin([6,7])))\
.join(product_df,orders_df.product_id == product_df.product_id,'inner')\
.select('customer_id',substring('order_date',1,7).alias('order_date'),expr("quantity * price").alias('saleprice'))\
.withColumn('gpprice',sum(col('saleprice')).over(window_spec))\
.select('customer_id','gpprice').withColumn('rnk',rank().over(window_rkspec))\
.filter( (col('rnk') == lit('2')) & (col('gpprice') >= lit('100'))  )\
.join(customers_df, on = 'customer_id').select('customer_id','name').distinct().show()

+-----------+-------+
|customer_id|   name|
+-----------+-------+
|          1|Winston|
+-----------+-------+



In [370]:
customers_df.createOrReplaceTempView('customersdf')
product_df.createOrReplaceTempView('productdf')
orders_df.createOrReplaceTempView('ordersdf')

spark.sql('''
  with main as (select customer_id,substring(order_date,1,7) as order_date,quantity,product_id 
                    from ordersdf where year(order_date) == 2020 and month(order_date) in (6,7)),
        one as (select customer_id,order_date, sum(quantity * price) over(partition by customer_id,order_date) as salesprice 
                    from main join productdf on main.product_id == productdf.product_id),
        two as (select customer_id,salesprice ,rank() over(partition by customer_id order by salesprice desc) as x from one)
                select distinct two.customer_id,customersdf.name from two join
                    customersdf on two.customer_id == customersdf.customer_id and two.x = 2 and two.salesprice >= 100
         ''').show()

+-----------+-------+
|customer_id|   name|
+-----------+-------+
|          1|Winston|
+-----------+-------+



#### 54 123 Find Users With Valid E-Mails E

In [116]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/54_users.csv'))

In [372]:
users_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- mail: string (nullable = true)



In [406]:
users_df.filter(  (col('mail').rlike('@leetcode.com')) 
                & (col('mail').rlike('^[A-Za-z]'))  
                &(~col('mail').contains('#'))  ).show()

+-------+---------+--------------------+
|user_id|     name|                mail|
+-------+---------+--------------------+
|      1|  Winston|winston@leetcode.com|
|      3|Annabelle| bella-@leetcode.com|
|      4|    Sally|sally.come@leetco...|
+-------+---------+--------------------+



In [421]:
users_df.createOrReplaceTempView('usersdf')

spark.sql('''
           select user_id,name,mail from usersdf where 
                mail rlike("@leetcode.com") and mail rlike('^[A-Za-z]') and mail  not like '%#%'       
         ''').show()

+-------+---------+--------------------+
|user_id|     name|                mail|
+-------+---------+--------------------+
|      1|  Winston|winston@leetcode.com|
|      3|Annabelle| bella-@leetcode.com|
|      4|    Sally|sally.come@leetco...|
+-------+---------+--------------------+



#### 55 124 Patients With a Condition E

In [117]:
patients_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/55_patients.csv'))

In [423]:
patients_df.printSchema()

root
 |-- patient_id: integer (nullable = true)
 |-- patient_name: string (nullable = true)
 |-- conditions: string (nullable = true)



In [424]:
patients_df.filter(col('conditions').contains('DIAB1')).show()

+----------+------------+------------+
|patient_id|patient_name|  conditions|
+----------+------------+------------+
|         3|         Bob|DIAB100 MYOP|
|         4|      George|ACNE DIAB100|
+----------+------------+------------+



In [426]:
patients_df.createOrReplaceTempView('patientsdf')

spark.sql('''
             select * from patientsdf where conditions like '%DIAB1%'
        ''').show()

+----------+------------+------------+
|patient_id|patient_name|  conditions|
+----------+------------+------------+
|         3|         Bob|DIAB100 MYOP|
|         4|      George|ACNE DIAB100|
+----------+------------+------------+



#### 56 126  Fix Product Name Format E

In [118]:
sales_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/56_sales.csv'))

In [428]:
sales_df.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- sale_date: string (nullable = true)



In [444]:
window_spec = Window.partitionBy('product_name','sale_date')
sales_df.select(lower(trim('product_name')).alias('product_name'),substring('sale_date',1,7).alias('sale_date') )\
.withColumn('total',count('product_name').over(window_spec)).distinct()\
.orderBy(col('product_name').asc(),col('sale_date').asc()).show()

+------------+---------+-----+
|product_name|sale_date|total|
+------------+---------+-----+
|  lckeychain|  2000-02|    2|
|     lcphone|  2000-01|    2|
|     lcphone|  2000-02|    1|
|  matryoshka|  2000-03|    1|
+------------+---------+-----+



In [450]:
sales_df.createOrReplaceTempView('patientsdf')

spark.sql('''
            select product_name,date,count(product_name) as total from
            (select lower(trim(product_name)) as product_name,substring(sale_date,1,7) as date from patientsdf)
                 group by product_name,date order by product_name,date
        ''').show()

+------------+-------+-----+
|product_name|   date|total|
+------------+-------+-----+
|  lckeychain|2000-02|    2|
|     lcphone|2000-01|    2|
|     lcphone|2000-02|    1|
|  matryoshka|2000-03|    1|
+------------+-------+-----+



#### 57 129 Unique Orders and Customers Per Month E

In [119]:
orders_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/57_orders.csv'))

In [452]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- invoice: integer (nullable = true)



In [471]:
window_spec = Window.partitionBy('order_date')

orders_df.filter( col('invoice') > lit('20') ).select(substring('order_date',1,7).alias('order_date'),'customer_id','invoice')\
.withColumn('order_count',count(col('invoice')).over(window_spec))\
.select('order_date','order_count','customer_id').distinct()\
.withColumn('customer_count',count(col('customer_id')).over(window_spec))\
.select('order_date','order_count','customer_count').distinct().orderBy('order_date').show()

+----------+-----------+--------------+
|order_date|order_count|customer_count|
+----------+-----------+--------------+
|   2020-09|          2|             2|
|   2020-10|          1|             1|
|   2020-12|          2|             1|
|   2021-01|          1|             1|
+----------+-----------+--------------+



In [476]:
orders_df.createOrReplaceTempView('ordersdf')

spark.sql('''
          select order_date,count(invoice) as order_count,count(distinct customer_id) as customer_count from
          (select substring(order_date,1,7) as order_date,customer_id,invoice 
               from ordersdf where invoice > 20)
                     group by order_date order by order_date
        ''').show()

+----------+-----------+--------------+
|order_date|order_count|customer_count|
+----------+-----------+--------------+
|   2020-09|          2|             2|
|   2020-10|          1|             1|
|   2020-12|          2|             1|
|   2021-01|          1|             1|
+----------+-----------+--------------+



#### 58 130 Warehouse Manager E

In [120]:
warehouse_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/58_warehouse.csv'))

In [121]:
products_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/58_products.csv'))

In [479]:
warehouse_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- units: integer (nullable = true)



In [480]:
products_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- Width: integer (nullable = true)
 |-- Length: integer (nullable = true)
 |-- Height: integer (nullable = true)



In [486]:
products_df.select('product_id',expr("Width * Length * Height").alias('size'))\
.join(warehouse_df,products_df.product_id == warehouse_df.product_id,'inner')\
.select('name',expr("size * units").alias('volume'))\
.groupBy('name').agg(sum(col('volume')).alias('volume')).orderBy('name').show()


+--------+------+
|    name|volume|
+--------+------+
|LCHouse1| 12250|
|LCHouse2| 20250|
|LCHouse3|   800|
+--------+------+



In [500]:
warehouse_df.createOrReplaceTempView('warehousedf')
products_df.createOrReplaceTempView('productsdf')

spark.sql('''
with main as (select name, (Width * Length * Height) * (units) as volume
              from productsdf join warehousedf on productsdf.product_id = warehousedf.product_id)
              select name, sum(volume) from main group by name order by name
         ''').show()

+--------+-----------+
|    name|sum(volume)|
+--------+-----------+
|LCHouse1|      12250|
|LCHouse2|      20250|
|LCHouse3|        800|
+--------+-----------+



#### 59 131  Customer Who Visited but Did Not Make Any Transactions E

In [122]:
visits_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/59_visits.csv'))

In [123]:
transactions_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/59_transactions.csv'))

In [503]:
visits_df.printSchema()

root
 |-- visit_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)



In [504]:
transactions_df.printSchema()

root
 |-- transaction_id: integer (nullable = true)
 |-- visit_id: integer (nullable = true)
 |-- amount: integer (nullable = true)



In [506]:
visits_df.join(transactions_df,visits_df.visit_id == transactions_df.visit_id,'leftanti')\
.groupBy('customer_id').agg(count(col('visit_id')).alias('count_no_trans')).show()

+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         54|             2|
|         96|             1|
|         30|             1|
+-----------+--------------+



In [509]:
visits_df.createOrReplaceTempView('visitsdf')
transactions_df.createOrReplaceTempView('transactionsdf')

spark.sql('''
         select customer_id,count(visit_id) as count_no_trans 
              from visitsdf left anti join transactionsdf on visitsdf.visit_id = transactionsdf.visit_id
              group by customer_id
         ''').show()

+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         54|             2|
|         96|             1|
|         30|             1|
+-----------+--------------+



#### 60 132 Bank Account Summary II E

In [124]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/60_users.csv'))

In [125]:
transactions_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/60_transactions.csv'))

In [512]:
users_df.printSchema()

root
 |-- account: integer (nullable = true)
 |-- name: string (nullable = true)



In [513]:
transactions_df.printSchema()

root
 |-- trans_id: integer (nullable = true)
 |-- account: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- transacted_on: string (nullable = true)



In [517]:
transactions_df.groupBy(col('account')).agg(sum(col('amount')).alias('balance')).filter(col('balance') > lit('10000'))\
.join(users_df, on = 'account').select('name','balance').show()

+-----+-------+
| name|balance|
+-----+-------+
|Alice|  11000|
+-----+-------+



In [522]:
users_df.createOrReplaceTempView('usersdf')
transactions_df.createOrReplaceTempView('transactionsdf')

spark.sql('''
           select name,sum(amount) as balance from usersdf join transactionsdf on usersdf.account = transactionsdf.account
                group by name having balance > 10000
         ''').show()

+-----+-------+
| name|balance|
+-----+-------+
|Alice|  11000|
+-----+-------+



#### 61 134 Sellers With No Sales E

In [126]:
customer_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/61_customer.csv'))

In [127]:
orders_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/61_orders.csv'))

In [128]:
seller_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/61_seller.csv'))

In [7]:
customer_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)



In [8]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- sale_date: string (nullable = true)
 |-- order_cost: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)



In [9]:
seller_df.printSchema()

root
 |-- seller_id: integer (nullable = true)
 |-- seller_name: string (nullable = true)



In [24]:
orders_df.filter(year(col('sale_date')) == lit('2020'))\
.join(seller_df,seller_df.seller_id == orders_df.seller_id,'right')\
.filter(col('order_id').isNull()).select('seller_name').show()

+-----------+
|seller_name|
+-----------+
|      Frank|
+-----------+



In [49]:
orders_df.createOrReplaceTempView('ordersdf')
seller_df.createOrReplaceTempView('sellerdf')

spark.sql('''
          select seller_name from sellerdf left join 
          (select seller_id from ordersdf where year(sale_date) = 2020) as x on x.seller_id == sellerdf.seller_id
          where x.seller_id is null
        ''').show()

+-----------+
|seller_name|
+-----------+
|      Frank|
+-----------+



#### 62 136 All Valid Triplets That Can Represent a Country E

In [129]:
schoola_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/62_schoola.csv'))

In [130]:
schoolb_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/62_schoolb.csv'))

In [131]:
schoolc_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/62_schoolc.csv'))

In [56]:
schoola_df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- student_name: string (nullable = true)



In [57]:
schoolb_df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- student_name: string (nullable = true)



In [58]:
schoolc_df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- student_name: string (nullable = true)



In [99]:
schoola_df.join(schoolb_df).join(schoolc_df)\
.select((schoola_df.student_name).alias('member_A'),(schoolb_df.student_name).alias('member_B'),(schoolc_df.student_name).alias('member_C') ).distinct()\
.withColumn('sorted_row',size(array_sort(array_distinct(array('member_A','member_B','member_C')))))\
.filter(col('sorted_row') == 3).select('member_A','member_B','member_C').show()

+--------+--------+--------+
|member_A|member_B|member_C|
+--------+--------+--------+
|     Bob|     Tom|   Alice|
|   Alice|     Tom|   Jerry|
|     Bob|     Tom|   Jerry|
+--------+--------+--------+



In [116]:
schoola_df.createOrReplaceTempView('schooladf')
schoolb_df.createOrReplaceTempView('schoolbdf')
schoolc_df.createOrReplaceTempView('schoolcdf')

spark.sql('''
 with main as (select schooladf.student_name as member_A, schoolbdf.student_name as member_B, schoolcdf.student_name as member_C
                    from schooladf,schoolbdf,schoolcdf),
       one as (select member_A,member_B,member_C,size(array_distinct(array(member_A,member_B,member_c))) as arry_value from main)
              select  member_A,member_B,member_C from one where arry_value = 3       
         ''').show()

+--------+--------+--------+
|member_A|member_B|member_C|
+--------+--------+--------+
|   Alice|     Tom|   Jerry|
|     Bob|     Tom|   Jerry|
|     Bob|     Tom|   Alice|
+--------+--------+--------+



#### 63 137 Percentage of Users Attended a Contest E

In [132]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/63_users.csv'))

In [133]:
register_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/63_register.csv'))

In [119]:
users_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- user_name: string (nullable = true)



In [120]:
register_df.printSchema()

root
 |-- contest_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)



In [127]:
register_df.groupBy('contest_id').agg(round((count(col('user_id'))/3)*100,2).alias('percentage') )\
.orderBy(col('percentage').desc(),col('contest_id').asc()).show()

+----------+----------+
|contest_id|percentage|
+----------+----------+
|       208|     100.0|
|       209|     100.0|
|       210|     100.0|
|       215|     66.67|
|       207|     33.33|
+----------+----------+



In [131]:
users_df.createOrReplaceTempView('usersdf')
register_df.createOrReplaceTempView('registerdf')

spark.sql('''
     select contest_id, round((count(user_id)/3)*100,2) as percentage from registerdf group by contest_id
          order by percentage desc,contest_id asc
         ''').show()

+----------+----------+
|contest_id|percentage|
+----------+----------+
|       208|     100.0|
|       209|     100.0|
|       210|     100.0|
|       215|     66.67|
|       207|     33.33|
+----------+----------+



#### 64 138 Average Time of Process per Machine E

In [134]:
activity_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/64_actiivity.csv'))

In [134]:
activity_df.printSchema()

root
 |-- machine_id: integer (nullable = true)
 |-- process_id: integer (nullable = true)
 |-- activity_type: string (nullable = true)
 |-- timestamp: double (nullable = true)



In [153]:
start_df = activity_df.filter(col('activity_type') == lit('start'))
end_df = activity_df.filter(col('activity_type') == lit('end')).withColumnRenamed('timestamp','end_timestamp')

start_df.join(end_df,( (start_df.machine_id == end_df.machine_id) & (start_df.process_id == end_df.process_id)),'inner' )\
.select(start_df.machine_id,(end_df.end_timestamp - start_df.timestamp).alias('time_diff'))\
.groupBy('machine_id').agg((sum(col('time_diff'))/count(col('machine_id'))).alias('processing_time'))\
.orderBy('machine_id').show()

+----------+---------------+
|machine_id|processing_time|
+----------+---------------+
|         0|          0.894|
|         1|          0.995|
|         2|          1.456|
+----------+---------------+



In [171]:
activity_df.createOrReplaceTempView('activitydf')

spark.sql('''
             select start.machine_id, sum(end.timestamp - start.timestamp)/count(start.machine_id) as processing_time from 
             (select machine_id,process_id,timestamp from activitydf where activity_type == "end") as end join
             (select machine_id,process_id,timestamp from activitydf where activity_type == "start") as start
             on start.machine_id = end.machine_id and start.process_id = end.process_id group by start.machine_id
             order by machine_id
         ''').show()

+----------+---------------+
|machine_id|processing_time|
+----------+---------------+
|         0|          0.894|
|         1|          0.995|
|         2|          1.456|
+----------+---------------+



#### 65 139 Fix Names in a Table E

In [135]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/65_users.csv'))

In [173]:
users_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [180]:
users_df.select('user_id',initcap(lower('name')).alias('name')).show()

+-------+-----+
|user_id| name|
+-------+-----+
|      1|Alice|
|      2|  Bob|
+-------+-----+



In [182]:
users_df.createOrReplaceTempView('usersdf')

spark.sql('''
           select user_id,initcap(lower(name)) as name from usersdf
         ''').show()

+-------+-----+
|user_id| name|
+-------+-----+
|      1|Alice|
|      2|  Bob|
+-------+-----+



#### 66 140 Product's Worth Over Invoices E

In [136]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/66_product.csv'))

In [137]:
invoice_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/66_invoice.csv'))

In [185]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [186]:
invoice_df.printSchema()

root
 |-- invoice_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- rest: integer (nullable = true)
 |-- paid: integer (nullable = true)
 |-- canceled: integer (nullable = true)
 |-- refunded: integer (nullable = true)



In [193]:
invoice_df.groupBy('product_id').agg(sum(col('rest')).alias('rest'),sum(col('paid')).alias('paid'),
 sum(col('canceled')).alias('canceled'),sum(col('refunded')).alias('refunded')).join(product_df,on = 'product_id')\
.select('name','rest','paid','canceled','refunded').show()

+-----+----+----+--------+--------+
| name|rest|paid|canceled|refunded|
+-----+----+----+--------+--------+
|bacon|   3|   3|       3|       3|
|  ham|   2|   4|       5|       3|
+-----+----+----+--------+--------+



In [203]:
product_df.createOrReplaceTempView('productdf')
invoice_df.createOrReplaceTempView('invoicedf')

spark.sql('''
 with main as (select product_id,sum(rest) as rest,sum(paid) as pain,
                    sum(canceled) as canceled,sum(refunded) as refunded from invoicedf group by product_id)
               select name,rest,pain,canceled,refunded 
                    from main join productdf on productdf.product_id = main.product_id
          ''').show()

+-----+----+----+--------+--------+
| name|rest|pain|canceled|refunded|
+-----+----+----+--------+--------+
|bacon|   3|   3|       3|       3|
|  ham|   2|   4|       5|       3|
+-----+----+----+--------+--------+



#### 67 141 Invalid Tweets E

In [138]:
tweets_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/67_tweets.csv'))

In [207]:
tweets_df.printSchema()

root
 |-- tweet_id: integer (nullable = true)
 |-- content: string (nullable = true)



In [209]:
tweets_df.show(truncate = False)

+--------+--------------------------------+
|tweet_id|content                         |
+--------+--------------------------------+
|1       |Vote for Biden                  |
|2       |Let us make America great again!|
+--------+--------------------------------+



In [211]:
tweets_df.filter(length(col('content')) > 20).select('tweet_id').show()

+--------+
|tweet_id|
+--------+
|       2|
+--------+



In [213]:
tweets_df.createOrReplaceTempView('tweetsdf')

spark.sql('''
          select tweet_id from tweetsdf where length(content) > 20
         ''').show()

+--------+
|tweet_id|
+--------+
|       2|
+--------+



#### 68 142 Daily Leads and Partners E

In [139]:
dailysales_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/68_dailysales.csv'))

In [215]:
dailysales_df.printSchema()

root
 |-- date_id: string (nullable = true)
 |-- make_name: string (nullable = true)
 |-- lead_id: integer (nullable = true)
 |-- partner_id: integer (nullable = true)



In [226]:
dailysales_df.groupBy('date_id','make_name').agg(countDistinct(col('lead_id')).alias('unique_leads'),
                                                 countDistinct(col('partner_id')).alias('unique_partners')).show()



+---------+---------+------------+---------------+
|  date_id|make_name|unique_leads|unique_partners|
+---------+---------+------------+---------------+
|2020-12-8|   toyota|           2|              3|
|2020-12-7|    honda|           3|              2|
|2020-12-7|   toyota|           1|              2|
|2020-12-8|    honda|           2|              2|
+---------+---------+------------+---------------+



In [229]:
dailysales_df.createOrReplaceTempView('dailysalesdf')

spark.sql('''
             select date_id,make_name,count(distinct lead_id) as unique_leads, count(distinct partner_id ) as unique_partners
                  from dailysalesdf group by date_id,make_name
         ''').show()

+---------+---------+------------+---------------+
|  date_id|make_name|unique_leads|unique_partners|
+---------+---------+------------+---------------+
|2020-12-8|   toyota|           2|              3|
|2020-12-7|    honda|           3|              2|
|2020-12-7|   toyota|           1|              2|
|2020-12-8|    honda|           2|              2|
+---------+---------+------------+---------------+



#### 69 146 Find Followers Count E

In [140]:
followers_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/69_followers.csv'))

In [233]:
followers_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- follower_id: integer (nullable = true)



In [235]:
followers_df.groupBy('user_id').agg(count(col('follower_id')).alias('followers_count')).orderBy('user_id').show()

+-------+---------------+
|user_id|followers_count|
+-------+---------------+
|      0|              1|
|      1|              1|
|      2|              2|
+-------+---------------+



In [236]:
followers_df.createOrReplaceTempView('followersdf')

spark.sql('''
           select user_id,count(follower_id) as followers_count from followersdf group by user_id order by user_id
         ''').show()

+-------+---------------+
|user_id|followers_count|
+-------+---------------+
|      0|              1|
|      1|              1|
|      2|              2|
+-------+---------------+



#### 70 147  The Number of Employees Which Report to Each Employee E

In [141]:
employees_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/70_employees.csv'))

In [238]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- reports_to: integer (nullable = true)
 |-- age: integer (nullable = true)



In [264]:
logic_df = employees_df.filter(~col('reports_to').isNull() ).groupBy('reports_to')\
.agg(count(col('reports_to')).alias('reports_count'),
    ((sum(col('age'))/count(col('reports_to'))).cast(IntegerType())).alias('average_age'))\
.withColumnRenamed('reports_to','reportsto')

employees_df.join(logic_df,employees_df.employee_id == logic_df.reportsto,'inner')\
.select('employee_id','name','reports_count','average_age').show()

+-----------+-------+-------------+-----------+
|employee_id|   name|reports_count|average_age|
+-----------+-------+-------------+-----------+
|          1|Michael|            2|         40|
|          2|  Alice|            2|         37|
|          3|    Bob|            1|         37|
+-----------+-------+-------------+-----------+



In [275]:
employees_df.createOrReplaceTempView('employeesdf')

spark.sql('''
 with main as (select reports_to,count(reports_to) as reports_count,(sum(age)/count(reports_to))as average_age
                    from employeesdf where reports_to is not null group by reports_to)
               select employee_id,name,reports_count,cast(average_age as int)
                    from employeesdf join main on employeesdf.employee_id == main.reports_to
         ''').show()

+-----------+-------+-------------+-----------+
|employee_id|   name|reports_count|average_age|
+-----------+-------+-------------+-----------+
|          1|Michael|            2|         40|
|          2|  Alice|            2|         37|
|          3|    Bob|            1|         37|
+-----------+-------+-------------+-----------+



#### 71 148 Find Total Time Spent by Each Employee E

In [142]:
employees_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/71_employees.csv'))

In [279]:
employees_df.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- event_day: string (nullable = true)
 |-- in_time: integer (nullable = true)
 |-- out_time: integer (nullable = true)



In [283]:
employees_df.groupBy('event_day','emp_id').agg(sum(col('out_time') - col('in_time')).alias('total_time'))\
.orderBy('event_day','emp_id').show()

+----------+------+----------+
| event_day|emp_id|total_time|
+----------+------+----------+
|2020-11-28|     1|       173|
|2020-11-28|     2|        30|
|2020-12-03|     1|        41|
|2020-12-09|     2|        27|
+----------+------+----------+



In [285]:
employees_df.createOrReplaceTempView('employeesdf')

spark.sql('''
            select event_day,emp_id, sum(out_time - in_time) as total_time from employeesdf 
                 group by event_day,emp_id order by event_day,emp_id
         ''').show()

+----------+------+----------+
| event_day|emp_id|total_time|
+----------+------+----------+
|2020-11-28|     1|       173|
|2020-11-28|     2|        30|
|2020-12-03|     1|        41|
|2020-12-09|     2|        27|
+----------+------+----------+



#### 72 150 Recyclable and Low Fat Products E

In [143]:
products_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/72_products.csv'))

In [287]:
products_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- low_fats: string (nullable = true)
 |-- recyclable: string (nullable = true)



In [289]:
products_df.filter((col('low_fats') == lit('Y')) & (col('recyclable') == lit('Y'))).select('product_id').show()

+----------+
|product_id|
+----------+
|         1|
|         3|
+----------+



In [292]:
products_df.createOrReplaceTempView('productsdf')

spark.sql('''
          select product_id from productsdf where low_fats = "Y" and recyclable = "Y"
        ''').show()

+----------+
|product_id|
+----------+
|         1|
|         3|
+----------+



#### 73 151 Product's Price for Each Store E

In [144]:
products_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/73_products.csv'))

In [294]:
products_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- store: string (nullable = true)
 |-- price: integer (nullable = true)



In [303]:
products_df.show()

+----------+------+-----+
|product_id| store|price|
+----------+------+-----+
|         0|store1|   95|
|         0|store3|  105|
|         0|store2|  100|
|         1|store1|   70|
|         1|store3|   80|
+----------+------+-----+



In [301]:
products_df.groupBy('product_id').pivot('store').sum('price').orderBy('product_id').show()

+----------+------+------+------+
|product_id|store1|store2|store3|
+----------+------+------+------+
|         0|    95|   100|   105|
|         1|    70|  null|    80|
+----------+------+------+------+



In [327]:

spark.sql('''
with main as((select product_id,case when (product_id in (0,1)) and (store = "store1") then price end as store1,'' as store2,'' as store3 from productsdf)
              union all (select product_id,'' as store1,case when (product_id in (0,1)) and (store = "store2") then price end as store2,'' as store3 from productsdf)
              union all (select product_id,'' as store1,'' as store2,case when (product_id in (0,1)) and (store = "store3") then price end as store3 from productsdf) )
             select product_id,cast(sum(store1) as int) as store1, cast(sum(store2) as int) as store2, cast(sum(store3) as int) as store3 from main
                  group by product_id order by product_id
         ''').show()

+----------+------+------+------+
|product_id|store1|store2|store3|
+----------+------+------+------+
|         0|    95|   100|   105|
|         1|    70|  null|    80|
+----------+------+------+------+



#### 74 153 Primary Department for Each Employee E

In [145]:
employee_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/74_employee.csv'))

In [329]:
employee_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- primary_flag: string (nullable = true)



In [358]:
y_df = employee_df.filter(col('primary_flag') == lit('Y')).withColumnRenamed('department_id','departmentid')\
.select('employee_id','departmentid')

employee_df.filter(col('primary_flag') == lit('N')).select('employee_id','department_id')\
.join(y_df,y_df.employee_id == employee_df.employee_id,'leftanti' ).union(y_df).orderBy('employee_id').show()

+-----------+-------------+
|employee_id|department_id|
+-----------+-------------+
|          1|            1|
|          2|            1|
|          3|            3|
|          4|            3|
+-----------+-------------+



In [364]:
employee_df.createOrReplaceTempView('employeedf')

spark.sql('''
  with Y as (select employee_id,department_id from employeedf where primary_flag = "Y"),
       N as (select employee_id,department_id from employeedf where primary_flag = "N")
             (select N.employee_id,N.department_id from N 
                  left anti join Y on N.employee_id == Y.employee_id) union all (select * from Y) order by employee_id
         ''').show()

+-----------+-------------+
|employee_id|department_id|
+-----------+-------------+
|          1|            1|
|          2|            1|
|          3|            3|
|          4|            3|
+-----------+-------------+



#### 75 154  Rearrange Products Table E

In [146]:
products_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/75_products.csv'))

In [366]:
products_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- store1: integer (nullable = true)
 |-- store2: integer (nullable = true)
 |-- store3: integer (nullable = true)



In [368]:
unpivot_expr = "stack(3, 'store1', store1, 'store2', store2, 'store3', store3) as (store, price)"
unpivot_df = products_df.select("product_id", expr(unpivot_expr)).filter("price is not null")
unpivot_df.show()

+----------+------+-----+
|product_id| store|price|
+----------+------+-----+
|         0|store1|   95|
|         0|store2|  100|
|         0|store3|  105|
|         1|store1|   70|
|         1|store3|   80|
+----------+------+-----+



In [404]:
products_df.createOrReplaceTempView('productsdf')

spark.sql('''
           SELECT product_id,store,price FROM productsdf 
                 LATERAL VIEW STACK(3,'store1',store1,'store2',store2,'store3',store3) AS store,price 
                 where price is not null
         ''').show()

+----------+------+-----+
|product_id| store|price|
+----------+------+-----+
|         0|store1|   95|
|         0|store2|  100|
|         0|store3|  105|
|         1|store1|   70|
|         1|store3|   80|
+----------+------+-----+



#### 76 155 Ad-Free Sessions E

In [147]:
playback_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/76_playback.csv'))

In [148]:
ads_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/76_ads.csv'))

In [415]:
playback_df.printSchema()

root
 |-- session_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- start_time: integer (nullable = true)
 |-- end_time: integer (nullable = true)



In [416]:
ads_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



In [421]:
playback_df.join(ads_df,playback_df.customer_id == ads_df.customer_id,'inner')\
.filter("timestamp not between start_time and end_time").select('session_id').distinct().orderBy('session_id').show()

+----------+
|session_id|
+----------+
|         2|
|         3|
|         5|
+----------+



In [425]:
playback_df.createOrReplaceTempView('playbackdf')
ads_df.createOrReplaceTempView('adsdf')

spark.sql('''
          select distinct session_id from playbackdf join adsdf on playbackdf.customer_id = adsdf.customer_id
               where adsdf.timestamp not between start_time and end_time 
        ''').show()

+----------+
|session_id|
+----------+
|         3|
|         5|
|         2|
+----------+



#### 77 157 Find Customers With Positive Revenue this Year E

In [149]:
customers_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/77_customers.csv'))

In [427]:
customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- revenue: integer (nullable = true)



In [430]:
customers_df.filter("year == 2021 and revenue > 0 ").select('customer_id').show()

+-----------+
|customer_id|
+-----------+
|          1|
|          4|
+-----------+



In [432]:
customers_df.createOrReplaceTempView('customersdf')

spark.sql('''
           select customer_id from customersdf where year == 2021 and revenue > 0
         ''').show()

+-----------+
|customer_id|
+-----------+
|          1|
|          4|
+-----------+



#### 78 161 Convert Date Format E

In [150]:
days_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/78_days.csv'))

In [434]:
days_df.printSchema()

root
 |-- day: string (nullable = true)



In [448]:
days_df.select(date_format('day',"EEEE,MMMM d yyyy").alias('day')).show(truncate = False)

+---------------------+
|day                  |
+---------------------+
|Tuesday,April 12 2022|
|Monday,August 9 2021 |
|Friday,June 26 2020  |
+---------------------+



In [452]:
days_df.createOrReplaceTempView('daysdf')

spark.sql('''
           select date_format(day,"EEEE,MMMM d yyyy") as day from daysdf
         ''').show(truncate = False)

+---------------------+
|day                  |
+---------------------+
|Tuesday,April 12 2022|
|Monday,August 9 2021 |
|Friday,June 26 2020  |
+---------------------+



#### 79 163 Calculate Special Bonus E

In [151]:
employees_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/79_employees.csv'))

In [455]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)



In [472]:
employees_df.withColumn('bonus',when( ((col('employee_id')%2 == lit('1')) 
                                & (substring(col('name'),1,1) != lit('M')) ),col('salary')).otherwise(0))\
             .select('employee_id','bonus').show()

+-----------+-----+
|employee_id|bonus|
+-----------+-----+
|          2|    0|
|          3|    0|
|          7| 7400|
|          8|    0|
|          9| 7700|
+-----------+-----+



In [476]:
employees_df.createOrReplaceTempView('employeesdf')

spark.sql('''
           select employee_id, case when ((employee_id%2 = 1) and (substring(name,1,1) != 'M')) then salary 
                                    else 0 end bonus from employeesdf
         ''').show()

+-----------+-----+
|employee_id|bonus|
+-----------+-----+
|          2|    0|
|          3|    0|
|          7| 7400|
|          8|    0|
|          9| 7700|
+-----------+-----+



#### 80 165 The Latest Login in 2020 E


In [152]:
logins_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/80_logins.csv'))

In [478]:
logins_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- time_stamp: string (nullable = true)



In [482]:
logins_df.filter(year(col('time_stamp')) == lit('2020') ).groupBy('user_id')\
                 .agg(max(col('time_stamp')).alias('last_stamp')).show()

+-------+-------------------+
|user_id|         last_stamp|
+-------+-------------------+
|      6|2020-06-30 15:06:07|
|      8|2020-12-30 00:46:50|
|      2|2020-01-16 02:49:50|
+-------+-------------------+



In [484]:
logins_df.createOrReplaceTempView('loginsdf')

spark.sql('''
          select user_id,max(time_stamp) as last_stamp from loginsdf where year(time_stamp) == "2020"
                  group by user_id
         ''').show()

+-------+-------------------+
|user_id|         last_stamp|
+-------+-------------------+
|      6|2020-06-30 15:06:07|
|      8|2020-12-30 00:46:50|
|      2|2020-01-16 02:49:50|
+-------+-------------------+



#### 81 168 Users That Actively Request Confirmation Messages E

In [153]:
signups_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/81_signups.csv'))

In [154]:
confirmations_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/81_confirmations.csv'))

In [6]:
signups_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- time_stamp: string (nullable = true)



In [7]:
confirmations_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- time_stamp: string (nullable = true)
 |-- action: string (nullable = true)



In [34]:
window_spec = Window.orderBy('user_id','time_stamp')

main_df = confirmations_df.withColumn('rownum',row_number().over(window_spec))

odd_df = main_df.filter(col('rownum')%2 == 1)
even_df = main_df.filter(col('rownum')%2 == 0).withColumnRenamed('time_stamp','even_timestamp')

odd_df.join(even_df,odd_df.user_id == even_df.user_id,'inner')\
.select(odd_df.user_id, (to_timestamp('even_timestamp','yyyy-MM-dd HH:mm:ss') 
        - to_timestamp('time_stamp','yyyy-MM-dd HH:mm:ss') ).alias('time_diff')).show()


+-------+--------------------+
|user_id|           time_diff|
+-------+--------------------+
|      2|            24 hours|
|      3|6 minutes 59 seconds|
|      6|23 hours 59 minut...|
|      7|  24 hours 1 seconds|
+-------+--------------------+



24/08/29 11:00:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/29 11:00:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [47]:
odd_df.join(even_df,odd_df.user_id == even_df.user_id,'inner')\
.select(odd_df.user_id, ( round((unix_timestamp('even_timestamp','yyyy-MM-dd HH:mm:ss') 
        - unix_timestamp('time_stamp','yyyy-MM-dd HH:mm:ss'))/3600 ,4)).alias('time_diff'))\
.filter(col('time_diff') <= lit('24.0')).show()


+-------+---------+
|user_id|time_diff|
+-------+---------+
|      2|     24.0|
|      3|   0.1164|
|      6|  23.9997|
+-------+---------+



24/08/29 11:07:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/29 11:07:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [63]:
confirmations_df.createOrReplaceTempView('confirmationsdf')

spark.sql('''
 with main as (select user_id,time_stamp,row_number() over(order by user_id,time_stamp) as rownum from confirmationsdf),
       odd as (select user_id,time_stamp from main where rownum%2 = 1),
      even as (select user_id,time_stamp as even_timestamp from main where rownum%2 = 0) 
              select odd.user_id, round((unix_timestamp(even_timestamp,'yyyy-MM-dd HH:mm:ss') -
                                   unix_timestamp(time_stamp,'yyyy-MM-dd HH:mm:ss'))/3600,4) as time_diff 
                   from odd join even on odd.user_id = even.user_id 
                        group by odd.user_id,even_timestamp,time_stamp having time_diff <= 24
         ''').show()

+-------+---------+
|user_id|time_diff|
+-------+---------+
|      2|     24.0|
|      3|   0.1164|
|      6|  23.9997|
+-------+---------+



24/08/29 11:22:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/29 11:22:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


#### 82 171 Employees With Missing Information E

In [155]:
employees_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/82_employees.csv'))

In [156]:
salaries_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/82_salaries.csv'))

In [66]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [67]:
salaries_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [77]:
employees_df.select('employee_id').join(salaries_df,employees_df.employee_id == salaries_df.employee_id,'leftanti')\
.union(salaries_df.select('employee_id').join(employees_df,employees_df.employee_id == salaries_df.employee_id,'leftanti')).show()

+-----------+
|employee_id|
+-----------+
|          2|
|          1|
+-----------+



In [83]:
employees_df.createOrReplaceTempView('employeesdf')
salaries_df.createOrReplaceTempView('salariesdf')

spark.sql('''
 (select employeesdf.employee_id from employeesdf left anti join salariesdf on employeesdf.employee_id = salariesdf.employee_id)
 union all (
select salariesdf.employee_id from salariesdf left anti join employeesdf on employeesdf.employee_id = salariesdf.employee_id)

         ''').show()

+-----------+
|employee_id|
+-----------+
|          2|
|          1|
+-----------+



#### 83 172 Employees Whose Manager Left the Company E

In [157]:
employees_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/83_employees.csv'))

In [85]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- manager_id: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [99]:
out = employees_df.select('employee_id').collect()
x = [out[i][0] for i in range(0,len(out))]

employees_df.filter("salary < 30000 ").select('employee_id','manager_id').filter(~col('manager_id').isin(x)) \
.select('employee_id').show()

+-----------+
|employee_id|
+-----------+
|         11|
+-----------+



In [104]:
employees_df.createOrReplaceTempView('employeesdf')

spark.sql('''
        select employee_id from employeesdf where salary < 30000 
             and manager_id not in (select employee_id from employeesdf)
         ''').show()

+-----------+
|employee_id|
+-----------+
|         11|
+-----------+



#### 84 176 . Low-Quality Problems E

In [158]:
problems_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/84_problems.csv'))

In [107]:
problems_df.printSchema()

root
 |-- problem_id: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)



In [118]:
problems_df.select( 'problem_id', expr("( (likes)/(likes + dislikes) ) * 100").alias('perecentate')  )\
.filter(col('perecentate') < 60 ).select('problem_id').orderBy('problem_id').show()

+----------+
|problem_id|
+----------+
|         7|
|        10|
|        11|
|        13|
+----------+



In [129]:
problems_df.createOrReplaceTempView('problemsdf')

spark.sql('''
           select problem_id from 
           (select problem_id,((likes/(likes+dislikes))*100) as percentage from problemsdf)
                where percentage < 60 order by problem_id
         ''').show()

+----------+
|problem_id|
+----------+
|         7|
|        10|
|        11|
|        13|
+----------+



#### 85 180 The Winner University E

In [161]:
newyork_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/85_newyork.csv'))

In [162]:

california_df= (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/85_california.csv'))

In [270]:
newyork_df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- score: integer (nullable = true)



In [271]:
california_df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- score: integer (nullable = true)



In [272]:
newyork = newyork_df.withColumn('uni',lit('newyork university'))
california = california_df.withColumn('uni',lit('california university'))

window_spec = Window.partitionBy('winner')
windowspecwin = Window.partitionBy('win')
newyork.withColumn('winner',when( col('score') >= 90,col('uni'))).distinct()\
.union(california.withColumn('winner',when( col('score') >= 90,col('uni'))).distinct()).filter(~col('winner').isNull())\
.withColumn('win',count(col('winner')).over(window_spec))\
.withColumn('fwin',when(count(col('win')).over(windowspecwin) == 1,col('uni')) 
                  .when(count(col('win')).over(windowspecwin)  > 1,lit('No Winner')))\
.select(col('fwin').alias('winner')).distinct().show()

                                                                                

+---------+
|   winner|
+---------+
|No Winner|
+---------+



In [275]:
newyork_df.createOrReplaceTempView('newyorkdf')
california_df.createOrReplaceTempView('californiadf')

spark.sql('''
 with main as (select distinct uni from (
                    (select student_id,score, 'newyork university' as uni from newyorkdf) union 
                    (select student_id,score, 'california university' as uni from californiadf) )
                        where score >= 90),
       one as (select case 
                         when (uni = "newyork university")     then uni
                         when (uni = "california university'") then uni
                         when (uni = "newyork university") and (uni = "newyork university") then "No Winner"
                          end as winner  from main)
               select winner from one where winner is not null
       ''').show(truncate=False)

+------------------+
|winner            |
+------------------+
|newyork university|
+------------------+



In [287]:
spark.sql('''
 with main as (select distinct uni from (
                    (select student_id,score, 'newyork university' as uni from newyorkdf) union 
                    (select student_id,score, 'california university' as uni from californiadf) )
                        where score >= 90),
       one as (select uni, dense_rank() over(order by uni) rnk from main)
               select * from one
  
       ''').show(truncate=False)

24/08/29 16:07:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------------------+---+
|uni                  |rnk|
+---------------------+---+
|california university|1  |
|newyork university   |2  |
+---------------------+---+



#### 86 181 The Number of Rich Customers E

In [163]:
store_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/86_store.csv'))

In [289]:
store_df.printSchema()

root
 |-- bill_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- amount: integer (nullable = true)



In [316]:
store_df.filter(col('amount') > 500).groupBy('customer_id')\
.agg(countDistinct(col('customer_id')).alias('cnt')).select(sum('cnt').alias('rich_count')).show()

+----------+
|rich_count|
+----------+
|         2|
+----------+



In [317]:
store_df.createOrReplaceTempView('storedf')

In [320]:
spark.sql('''
         select count(distinct customer_id) as rich_count from storedf where amount > 500
         ''').show()

+----------+
|rich_count|
+----------+
|         2|
+----------+



#### 87 187  The Number of Users That Are Eligible for Discount E

In [164]:
purchases_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/87_purchases.csv'))

#startDate = 2022-03-08, endDate = 2022-03-20, minAmount = 1000

In [291]:
purchases_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- time_stamp: string (nullable = true)
 |-- amount: integer (nullable = true)



In [363]:
start_date = '2022-03-08 00:00:00'
end_date =   '2022-03-20 00:00:00'
purchases_df.select(to_timestamp(col('time_stamp'),'yyyy-MM-dd HH:mm:ss').alias('timestamp'))\
.filter( (col('timestamp').between(to_timestamp(lit(start_date),'yyyy-MM-dd HH:mm:ss'),
                                 to_timestamp(lit(end_date),'yyyy-MM-dd HH:mm:ss'))) & (col('amount') > 1000))\
       .groupBy(col('timestamp')).agg(count(col('timestamp')).alias('user_cnt')).select('user_cnt').show()

+--------+
|user_cnt|
+--------+
|       1|
+--------+



In [370]:
purchases_df.createOrReplaceTempView('purchasesdf')

spark.sql('''
            select count(*) as user_cnt from purchasesdf where
                  time_stamp between to_timestamp('2022-03-08 00:00:00','yyyy-MM-dd HH:mm:ss')
                                and  to_timestamp('2022-03-20 00:00:00','yyyy-MM-dd HH:mm:ss')
                   and amount > 1000

         ''').show()

+--------+
|user_cnt|
+--------+
|       1|
+--------+



#### 88 189 The Users That Are Eligible for Discount E

In [165]:
purchases_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/88_purchases.csv'))
# startDate = 2022-03-08, endDate = 2022-03-20, minAmount = 1000

In [293]:
purchases_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- time_stamp: string (nullable = true)
 |-- amount: integer (nullable = true)



In [376]:
start_date = '2022-03-08 00:00:00'
end_date =   '2022-03-20 00:00:00'
purchases_df.select('user_id',to_timestamp(col('time_stamp'),'yyyy-MM-dd HH:mm:ss').alias('timestamp'))\
.filter( (col('timestamp').between(to_timestamp(lit(start_date),'yyyy-MM-dd HH:mm:ss'),
                                 to_timestamp(lit(end_date),'yyyy-MM-dd HH:mm:ss'))) & (col('amount') > 1000))\
.select('user_id').show()

+-------+
|user_id|
+-------+
|      3|
+-------+



In [373]:
purchases_df.createOrReplaceTempView('purchasesdf')

spark.sql('''
            select user_id as user_cnt from purchasesdf where
                  time_stamp between to_timestamp('2022-03-08 00:00:00','yyyy-MM-dd HH:mm:ss')
                                and  to_timestamp('2022-03-20 00:00:00','yyyy-MM-dd HH:mm:ss')
                   and amount > 1000

         ''').show()

+--------+
|user_cnt|
+--------+
|       3|
+--------+



#### 89 196 Product Sales Analysis V E

In [166]:
sales_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/89_sales.csv'))

In [167]:
product_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/89_products.csv'))

In [303]:
sales_df.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)



In [304]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- price: integer (nullable = true)



In [382]:
sales_df.join(product_df,sales_df.product_id == product_df.product_id,'inner')\
.select('user_id',expr("quantity * price").alias('sales_price')).groupBy('user_id')\
.agg(sum(col(('sales_price'))).alias('spending')).orderBy(col('spending').desc(),col('user_id').asc()).show()

+-------+--------+
|user_id|spending|
+-------+--------+
|    101|     125|
|    102|      75|
|    103|      75|
+-------+--------+



In [386]:
sales_df.createOrReplaceTempView('sales_df')
product_df.createOrReplaceTempView('product_df')

spark.sql('''
        select user_id, sum((quantity * price)) as spending from sales_df
             join product_df on sales_df.product_id == product_df.product_id group by user_id
             order by spending desc, user_id asc
         ''').show()

+-------+--------+
|user_id|spending|
+-------+--------+
|    101|     125|
|    102|      75|
|    103|      75|
+-------+--------+



#### 90 197 All the Matches of the League E

In [168]:
teams_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/90_teams.csv'))

In [388]:
teams_df.printSchema()

root
 |-- team_name: string (nullable = true)



In [400]:
home_team = teams_df.withColumnRenamed('team_name','home_team')
away_team = teams_df.withColumnRenamed('team_name','away_team')

home_team.join(away_team).filter(col('home_team') != col('away_team')).show()

+-----------+-----------+
|  home_team|  away_team|
+-----------+-----------+
|Leetcode FC|    Ahly SC|
|Leetcode FC|Real Madrid|
|    Ahly SC|Leetcode FC|
|    Ahly SC|Real Madrid|
|Real Madrid|Leetcode FC|
|Real Madrid|    Ahly SC|
+-----------+-----------+



In [407]:
teams_df.createOrReplaceTempView('teamsdf')

spark.sql('''
           select home_team,away_team from 
           (select team_name as away_team from teamsdf) as yy,
           (select team_name as home_team from teamsdf) as xx 
           where home_team != away_team
          ''').show()

+-----------+-----------+
|  home_team|  away_team|
+-----------+-----------+
|    Ahly SC|Leetcode FC|
|Real Madrid|Leetcode FC|
|Leetcode FC|    Ahly SC|
|Real Madrid|    Ahly SC|
|Leetcode FC|Real Madrid|
|    Ahly SC|Real Madrid|
+-----------+-----------+



#### 91 199 Number of Unique Subjects Taught by Each Teacher E

In [169]:
teacher_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/91_teacher.csv'))

In [409]:
teacher_df.printSchema()

root
 |-- teacher_id: integer (nullable = true)
 |-- subject_id: integer (nullable = true)
 |-- dept_id: integer (nullable = true)



In [412]:
teacher_df.groupBy('teacher_id').agg(countDistinct(col('subject_id')).alias('cnt')).show()

+----------+---+
|teacher_id|cnt|
+----------+---+
|         1|  2|
|         2|  4|
+----------+---+



In [414]:
teacher_df.createOrReplaceTempView('teacherdf')

spark.sql('''
          select teacher_id,count(distinct subject_id) as cnt from teacherdf group by teacher_id
         ''').show()

+----------+---+
|teacher_id|cnt|
+----------+---+
|         1|  2|
|         2|  4|
+----------+---+



#### 92 201  Sort the Olympic Table E

In [170]:
olympic_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/92_olympic.csv'))

In [416]:
olympic_df.printSchema()

root
 |-- country: string (nullable = true)
 |-- gold_medals: integer (nullable = true)
 |-- silver_medals: integer (nullable = true)
 |-- bronze_medals: integer (nullable = true)



In [419]:
olympic_df.select('country','gold_medals','silver_medals','bronze_medals')\
.orderBy(col('gold_medals').desc(),col('silver_medals').desc(),col('bronze_medals').desc(),col('country').asc()).show()

+-----------+-----------+-------------+-------------+
|    country|gold_medals|silver_medals|bronze_medals|
+-----------+-----------+-------------+-------------+
|      China|         10|           10|           20|
|        USA|         10|           10|           20|
|     Israel|          2|            2|            3|
|      Egypt|          2|            2|            2|
|South Sudan|          0|            0|            1|
+-----------+-----------+-------------+-------------+



In [421]:
olympic_df.createOrReplaceTempView('olympicdf')

spark.sql('''
          select country,gold_medals,silver_medals,bronze_medals from olympicdf
          order by gold_medals desc,silver_medals desc,bronze_medals desc,country asc
         ''').show()

+-----------+-----------+-------------+-------------+
|    country|gold_medals|silver_medals|bronze_medals|
+-----------+-----------+-------------+-------------+
|      China|         10|           10|           20|
|        USA|         10|           10|           20|
|     Israel|          2|            2|            3|
|      Egypt|          2|            2|            2|
|South Sudan|          0|            0|            1|
+-----------+-----------+-------------+-------------+



#### 93 204 Form a Chemical Bond E

In [171]:
elements_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/93_elements.csv'))

In [423]:
elements_df.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- type: string (nullable = true)
 |-- electrons: integer (nullable = true)



In [435]:
Nonmetal_df = elements_df.filter(col('type') == lit('Nonmetal')).select(col('symbol').alias('Nonmetal'))
Metal_df = elements_df.filter(col('type') == lit('Metal')).select(col('symbol').alias('Metal'))
Nonmetal_df.join(Metal_df).show()

+--------+-----+
|Nonmetal|Metal|
+--------+-----+
|      Cl|   Na|
|      Cl|   Ca|
|      Cl|   La|
|       o|   Na|
|       o|   Ca|
|       o|   La|
|       N|   Na|
|       N|   Ca|
|       N|   La|
+--------+-----+



In [447]:
elements_df.createOrReplaceTempView('elementsdf')

spark.sql('''
             select NonMetal,Metal from 
            (select symbol as Metal from elementsdf where type = 'Metal'),
            (select symbol as NonMetal from elementsdf where type = 'Nonmetal')
          ''').show()

+--------+-----+
|NonMetal|Metal|
+--------+-----+
|      Cl|   Na|
|       o|   Na|
|       N|   Na|
|      Cl|   Ca|
|       o|   Ca|
|       N|   Ca|
|      Cl|   La|
|       o|   La|
|       N|   La|
+--------+-----+



#### 94 205 Concatenate the Name and the Profession E

In [172]:
person_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/94_person.csv'))

In [449]:
person_df.printSchema()

root
 |-- person_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- profession: string (nullable = true)



In [464]:
person_df.select('person_id',concat('name',lit('('),substring('profession',1,1),lit(')') ).alias('name')).show()

+---------+--------+
|person_id|    name|
+---------+--------+
|        1| Alex(S)|
|        3|Alice(A)|
|        2|  Bob(P)|
|        4|Messi(D)|
|        6|Tyson(E)|
|        5| Meir(L)|
+---------+--------+



In [469]:
person_df.createOrReplaceTempView('persondf')

spark.sql('''
        select person_id, concat(name,'(',substring(profession,1,1),')') as name from persondf
          ''').show()

+---------+--------+
|person_id|    name|
+---------+--------+
|        1| Alex(S)|
|        3|Alice(A)|
|        2|  Bob(P)|
|        4|Messi(D)|
|        6|Tyson(E)|
|        5| Meir(L)|
+---------+--------+



#### 95 206 Find Latest Salaries E

In [173]:
salary_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/95_salary.csv'))

In [471]:
salary_df.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department_id: string (nullable = true)



In [474]:
window_spec = Window.partitionBy('emp_id')

salary_df.withColumn('salary',max(col('salary')).over(window_spec)).distinct().orderBy('emp_id').show()

+------+---------+--------+------+-------------+
|emp_id|firstname|lastname|salary|department_id|
+------+---------+--------+------+-------------+
|     1|     Todd|  Wilson|110000|        D1006|
|     2|   Justin|   Simon|130000|        D1005|
|     3|    Kelly| Rosario| 42689|        D1002|
|     4| Patricia|  Powell|170000|        D1004|
|     5|   Sherry|  Golden| 44101|        D1002|
|     6|  Natasha| Swanson| 90000|        D1005|
+------+---------+--------+------+-------------+



In [478]:
salary_df.createOrReplaceTempView('salarydf')

spark.sql('''
          select distinct emp_id,firstname,lastname,max(salary) over(partition by emp_id) as salary,department_id 
               from salarydf order by emp_id
         ''').show()

+------+---------+--------+------+-------------+
|emp_id|firstname|lastname|salary|department_id|
+------+---------+--------+------+-------------+
|     1|     Todd|  Wilson|110000|        D1006|
|     2|   Justin|   Simon|130000|        D1005|
|     3|    Kelly| Rosario| 42689|        D1002|
|     4| Patricia|  Powell|170000|        D1004|
|     5|   Sherry|  Golden| 44101|        D1002|
|     6|  Natasha| Swanson| 90000|        D1005|
+------+---------+--------+------+-------------+



#### 96 207 Count Artist Occurrences On Spotify Ranking List E

In [174]:
spotify_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/96_spotify.csv'))

In [480]:
spotify_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- track_name: string (nullable = true)
 |-- artist: string (nullable = true)



In [482]:
spotify_df.groupBy('artist').agg(count(col('artist')).alias('occurrences'))\
                            .orderBy(col('occurrences').desc(),col('artist').asc()).show()

+----------+-----------+
|    artist|occurrences|
+----------+-----------+
| DJ Khalid|          2|
|Ed Sheeran|          2|
|       Sia|          1|
+----------+-----------+



In [485]:
spotify_df.createOrReplaceTempView('spotifydf')

spark.sql('''
         select artist,count(artist) as occurrences from spotifydf group by artist
              order by occurrences desc, artist asc
        ''').show()

+----------+-----------+
|    artist|occurrences|
+----------+-----------+
| DJ Khalid|          2|
|Ed Sheeran|          2|
|       Sia|          1|
+----------+-----------+



#### 97 209 Bikes Last Time Used  E

In [175]:
bikes_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/97_bikes.csv'))

In [487]:
bikes_df.printSchema()

root
 |-- ride_id: integer (nullable = true)
 |-- bike_number: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- end_time: string (nullable = true)



In [491]:
bikes_df.groupBy('bike_number').agg(max(col('end_time')).alias('end_time'))\
.orderBy(col('end_time').desc()).show()

+-----------+-------------------+
|bike_number|           end_time|
+-----------+-------------------+
|     W00576|2012-03-28 02:50:00|
|     W00455|2012-03-26 17:40:00|
|     W00300|2012-03-25 10:50:00|
+-----------+-------------------+



In [493]:
bikes_df.createOrReplaceTempView('bikesdf')

spark.sql('''
          select bike_number, max(end_time) as end_time from bikesdf group by bike_number order by end_time desc
         ''').show()

+-----------+-------------------+
|bike_number|           end_time|
+-----------+-------------------+
|     W00576|2012-03-28 02:50:00|
|     W00455|2012-03-26 17:40:00|
|     W00300|2012-03-25 10:50:00|
+-----------+-------------------+



#### 98 214 Total Traveled Distance E

In [176]:
users_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/98_users.csv'))

In [177]:
rides_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/98_rides.csv'))

In [496]:
users_df .printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [497]:
rides_df.printSchema()

root
 |-- ride_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- distance: integer (nullable = true)



In [498]:
rides_df.show()

+-------+-------+--------+
|ride_id|user_id|distance|
+-------+-------+--------+
|     72|     17|     160|
|     42|     14|     161|
|     45|      4|      59|
|     32|      2|     197|
|     15|      4|     357|
|     56|      2|     196|
|     10|     14|      25|
+-------+-------+--------+



In [507]:
rides_df.groupBy('user_id').agg(sum(col('distance')).alias('traveled distance'))\
.join(users_df,rides_df.user_id == users_df.user_id, 'right')\
.select(users_df.user_id,'name',coalesce('traveled distance',lit('0')).alias('traveled distance')).orderBy('user_id').show()

+-------+-------+-----------------+
|user_id|   name|traveled distance|
+-------+-------+-----------------+
|      2|  Avery|              393|
|      4|Michael|              416|
|     10|Eleanor|                0|
|     14|  Ethan|              186|
|     17|Addison|              160|
+-------+-------+-----------------+



In [534]:
users_df.createOrReplaceTempView('usersdf')
rides_df.createOrReplaceTempView('ridesdf')

spark.sql('''
  with main as (select user_id as userid,sum(distance) as traveleddis from ridesdf group by user_id)
               select user_id,name,coalesce(traveleddis,'0') as traveled_distanct  
                    from main right join usersdf on main.userid = usersdf.user_id order by user_id
         ''').show()

+-------+-------+-----------------+
|user_id|   name|traveled_distanct|
+-------+-------+-----------------+
|      2|  Avery|              393|
|      4|Michael|              416|
|     10|Eleanor|                0|
|     14|  Ethan|              186|
|     17|Addison|              160|
+-------+-------+-----------------+



#### 99 215 Highest Salaries Difference E

In [178]:
salaries_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/99_salaries.csv'))

In [536]:
salaries_df.printSchema()

root
 |-- emp_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)



In [557]:
eng_salry = salaries_df.filter(col('department') == lit('Engineering'))\
.groupBy('department').agg(max(col('salary')).alias('salary')).select('salary')

mar_salry = salaries_df.filter(col('department') == lit('Marketing'))\
.groupBy('department').agg(max(col('salary')).alias('salary')).select('salary')

eng_salry.join(mar_salry).select(abs(mar_salry.salary - eng_salry.salary).alias('salary_difference') ).show()

+-----------------+
|salary_difference|
+-----------------+
|            49000|
+-----------------+



In [562]:
salaries_df.createOrReplaceTempView('salariesdf')

spark.sql('''
  with eng as (select max(salary) as salary from salariesdf where department = "Engineering"),
       mrk as (select max(salary) as salary from salariesdf where department = "Marketing")
              select eng.salary - mrk.salary from mrk,eng
        ''').show()

+-----------------+
|(salary - salary)|
+-----------------+
|            49000|
+-----------------+



#### 100 221 Calculate Compressed Mean E

In [179]:
orders_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/100_orders.csv'))

In [564]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- item_count: integer (nullable = true)
 |-- order_occurrences: integer (nullable = true)



In [570]:
window_spec = Window.partitionBy('dummy')
orders_df.withColumn('dummy',lit('1'))\
.withColumn('average_items_per_order',round((sum(col('item_count') * col('order_occurrences')).over(window_spec))/
 (sum(col('order_occurrences')).over(window_spec)),2)).select('average_items_per_order').distinct().show()

+-----------------------+
|average_items_per_order|
+-----------------------+
|                    2.7|
+-----------------------+



In [574]:
orders_df.createOrReplaceTempView('ordersdf')

spark.sql('''
           select average_items_per_order from
          (select 1 as dummy,round(sum(item_count * order_occurrences)/sum(order_occurrences),2) as average_items_per_order 
               from ordersdf group by dummy)
         ''').show()

+-----------------------+
|average_items_per_order|
+-----------------------+
|                    2.7|
+-----------------------+



#### 101 223 Find Expensive Cities E

In [180]:
listings_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/101_listiings.csv'))

In [578]:
listings_df.printSchema()

root
 |-- listing_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- price: integer (nullable = true)



In [598]:
window_spec = Window.partitionBy('city')
window_dummy = Window.partitionBy('dummy')

listings_df.withColumn('dummy',lit('1'))\
.withColumn('state',sum(col('price')).over(window_spec)/count(col('city')).over(window_spec))\
.withColumn('national',sum(col('state')).over(window_dummy)/count(col('dummy')).over(window_dummy))\
.select('city',(col('state') - col('national')).alias('diff') ).filter(col('diff') > 0)\
.select('city').distinct().show()



+----------+
|      city|
+----------+
|   Chicago|
|LosAngeles|
+----------+



In [610]:
listings_df.createOrReplaceTempView('listingsdf')

spark.sql('''
  with one as (select city, sum(price)/count(city) as state from listingsdf group by city),
       two as (select sum(price)/count(*) as national from listingsdf)
               select city from 
                    (select city,(state - national) as diff from one,two)
                    where diff > 0
         ''').show()

+----------+
|      city|
+----------+
|   Chicago|
|LosAngeles|
+----------+



#### 102 226 Loan Types E

In [181]:
loans_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/102_loans.csv'))

In [5]:
loans_df.printSchema()

root
 |-- loan_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- loan_type: string (nullable = true)



In [9]:
loans_df.filter("loan_type in ('Refinance','Mortgage')")\
.groupBy('user_id').agg(count(col('user_id')).alias('cnt')).filter(col('cnt') == 2).select('user_id').show()

+-------+
|user_id|
+-------+
|    102|
+-------+



In [12]:
loans_df.createOrReplaceTempView('loansdf')

spark.sql('''
          select user_id from 
          (select user_id, count(user_id) as cnt from loansdf where loan_type in ('Refinance','Mortgage')
                  group by user_id) where cnt = 2
         ''').show()

+-------+
|user_id|
+-------+
|    102|
+-------+



#### 103 229 Find Candidates for Data Scientist Position E

In [182]:
candidates_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/103_candidates.csv'))

In [14]:
candidates_df.printSchema()

root
 |-- candidate_id: integer (nullable = true)
 |-- skill: string (nullable = true)



In [23]:
candidates_df.filter("skill in ('Python','Tableau','PostgreSQL')").groupBy('candidate_id').agg(count(col('candidate_id')).alias('cnt'))\
.filter(col('cnt') == 3).select('candidate_id').orderBy(col('candidate_id').asc()).show()

+------------+
|candidate_id|
+------------+
|         123|
|         147|
+------------+



In [27]:
candidates_df.createOrReplaceTempView('candidatesdf')

spark.sql('''
          select candidate_id from 
          (select candidate_id, count(candidate_id) as cnt from candidatesdf 
               where skill in ('Python','Tableau','PostgreSQL') group by candidate_id)
            where cnt = 3 order by candidate_id asc
          ''').show()

+------------+
|candidate_id|
+------------+
|         123|
|         147|
+------------+



#### 104 230 Classifying Triangles by Lengths E

In [183]:
triangles_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/104_tringles.csv'))

In [30]:
triangles_df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: integer (nullable = true)



In [31]:
triangles_df.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
| 20| 20| 23|
| 20| 20| 20|
| 20| 21| 22|
| 13| 14| 30|
+---+---+---+



In [59]:
triangles_df.withColumn('triangle_type',
    when(((col('A') == col('B')) & (col('B') == col('C'))),lit('Equilateral'))
   .when(((col('A') == col('B')) & (col('B') != col('C'))),lit('Isosceles')) 
   .when((((col('A')+col('B') ) > col('C') ) & ((col('B')+col('C')) > col('A')) & ((col('A')+col('C')) >col('B'))),lit('Not A Triangle'))
   .when((col('A') != col('B')) & (col('B') != col('C')) & (col('A') != col('C')),lit('Scalene')))\
   .select('triangle_type').show()



+--------------+
| triangle_type|
+--------------+
|     Isosceles|
|   Equilateral|
|Not A Triangle|
|       Scalene|
+--------------+



In [49]:
triangles_df.createOrReplaceTempView('trianglesdf')

spark.sql('''
           select case when A = B and B  = C then "Equilateral" 
                       when A = B and B != C then "Isosceles" 
                       when ((A+B) > C) and ((B+C) > A) and ((A+C) >B) then "Not A Triangle"
                       when A !=B and B != C and A != C then "Scalene"
                       end triangle_type from trianglesdf
          ''').show()

+--------------+
| triangle_type|
+--------------+
|     Isosceles|
|   Equilateral|
|Not A Triangle|
|       Scalene|
+--------------+



#### 105 235 Find All Unique Email Domains E

In [184]:
emails_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/105_emails.csv'))

In [61]:
emails_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- email: string (nullable = true)



In [104]:
pattern = r'@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

emails_df.filter(col('email').rlike('.com')).withColumn('email_domain',regexp_extract("email", pattern, 1))\
.groupBy('email_domain').agg(count(col('email_domain')).alias('cnt')).show()

+------------+---+
|email_domain|cnt|
+------------+---+
| outlook.com|  2|
|   yahoo.com|  1|
+------------+---+



In [116]:
emails_df.createOrReplaceTempView('emailsdf')

spark.sql('''
          select regexp_extract(email, '@([a-zA-Z0-9.-]+\\.[a-zA-Z]{2,})', 1) AS email_domain,count(*) as cnt
               from emailsdf where email rlike '.com' group by email_domain
         ''').show()

+------------+---+
|email_domain|cnt|
+------------+---+
| outlook.com|  2|
|   yahoo.com|  1|
+------------+---+



#### 106 242 Invalid Tweets II E

In [185]:
tweets_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/106_tweets.csv'))

In [131]:
tweets_df.printSchema()

root
 |-- tweet_id: integer (nullable = true)
 |-- content: string (nullable = true)



In [153]:
tweets_df.withColumn('len',length(col('content')).alias('len'))\
.withColumn('cnt_mentions',col('len') - length(regexp_replace(col('content'),'@','')))\
.withColumn('hashtags',col('len') - length(regexp_replace(col('content'),'#','')))\
.withColumn('tweet',when( (col('len') > 140) 
                       | (col('cnt_mentions') > 3) 
                       | (col('hashtags') > 3), col('tweet_id')).otherwise(0)).filter(col('tweet') != 0)\
.select('tweet_id').show()

+--------+
|tweet_id|
+--------+
|       1|
|       4|
+--------+



In [164]:
tweets_df.createOrReplaceTempView('tweetsdf')

spark.sql('''
        select tweet_id from 
        (select tweet_id, case
                         when length(content) > 140 then tweet_id
                         when length(content) - length(regexp_replace(content,'@','')) > 3 then tweet_id
                         when length(content) - length(regexp_replace(content,'#','')) > 3 then tweet_id
                         else 0 end tweet from tweetsdf) 
            where tweet > 0
        
         ''').show()

+--------+
|tweet_id|
+--------+
|       1|
|       4|
+--------+



#### 107 244 Second Day Verification E

In [186]:
emails_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/107_emails.csv'))

In [187]:
texts_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/107_texts.csv'))

In [182]:
emails_df.printSchema()

root
 |-- email_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- signup_date: string (nullable = true)



In [183]:
texts_df.printSchema()

root
 |-- text_id: integer (nullable = true)
 |-- email_id: integer (nullable = true)
 |-- signup_action: string (nullable = true)
 |-- action_date: string (nullable = true)



In [189]:
emails_df.join(texts_df,emails_df.email_id == texts_df.email_id,'inner').filter(col('signup_action') != lit('Not Verified'))\
.withColumn('time_diff',datediff('action_date','signup_date') ).select('user_id').orderBy(col('user_id').asc()).show()
                                                                       

+-------+
|user_id|
+-------+
|   7005|
|   7771|
+-------+



In [196]:
emails_df.createOrReplaceTempView('emailsdf')
texts_df.createOrReplaceTempView('textsdf')

spark.sql('''
           select user_id,datediff(action_date,signup_date) as diff from emailsdf
                join textsdf on emailsdf.email_id = textsdf.email_id where signup_action != "Not Verified"
                order by user_id asc
         ''').show()

+-------+----+
|user_id|diff|
+-------+----+
|   7005|   1|
|   7771|   1|
+-------+----+



#### 108 246 Find Cities in Each State E

In [14]:
cities_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/108_cities.csv'))

In [198]:
cities_df.printSchema()

root
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)



In [232]:

cities_df.groupBy('state').agg(array_join(collect_list(col('city')),',').alias('cities')).show(truncate=False)

+----------+-----------------------------------+
|state     |cities                             |
+----------+-----------------------------------+
|Texas     |Houston,Austin                     |
|California|Los Angeles,San Francisco,San Diego|
|New York  |New York City,Buffalo,Rochester    |
+----------+-----------------------------------+



In [237]:
cities_df.createOrReplaceTempView('citiesdf')

spark.sql('''
          select state,array_join(collect_list(city),',') as cities from citiesdf group by state
         ''').show(truncate=False)

+----------+-----------------------------------+
|state     |cities                             |
+----------+-----------------------------------+
|Texas     |Houston,Austin                     |
|California|Los Angeles,San Francisco,San Diego|
|New York  |New York City,Buffalo,Rochester    |
+----------+-----------------------------------+



#### 109 251 remier League Table Ranking E 3246

In [15]:
teamstats_df = (spark.read
                 .option('header',True)
                 .option('inferSchema',True)
                 .format('csv')
                 .load('../../data/database/109_teamstats.csv'))

In [241]:
teamstats_df.printSchema()

root
 |-- team_id: integer (nullable = true)
 |-- team_name: string (nullable = true)
 |-- matches_played: integer (nullable = true)
 |-- wins: integer (nullable = true)
 |-- draws: integer (nullable = true)
 |-- losses: integer (nullable = true)



In [242]:
teamstats_df.show()

+-------+---------------+--------------+----+-----+------+
|team_id|      team_name|matches_played|wins|draws|losses|
+-------+---------------+--------------+----+-----+------+
|      1|Manchester City|            10|   6|    2|     2|
|      2|      Liverpool|            10|   6|    2|     2|
|      3|        Chelsea|            10|   5|    3|     2|
|      4|        Arsenal|            10|   4|    4|     2|
|      5|      Tottenham|            10|   3|    5|     2|
+-------+---------------+--------------+----+-----+------+



In [258]:
window_spec = Window.orderBy(col('points').desc())

teamstats_df.select('team_id','team_name',expr("((wins * 3)+(draws * 1))" ).alias('points'))\
.withColumn('position',rank().over(window_spec)).orderBy(col('position').asc(),col('team_id').desc()).show()

+-------+---------------+------+--------+
|team_id|      team_name|points|position|
+-------+---------------+------+--------+
|      2|      Liverpool|    20|       1|
|      1|Manchester City|    20|       1|
|      3|        Chelsea|    18|       3|
|      4|        Arsenal|    16|       4|
|      5|      Tottenham|    14|       5|
+-------+---------------+------+--------+



24/08/30 20:35:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [263]:
teamstats_df.createOrReplaceTempView('teamstatsdf')

spark.sql('''
          
          select team_id,team_name, ((wins * 3 )+(draws * 1)) as points,
               rank() over(order by ((wins * 3 )+(draws * 1)) desc) as position from teamstatsdf 
               order by position asc, team_id desc
         ''').show()

+-------+---------------+------+--------+
|team_id|      team_name|points|position|
+-------+---------------+------+--------+
|      2|      Liverpool|    20|       1|
|      1|Manchester City|    20|       1|
|      3|        Chelsea|    18|       3|
|      4|        Arsenal|    16|       4|
|      5|      Tottenham|    14|       5|
+-------+---------------+------+--------+



24/08/30 20:40:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
