# SQL 50 in Pandas

In [1]:
from typing import List
from typing import Optional
import pandas as pd
import numpy as np

### 1757. Recyclable and Low Fat Products [E]

Write a solution to find the ids of products that are both low fat and recyclable.

Return the result table in any order.

In [13]:
def find_products(products: pd.DataFrame) -> pd.DataFrame:
    return products[ (products['low_fats']=='Y') & (products['recyclable']=='Y') ][['product_id']]

In [2]:
data = [['0', 'Y', 'N'], ['1', 'Y', 'Y'], ['2', 'N', 'Y'], ['3', 'Y', 'Y'], ['4', 'N', 'N']]
products = pd.DataFrame(data, columns=['product_id', 'low_fats', 'recyclable']).astype({'product_id':'int64', 'low_fats':'category', 'recyclable':'category'})

In [14]:
find_products(products)

Unnamed: 0,product_id
1,1
3,3


### 2877. Create a DataFrame from List [E]

Write a solution to create a DataFrame from a 2D list called student_data. This 2D list contains the IDs and ages of some students.

The DataFrame should have two columns, student_id and age, and be in the same order as the original 2D list.

In [16]:
input_list = [
  [1, 15],
  [2, 11],
  [3, 11],
  [4, 20]
]

In [23]:
def createDataframe(student_data: List[List[int]]) -> pd.DataFrame:
    return pd.DataFrame(student_data, columns=['student_id','age'])

In [24]:
createDataframe(input_list)

Unnamed: 0,student_id,age
0,1,15
1,2,11
2,3,11
3,4,20


### 176. Second Highest Salary [M]

Write a solution to find the second highest distinct salary from the Employee table. If there is no second highest salary, return null (return None in Pandas).

The result format is in the following example.

In [64]:
def second_highest_salary(employee: pd.DataFrame) -> pd.DataFrame:
    df = employee['salary'].drop_duplicates().sort_values(ascending=False).reset_index(drop=True)
    if len(df) >= 2:
        return pd.DataFrame([df[1]], columns=['SecondHighestSalary'])
    else:
        return pd.DataFrame([None], columns=['SecondHighestSalary'])

In [25]:
data = [[1, 100], [2, 200], [3, 300]]
employee = pd.DataFrame(data, columns=['id', 'salary']).astype({'id':'int64', 'salary':'int64'})

In [67]:
second_highest_salary(employee)

Unnamed: 0,SecondHighestSalary
0,200


### 584. Find Customer Referee

Find the names of the customer that are not referred by the customer with id = 2.

Return the result table in any order.

In [105]:
def find_customer_referee(customer: pd.DataFrame) -> pd.DataFrame:
    return customer[ (customer['referee_id']!=2) | (customer['referee_id'].isnull() ) ][['name']] # .drop_duplicates()

In [80]:
data = [[1, 'Will', None], [2, 'Jane', None], [3, 'Alex', 2], [4, 'Bill', None], [5, 'Zack', 1], [6, 'Mark', 2]]
customer = pd.DataFrame(data, columns=['id', 'name', 'referee_id']).astype({'id':'Int64', 'name':'object', 'referee_id':'Int64'})

In [106]:
find_customer_referee(customer)

Unnamed: 0,name
0,Will
1,Jane
3,Bill
4,Zack
6,Tse


### 570. Managers with at Least 5 Direct Reports [M]

Write a solution to find managers with at least five direct reports.

Return the result table in any order.

In [129]:
def find_managers(employee: pd.DataFrame) -> pd.DataFrame:
    df5 = employee.groupby('managerId', as_index=False).agg({'id':'count'}).query('id >= 5').rename(columns={'id':'count'})
    if df5.shape[0] > 0:
        return pd.merge(left=employee, right=df5, how='inner', left_on='id', right_on='managerId')[['name']]
    else:
        return pd.DataFrame(None, columns=['name'])

In [107]:
data = [[101, 'John', 'A', None], [102, 'Dan', 'A', 101], [103, 'James', 'A', 101], [104, 'Amy', 'A', 101], [105, 'Anne', 'A', 101], [106, 'Ron', 'B', 101]]
employee = pd.DataFrame(data, columns=['id', 'name', 'department', 'managerId']).astype({'id':'Int64', 'name':'object', 'department':'object', 'managerId':'Int64'})

In [130]:
find_managers(employee)

Unnamed: 0,name


### 185. Department Top Three Salaries [H]

A company's executives are interested in seeing who earns the most money in each of the company's departments. A high earner in a department is an employee who has a salary in the top three unique salaries for that department.

Write a solution to find the employees who are high earners in each of the departments.

Return the result table in any order.

The result format is in the following example.

In [198]:
# Beats 69.03%

def top_three_salaries(employee: pd.DataFrame, department: pd.DataFrame) -> pd.DataFrame:
    department2 = department.rename(columns={'id':'departmentId','name':'Department'})
    employee2 = employee.rename(columns={'name':'Employee','salary':'Salary'})
    df = pd.merge(employee2, department2, how='left', on='departmentId') # id Employee salary departnemtnId Department

    if df.shape[0] < 1:
        return pd.DataFrame(None, columns=['Department','Employee','Salary'])

    count = 0
    for dept in department2['departmentId'].unique():
        df_loop = employee2[ employee2['departmentId']==dept ].sort_values('Salary', ascending=False)[['Salary','departmentId']].drop_duplicates().head(3)

        if count == 0:
            df_top3 = df_loop.copy()
        else:
            df_top3 = pd.concat([df_top3, df_loop], axis=0)
        count += 1

    df2 = pd.merge(df, df_top3, how='inner', on=['Salary', 'departmentId'])[['Department','Employee','Salary']]
    return df2

In [199]:
data = [[1, 'Joe', 85000, 1], [2, 'Henry', 80000, 2], [3, 'Sam', 60000, 2], [4, 'Max', 90000, 1], [5, 'Janet', 69000, 1], [6, 'Randy', 85000, 1], [7, 'Will', 70000, 1]]
employee = pd.DataFrame(data, columns=['id', 'name', 'salary', 'departmentId']).astype({'id':'Int64', 'name':'object', 'salary':'Int64', 'departmentId':'Int64'})
data = [[1, 'IT'], [2, 'Sales']]
department = pd.DataFrame(data, columns=['id', 'name']).astype({'id':'Int64', 'name':'object'})

In [200]:
top_three_salaries(employee, department)

Unnamed: 0,Department,Employee,Salary
0,IT,Joe,85000
1,Sales,Henry,80000
2,Sales,Sam,60000
3,IT,Max,90000
4,IT,Randy,85000
5,IT,Will,70000


### 197. Rising Temperature [E]

Write a solution to find all dates' id with higher temperatures compared to its previous dates (yesterday).

Return the result table in any order.


In [248]:
def rising_temperature(weather: pd.DataFrame) -> pd.DataFrame:
    df = weather.sort_values('recordDate', ascending=True).reset_index(drop=True)
    df['day_diff'] = df['recordDate'].diff()
    df['temp_diff'] = df['temperature'].diff()
    return df[ (df['day_diff']== pd.Timedelta(days=1)) & (df['temp_diff']>0) ][['id']]

In [267]:
def rising_temperature(weather: pd.DataFrame) -> pd.DataFrame:
    weather['recordDate'] = pd.to_datetime(weather['recordDate'])
    weather2 = weather.copy()
    weather2['recordDate'] = weather2['recordDate'] + pd.Timedelta(days=1) # temp yesterday
    df = pd.merge(weather, weather2, how='left', on='recordDate') # x is today, y is yesterday
    return df[ df['temperature_x'] - df['temperature_y'] > 0 ][['id_x']].rename(columns={'id_x':'id'})


In [272]:
data = [[1, '2015-01-01', 10], [2, '2015-01-02', 25], [3, '2015-01-03', 20], [4, '2015-01-04', 30]]
# data = [[1, '2015-01-01', 10], [2, '2015-01-02', 25]]
weather = pd.DataFrame(data, columns=['id', 'recordDate', 'temperature']).astype({'id':'Int64', 'recordDate':'datetime64[ns]', 'temperature':'Int64'})

In [271]:
rising_temperature(weather)

Unnamed: 0,id
1,2


### 1148. Article Views I [E]

Write a solution to find all the authors that viewed at least one of their own articles.

Return the result table sorted by id in ascending order.

In [277]:
# Beats 7.45%
def article_views(views: pd.DataFrame) -> pd.DataFrame:
    return views[ views['author_id']==views['viewer_id'] ].groupby('author_id', as_index=False).agg({'article_id':'count'})[['author_id']].rename(columns={'author_id':'id'}).sort_values('id', ascending=True)

In [None]:
# Beats 81.56
def article_views(views: pd.DataFrame) -> pd.DataFrame:
    return views.loc[ views['author_id']==views['viewer_id'] , ['author_id']].drop_duplicates().rename(columns={'author_id':'id'}).sort_values('id', ascending=True)

In [274]:
data = [[1, 3, 5, '2019-08-01'], [1, 3, 6, '2019-08-02'], [2, 7, 7, '2019-08-01'], [2, 7, 6, '2019-08-02'], [4, 7, 1, '2019-07-22'], [3, 4, 4, '2019-07-21'], [3, 4, 4, '2019-07-21']]
views = pd.DataFrame(data, columns=['article_id', 'author_id', 'viewer_id', 'view_date']).astype({'article_id':'Int64', 'author_id':'Int64', 'viewer_id':'Int64', 'view_date':'datetime64[ns]'})

In [None]:
article_views(views)

Unnamed: 0,id
0,4
1,7


### 175. Combine Two Tables [E]

Write a solution to report the first name, last name, city, and state of each person in the Person table. If the address of a personId is not present in the Address table, report null instead.

Return the result table in any order.

In [284]:
# Beats 36.82%
def combine_two_tables(person: pd.DataFrame, address: pd.DataFrame) -> pd.DataFrame:
    # We want: first name, last name, city, state (for people in person table)
    return pd.merge(left=person, right=address, how='left', on='personId')[['firstName','lastName','city','state']]
    # personid firstName lastName addressId city state

In [280]:
data = [[1, 'Wang', 'Allen'], [2, 'Alice', 'Bob']]
person = pd.DataFrame(data, columns=['personId', 'firstName', 'lastName']).astype({'personId':'Int64', 'firstName':'object', 'lastName':'object'})
data = [[1, 2, 'New York City', 'New York'], [2, 3, 'Leetcode', 'California']]
address = pd.DataFrame(data, columns=['addressId', 'personId', 'city', 'state']).astype({'addressId':'Int64', 'personId':'Int64', 'city':'object', 'state':'object'})

In [285]:
combine_two_tables(person, address) 

Unnamed: 0,firstName,lastName,city,state
0,Wang,Allen,,
1,Alice,Bob,New York City,New York


### 181. Employees Earning More Than Their Managers [E]

Write a solution to find the employees who earn more than their managers.

Return the result table in any order.

In [453]:
# Beats 90.93%
def find_employees(employee: pd.DataFrame) -> pd.DataFrame:
    if employee.shape[0] < 1:
        return pd.DataFrame(None, columns=['Employee'])
    manager = employee.copy()
    manager = manager[['id','name','salary']].rename(columns = {'id':'managerId', 'name':'manager_name', 'salary':'manager_salary'}) # managerId manager_name manager_salary
    df = pd.merge(employee, manager, how='inner', on='managerId') # id name salary managerId manager_name manager_salary
    if df.shape[0] < 1:
        return pd.DataFrame(None, columns=['Employee'])
    else:
        return df[ df['salary'] > df['manager_salary'] ][['name']].rename(columns = {'name':'Employee'})

In [447]:
data = [[1, 'Joe', 70000, 3], [2, 'Henry', 80000, 4], [3, 'Sam', 60000, None], [4, 'Max', 90000, None]]
employee = pd.DataFrame(data, columns=['id', 'name', 'salary', 'managerId']).astype({'id':'Int64', 'name':'object', 'salary':'Int64', 'managerId':'Int64'})

In [454]:
find_employees(employee)

Unnamed: 0,Employee
0,Joe


### 1193. Monthly Transactions I [M]

Write an SQL query to find for each month and country, the number of transactions and their total amount, the number of approved transactions and their total amount.

Return the result table in any order.

In [306]:
def monthly_transactions(transactions: pd.DataFrame) -> pd.DataFrame:
    # transactions['month'] = pd.to_datetime(transactions['trans_date']).dt.to_period('M')
    # transactions.loc[ transactions['country'].isna(), transactions['country'] ] = 'null'

    transactions['month'] = pd.to_datetime(transactions['trans_date']).dt.strftime('%Y-%m')
    
    trans = transactions.groupby(['month','country'], as_index=False, dropna=False).agg(trans_count=('state','count'), trans_total_amount=('amount','sum'))
    trans_approved = transactions[ transactions['state']=='approved' ].groupby(['month','country'], as_index=False, dropna=False).agg(approved_count=('state','count'), approved_total_amount=('amount','sum'))
    
    if trans.shape[0] == 0:
        return pd.DataFrame(None, columns=['month','country','trans_count','approved_count','trans_total_amount','approved_total_amount'])
    
    df = pd.merge(trans, trans_approved, how='left', on=['month','country'])[['month','country','trans_count','approved_count','trans_total_amount','approved_total_amount']]

    # Fix approved ds NAs if exist
    df.loc[ df['approved_count'].isna(), ['approved_count'] ] = 0
    df.loc[ df['approved_total_amount'].isna(), ['approved_total_amount'] ] = 0

    return df

In [299]:
data = [[121, 'US', 'approved', 1000, '2018-12-18'], [122, 'US', 'declined', 2000, '2018-12-19'], [123, 'US', 'approved', 2000, '2019-01-01'], [124, 'DE', 'approved', 2000, '2019-01-07']]
data = [[121, 'US', 'declined', 1000, '2018-12-18'], [122, 'US', 'declined', 2000, '2018-12-19'], [123, 'US', 'declined', 2000, '2019-01-01'], [124, 'DE', 'declined', 2000, '2019-01-07']]
transactions = pd.DataFrame(data, columns=['id', 'country', 'state', 'amount', 'trans_date']).astype({'id':'Int64', 'country':'object', 'state':'object', 'amount':'Int64', 'trans_date':'datetime64[ns]'})

In [305]:
monthly_transactions(transactions)

Unnamed: 0,month,country,trans_count,approved_count,trans_total_amount,approved_total_amount
0,2018-12,US,2,0.0,3000,0
1,2019-01,DE,1,0.0,2000,0
2,2019-01,US,1,0.0,2000,0


### 595. Big Countries [E]

A country is big if:

* it has an area of at least three million (i.e., 3000000 km2), or

* it has a population of at least twenty-five million (i.e., 25000000).

Write a solution to find the name, population, and area of the big countries.

Return the result table in any order.

In [310]:
def big_countries(world: pd.DataFrame) -> pd.DataFrame:
    return world[ (world['area'] >= 3000000) | (world['population'] >= 25000000) ][['name','population','area']]

In [307]:
data = [['Afghanistan', 'Asia', 652230, 25500100, 20343000000], ['Albania', 'Europe', 28748, 2831741, 12960000000], ['Algeria', 'Africa', 2381741, 37100000, 188681000000], ['Andorra', 'Europe', 468, 78115, 3712000000], ['Angola', 'Africa', 1246700, 20609294, 100990000000]]
world = pd.DataFrame(data, columns=['name', 'continent', 'area', 'population', 'gdp']).astype({'name':'object', 'continent':'object', 'area':'Int64', 'population':'Int64', 'gdp':'Int64'})

In [311]:
big_countries(world)

Unnamed: 0,name,population,area
0,Afghanistan,25500100,652230
2,Algeria,37100000,2381741


### 1661. Average Time of Process per Machine [E]

There is a factory website that has several machines each running the same number of processes. Write a solution to find the average time each machine takes to complete a process.

The time to complete a process is the 'end' timestamp minus the 'start' timestamp. The average time is calculated by the total time to complete every process on the machine divided by the number of processes that were run.

The resulting table should have the machine_id along with the average time as processing_time, which should be rounded to 3 decimal places.

Return the result table in any order.


In [319]:
def get_average_time(activity: pd.DataFrame) -> pd.DataFrame:
    # machine_id processing_time (rounded to 3)
    # processing_time = total_time / #_processes

    if activity.shape[0] == 0:
        return pd.DataFrame(None, columns=['machine_id','processing_time'])

    activity_start = activity[ activity['activity_type'] == 'start' ][['machine_id','process_id','timestamp']].rename(columns={'timestamp':'time_start'})
    activity_end = activity[ activity['activity_type'] == 'end' ][['machine_id','process_id','timestamp']].rename(columns={'timestamp':'time_end'})

    df = pd.merge(activity_start, activity_end, how='left', on=['machine_id','process_id']) # machine_id process_id time_start time_end

    df['process_time'] = df['time_end'] - df['time_start']  # machine_id process_id time_start time_end process_time

    df_grouped = df.groupby(['machine_id'], as_index=False).agg(process_count=('process_id','count'), process_total_time=('process_time','sum')) # machine_id process_count process_total_time

    df_grouped['processing_time'] = round(df_grouped['process_total_time']/df_grouped['process_count'] + 1e-9, 3)

    return df_grouped[['machine_id','processing_time']]
    

In [312]:
data = [[0, 0, 'start', 0.712], [0, 0, 'end', 1.52], [0, 1, 'start', 3.14], [0, 1, 'end', 4.12], [1, 0, 'start', 0.55], [1, 0, 'end', 1.55], [1, 1, 'start', 0.43], [1, 1, 'end', 1.42], [2, 0, 'start', 4.1], [2, 0, 'end', 4.512], [2, 1, 'start', 2.5], [2, 1, 'end', 5]]
activity = pd.DataFrame(data, columns=['machine_id', 'process_id', 'activity_type', 'timestamp']).astype({'machine_id':'Int64', 'process_id':'Int64', 'activity_type':'object', 'timestamp':'Float64'})

In [320]:
get_average_time(activity)

Unnamed: 0,machine_id,processing_time
0,0,0.894
1,1,0.995
2,2,1.456


### 1378. Replace Employee ID With The Unique Identifier [E]

Write a solution to show the unique ID of each user, If a user does not have a unique ID just show null.

Return the result table in any order.

In [333]:
def replace_employee_id(employees: pd.DataFrame, employee_uni: pd.DataFrame) -> pd.DataFrame:
    if employees.shape[0] == 0:
        return pd.DataFrame(None, columns=['unique_id','name'])
    df = pd.merge(employees, employee_uni, how='left', on='id')[['unique_id','name']] # use how='left' in case employee_uni is empty
    return df

In [325]:
data = [[1, 'Alice'], [7, 'Bob'], [11, 'Meir'], [90, 'Winston'], [3, 'Jonathan']]
employees = pd.DataFrame(data, columns=['id', 'name']).astype({'id':'int64', 'name':'object'})
data = [[3, 1], [11, 2], [90, 3]]
employee_uni = pd.DataFrame(data, columns=['id', 'unique_id']).astype({'id':'int64', 'unique_id':'int64'})

In [334]:
replace_employee_id(employees, employee_uni)

Unnamed: 0,unique_id,name
0,,Alice
1,,Bob
2,2.0,Meir
3,3.0,Winston
4,1.0,Jonathan
