In [1]:
# Below is a table schema for a P2P messaging application. 
# The table contains send/receive message data for the application's users.
# Column Name 	Data Type 	Description
# date 	string 	date of the message sent/received, format is 'YYYY-mm-dd'
# timestamp 	integer 	timestamp of the message sent/received, epoch seconds
# sender_id 	integer 	id of the message sender
# receiver_id 	integer 	id of the message receiver
# Question: Using Python and the Pandas library, 
# how would you find the fraction of messages that get a response within 5 minutes?
# For simplicity, let's limit data to Jan 1, 2019. 
import io

import pandas as pd

# Mock data. Seven of these messages fall on 2019-01-01, and three of them get
# a reply from their receiver within 5 minutes (rows 0 and 1 are both answered
# by row 2, row 5 is answered by row 6), so the expected fraction is 3/7.
# The final row (2019-03-09) lies outside the Jan 1 window the question asks about.
s = """date,timestamp,sender_id,receiver_id
2019-01-01,1546322000,1,2
2019-01-01,1546322050,1,2
2019-01-01,1546322100,2,1
2019-01-01,1546325500,3,1
2019-01-01,1546329200,1,3
2019-01-01,1546329300,3,2
2019-01-01,1546329400,2,3
2019-03-09,1552111656,1,3
"""
# pd.compat.StringIO is not available in current pandas releases; the standard
# library's io.StringIO is the supported way to pass an in-memory CSV string
# to read_csv.
df = pd.read_csv(io.StringIO(s))


In [4]:
# Fraction of the messages sent on TARGET_DATE that get a reply within
# RESPONSE_WINDOW_SECONDS.
# A reply to a message is a *later* message travelling in the opposite
# direction between the same two users, i.e.
#   reply.sender_id == original.receiver_id and
#   reply.receiver_id == original.sender_id.
# Only replies that are themselves dated TARGET_DATE are considered, because
# both sides of the self-join use the date-filtered frame.
TARGET_DATE = '2019-01-01'
RESPONSE_WINDOW_SECONDS = 5 * 60

d = df[df["date"] == TARGET_DATE]

# Tag every message with a positional id before the self-join so that a
# message answered several times inside the window is still counted once.
candidate_pairs = d.assign(msg_id=range(len(d))).merge(
    d,
    left_on=['sender_id', 'receiver_id'],
    right_on=['receiver_id', 'sender_id'],
    suffixes=('_sent', '_reply'),
)
reply_delay = candidate_pairs['timestamp_reply'] - candidate_pairs['timestamp_sent']
# Strictly after the original message, and no later than the window allows.
replied_in_time = (reply_delay > 0) & (reply_delay <= RESPONSE_WINDOW_SECONDS)

msg_responded = candidate_pairs.loc[replied_in_time, 'msg_id'].nunique()
msg_sent = len(d)
# A day with no messages has no defined fraction; report NaN instead of
# raising ZeroDivisionError.
fraction_responded = msg_responded / msg_sent if msg_sent else float('nan')
print('{} msg sent'.format(msg_sent))
print('{} msg responded'.format(msg_responded))
print('{:.2f} fraction of msg responded to within 5m'.format(fraction_responded))
d

7 msg sent
3 msg responded
0.43 fraction of msg responded to within 5m


Unnamed: 0,date,timestamp,sender_id,receiver_id
0,2019-01-01,1546322000,1,2
1,2019-01-01,1546322050,1,2
2,2019-01-01,1546322100,2,1
3,2019-01-01,1546325500,3,1
4,2019-01-01,1546329200,1,3
5,2019-01-01,1546329300,3,2
6,2019-01-01,1546329400,2,3
