## Create Test and Control groups

In [1]:
import pandas as pd
import sqlite3

In [2]:
# Open the SQLite database used throughout this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
db_path = '../../datasets/checking-logs.sqlite'
con = sqlite3.connect(db_path)

#### Create a new table datamart in the database by joining the tables pageviews and checker using only one query

In [3]:
# Build the datamart with a single CREATE TABLE ... AS query:
#   * filt_checker   - first-attempt ('ready', NumTrials = 1) commits on the
#                      selected labs; the commit timestamp becomes first_commit_ts.
#   * filt_pageviews - the earliest page view per user (first_view_ts).
# Both sources are restricted to uid LIKE 'user_%'. Previously only pageviews
# was filtered, so any other account present in checker would pass through the
# LEFT JOIN with a NULL first_view_ts and silently end up in the control group.
# The LEFT JOIN is deliberate: users without any page view are kept with a
# NULL first_view_ts.
create_datamart = """
CREATE TABLE datamart AS
WITH
filt_checker AS (
    SELECT
        uid,
        labname,
        timestamp AS first_commit_ts
    FROM checker
    WHERE uid LIKE 'user_%'
      AND status = 'ready'
      AND NumTrials = 1
      AND labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
),
filt_pageviews AS (
    SELECT
        uid,
        MIN(datetime) AS first_view_ts
    FROM pageviews
    WHERE uid LIKE 'user_%'
    GROUP BY uid
)
SELECT
    c.uid,
    c.labname,
    c.first_commit_ts,
    p.first_view_ts
FROM filt_checker c
LEFT JOIN filt_pageviews p ON c.uid = p.uid;
"""

# Rebuild the table on every run. CREATE TABLE IF NOT EXISTS would keep whatever
# an earlier run left in the database, so a changed query would never take
# effect and Restart & Run All would not be reproducible.
con.execute("DROP TABLE IF EXISTS datamart")
con.execute(create_datamart)

sql = """
SELECT *
FROM datamart
"""
# parse_dates already converts both columns to datetime64, so no additional
# pd.to_datetime pass is needed afterwards.
datamart = pd.read_sql(sql, con, parse_dates=['first_commit_ts', 'first_view_ts'])
print(datamart)

         uid   labname            first_commit_ts              first_view_ts
0     user_4  project1 2020-04-17 05:19:02.744528                        NaT
1     user_4    laba04 2020-04-17 11:33:17.366400                        NaT
2     user_4   laba04s 2020-04-17 11:48:41.992466                        NaT
3    user_17  project1 2020-04-18 07:56:45.408648 2020-04-18 10:56:55.833899
4    user_30    laba04 2020-04-18 13:36:53.971502 2020-04-17 22:46:26.785035
..       ...       ...                        ...                        ...
141  user_23    laba06 2020-05-21 08:34:10.517205                        NaT
142  user_19   laba06s 2020-05-21 13:27:06.705881 2020-04-21 20:30:38.034966
143  user_23   laba06s 2020-05-21 14:29:15.709568                        NaT
144  user_17    laba06 2020-05-21 15:21:31.567615 2020-04-18 10:56:55.833899
145  user_17   laba06s 2020-05-21 17:39:17.783615 2020-04-18 10:56:55.833899

[146 rows x 4 columns]


In [4]:
# Manual reset helper, intentionally left disabled: uncomment and run it to
# remove the datamart table from the database (e.g. before re-creating it).
#con.execute("DROP TABLE datamart")

#### Using Pandas methods, create two dataframes: test and control

##### Test should contain the users that have a value in first_view_ts

In [5]:
# Test group: the datamart rows that have a first_view_ts value.
# .copy() detaches the result from datamart so later edits cannot affect it.
has_view = datamart['first_view_ts'].notna()
test = datamart.loc[has_view].copy()
print(test)

         uid   labname            first_commit_ts              first_view_ts
3    user_17  project1 2020-04-18 07:56:45.408648 2020-04-18 10:56:55.833899
4    user_30    laba04 2020-04-18 13:36:53.971502 2020-04-17 22:46:26.785035
7    user_30   laba04s 2020-04-18 14:51:37.498399 2020-04-17 22:46:26.785035
8    user_14    laba04 2020-04-18 15:14:00.312338 2020-04-18 10:53:52.623447
11   user_14   laba04s 2020-04-18 22:30:30.247628 2020-04-18 10:53:52.623447
18   user_19    laba04 2020-04-20 19:05:01.297780 2020-04-21 20:30:38.034966
19   user_25    laba04 2020-04-20 19:16:50.673054 2020-05-09 23:54:54.260791
20   user_21    laba04 2020-04-21 17:48:00.487806 2020-04-22 22:40:36.824081
21   user_30  project1 2020-04-22 12:36:24.053518 2020-04-17 22:46:26.785035
23   user_21   laba04s 2020-04-22 20:09:21.857747 2020-04-22 22:40:36.824081
24   user_28    laba04 2020-04-22 21:47:19.707242 2020-05-10 21:07:50.350946
27   user_17    laba04 2020-04-23 14:24:29.947554 2020-04-18 10:56:55.833899

##### Control should have the users that have missing values in first_view_ts

In [6]:
# Control group: the datamart rows whose first_view_ts is missing (NaT).
# .copy() gives an independent frame, so filling the column later does not
# touch datamart.
no_view = datamart['first_view_ts'].isna()
control = datamart.loc[no_view].copy()
print(control)

         uid   labname            first_commit_ts first_view_ts
0     user_4  project1 2020-04-17 05:19:02.744528           NaT
1     user_4    laba04 2020-04-17 11:33:17.366400           NaT
2     user_4   laba04s 2020-04-17 11:48:41.992466           NaT
5     user_2    laba04 2020-04-18 13:42:35.482008           NaT
6     user_2   laba04s 2020-04-18 13:51:22.291271           NaT
..       ...       ...                        ...           ...
132   user_2   laba06s 2020-05-19 14:45:03.908268           NaT
138   user_6   laba06s 2020-05-20 14:50:07.609937           NaT
140   user_7   laba06s 2020-05-20 23:05:37.742597           NaT
141  user_23    laba06 2020-05-21 08:34:10.517205           NaT
143  user_23   laba06s 2020-05-21 14:29:15.709568           NaT

[87 rows x 4 columns]


##### Replace the missing values in the control with the average first_view_ts of the test users; we will use this value in the future analysis

In [7]:
# Average first-view timestamp of the test group.
# NOTE(review): the mean is taken over rows of `test`, so a user with several
# labs contributes several times - confirm this weighting is intended.
avg = test['first_view_ts'].mean()
print(avg)

# Substitute that average for every missing first_view_ts in the control group.
filled_view_ts = control['first_view_ts'].fillna(avg)
control = control.assign(first_view_ts=filled_view_ts)
print(control)

2020-04-27 00:40:05.761783552
         uid   labname            first_commit_ts  \
0     user_4  project1 2020-04-17 05:19:02.744528   
1     user_4    laba04 2020-04-17 11:33:17.366400   
2     user_4   laba04s 2020-04-17 11:48:41.992466   
5     user_2    laba04 2020-04-18 13:42:35.482008   
6     user_2   laba04s 2020-04-18 13:51:22.291271   
..       ...       ...                        ...   
132   user_2   laba06s 2020-05-19 14:45:03.908268   
138   user_6   laba06s 2020-05-20 14:50:07.609937   
140   user_7   laba06s 2020-05-20 23:05:37.742597   
141  user_23    laba06 2020-05-21 08:34:10.517205   
143  user_23   laba06s 2020-05-21 14:29:15.709568   

                    first_view_ts  
0   2020-04-27 00:40:05.761783552  
1   2020-04-27 00:40:05.761783552  
2   2020-04-27 00:40:05.761783552  
5   2020-04-27 00:40:05.761783552  
6   2020-04-27 00:40:05.761783552  
..                            ...  
132 2020-04-27 00:40:05.761783552  
138 2020-04-27 00:40:05.761783552  
140 2020-

##### Save both tables into the database; you will use them in the next exercises

In [8]:
# Persist both groups as tables in the database.
# if_exists='replace' overwrites tables left by an earlier run; index=False
# keeps the DataFrame index out of the stored tables.
# The value returned by the last call is what the cell displays.
save_options = dict(if_exists='replace', index=False)
test.to_sql('test', con, **save_options)
control.to_sql('control', con, **save_options)

87

In [9]:
# Release the SQLite connection; this is the last cell that uses it.
con.close()