# Работа над ошибками

In [1]:
import pandas as pd
import numpy as np

#### **Pandas.** Задание 1. Сумма за последний месяц для каждого юзера

In [2]:
# Sample orders: users 1-3 each place one order on the first day of
# January, February and March 2023 (dates are day.month.year strings).
dataset = {
    'user_id': [1, 2, 3] * 3,
    'datetime': ['01.01.2023'] * 3 + ['01.02.2023'] * 3 + ['01.03.2023'] * 3,
    'order_value': [5] * 3 + [10] * 3 + [15] * 3,
}

In [3]:
# Build the frame and parse the day.month.year strings in a single chain;
# strings that do not match the format become NaT (errors='coerce').
df = pd.DataFrame(dataset).assign(
    datetime=lambda frame: pd.to_datetime(
        frame['datetime'], format='%d.%m.%Y', errors='coerce'
    )
)

In [4]:
# For every row, the timestamp of that user's most recent order.
# The string alias 'max' is used instead of the builtin `max`, which pandas
# deprecates as a groupby callable.
df['max_month'] = df.groupby('user_id')['datetime'].transform('max')

# Compare year-month periods rather than bare month numbers, so that the same
# month of a different year is not mistaken for the user's last month.
in_last_month = (df['datetime'].dt.to_period('M')
                 == df['max_month'].dt.to_period('M'))

# Total order value per user within that user's last month.
df_result = (df[in_last_month]
             .groupby(['user_id'], as_index=False)['order_value'].sum())

In [5]:
# Preview the per-user totals (head() returns the first five rows by default).
df_result.head()

Unnamed: 0,user_id,order_value
0,1,15
1,2,15
2,3,15


#### **Pandas.** Задание 2. Работа с пропусками в датафрейме

In [6]:
dataset = {'user_id':[1,2,3,1,2,3,1,2,3],
           'month':[1,1,1,2,2,2,3,3,3],
           'order_value':[5,None,5,None,10,10,15,15,None]}
           
df = pd.DataFrame(data=dataset)

In [7]:
# Mean order value of each month, broadcast back onto every row of that month.
# The string alias 'mean' is used instead of np.mean, which pandas deprecates
# as a groupby callable.
month_mean = df.groupby('month')['order_value'].transform('mean')

# Fill only the gaps in `order_value`; each missing entry takes the value at
# the same index in `month_mean`, i.e. the mean of its own month.
values = {'order_value': month_mean}
df_result = df.fillna(value=values)

In [8]:
# Show up to ten rows of the frame after the gaps have been filled.
df_result.head(10)

Unnamed: 0,user_id,month,order_value
0,1,1,5.0
1,2,1,5.0
2,3,1,5.0
3,1,2,10.0
4,2,2,10.0
5,3,2,10.0
6,1,3,15.0
7,2,3,15.0
8,3,3,15.0


#### **SQL.** Задание 1. Посчитать количество новых клиентов в разбивке по дате+часам.

In [9]:
# Клиента считаем новым, если по нему не было записей не менее 24 часов до события.

In [10]:
%%capture
# Install and start a local PostgreSQL server (requires an apt-based system).
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql
!sudo service postgresql start

# Set the password of the `postgres` role to `postgres`.
# NOTE(review): presumably a throwaway sandbox credential - confirm before reuse.
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';"

# Recreate an empty database named `db` (any existing one is dropped first).
!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS db;'
!sudo -u postgres psql -U postgres -c 'CREATE DATABASE db;'

In [11]:
from sqlalchemy import create_engine
import psycopg2

In [12]:
# Connection URL pieces in template order: user, password, host, port, database.
url_parts = ('postgres', 'postgres', 'localhost', '5432', 'db')
endpoint = "postgresql://{}:{}@{}:{}/{}".format(*url_parts)
print(endpoint)

postgresql://postgres:postgres@localhost:5432/db


In [13]:
# Create the SQLAlchemy engine for the connection URL held in `endpoint`.
con =  create_engine(endpoint)

In [14]:
# Event log described as (timestamp, clients active at that timestamp);
# timestamps are day.month.year hour:minute:second strings.
_event_batches = [
    ('01.01.2023 12:00:00', (1, 2, 3)),
    ('01.01.2023 23:00:00', (1, 2, 4)),
    ('04.01.2023 20:00:00', (2, 3, 4)),
    ('05.01.2023 12:00:00', (1, 3, 4)),
    ('07.01.2023 12:00:00', (5, 1, 3)),
]

# Flatten the batches into two parallel columns, one row per (client, event).
dataset = {
    'client_id': [cid for _, clients in _event_batches for cid in clients],
    'datetime': [stamp for stamp, clients in _event_batches for _ in clients],
}

In [15]:
# Build the events frame and parse the timestamp strings in a single chain;
# strings that do not match the format become NaT (errors='coerce').
df = pd.DataFrame(dataset).assign(
    datetime=lambda frame: pd.to_datetime(
        frame['datetime'], format='%d.%m.%Y %H:%M:%S', errors='coerce'
    )
)

In [16]:
# Preview the first five events after timestamp parsing.
df.head()

Unnamed: 0,client_id,datetime
0,1,2023-01-01 12:00:00
1,2,2023-01-01 12:00:00
2,3,2023-01-01 12:00:00
3,1,2023-01-01 23:00:00
4,2,2023-01-01 23:00:00


In [17]:
# Write the frame to the `events` table, replacing the table if it already
# exists; the DataFrame index is not stored as a column.
df.to_sql('events', con, index=False, if_exists='replace')

In [18]:
def select_postgresql(sql):
    """Run a SQL query through the module-level connection ``con``.

    Args:
        sql: Query passed unchanged to ``pandas.read_sql``.

    Returns:
        Whatever ``pandas.read_sql`` returns for the query.
    """
    result = pd.read_sql(sql=sql, con=con)
    return result

In [19]:
# Exploratory query: for every event show its date, its hour, the same
# client's previous event (NULL for the first one) and the gap to it rounded
# to whole hours (0 when there is no previous event).
# Rows are sorted by client and then by time: sorting by client_id alone
# leaves the order of one client's events unspecified.
sql = '''select e.*,
                coalesce(extract(epoch from e.datetime-e.datetime_last)/3600,0) ::integer as hour_diff
         from (select e.*,
                      e.datetime::date as dt,
                      extract(hour from e.datetime)::integer as hour,
                      lag(e.datetime) over (partition by e.client_id order by e.datetime) as datetime_last
               from events as e) as e
         order by e.client_id, e.datetime'''

In [20]:
# Run the query currently held in `sql` and display the resulting frame.
select_postgresql(sql)

Unnamed: 0,client_id,datetime,dt,hour,datetime_last,hour_diff
0,1,2023-01-01 12:00:00,2023-01-01,12,NaT,0
1,1,2023-01-01 23:00:00,2023-01-01,23,2023-01-01 12:00:00,11
2,1,2023-01-05 12:00:00,2023-01-05,12,2023-01-01 23:00:00,85
3,1,2023-01-07 12:00:00,2023-01-07,12,2023-01-05 12:00:00,48
4,2,2023-01-01 12:00:00,2023-01-01,12,NaT,0
5,2,2023-01-01 23:00:00,2023-01-01,23,2023-01-01 12:00:00,11
6,2,2023-01-04 20:00:00,2023-01-04,20,2023-01-01 23:00:00,69
7,3,2023-01-01 12:00:00,2023-01-01,12,NaT,0
8,3,2023-01-04 20:00:00,2023-01-04,20,2023-01-01 12:00:00,80
9,3,2023-01-05 12:00:00,2023-01-05,12,2023-01-04 20:00:00,16


In [21]:
# Count new clients per calendar date and hour.
# "New" is decided on the raw timestamps instead of a rounded hour count:
#   * no previous event at all (datetime_last is NULL), or
#   * the previous event is at least 24 hours older than the current one.
# This keeps a repeated event at the same timestamp (gap of 0) from being
# counted as new, includes a gap of exactly 24 hours, and avoids the
# rounding of fractional hours around the 24-hour boundary.
sql = '''with tbl as (select e.client_id,
                             e.datetime,
                             e.datetime::date as dt,
                             extract(hour from e.datetime)::integer as hour,
                             lag(e.datetime) over (partition by e.client_id order by e.datetime) as datetime_last
                      from events as e)

          -- New clients: first event ever, or no events during the preceding 24 hours
          select t.dt,
                 t.hour,
                 count(*) as number_new_customers
          from tbl as t
          where t.datetime_last is null
             or t.datetime - t.datetime_last >= interval '24 hours'
          group by t.dt, t.hour
          order by t.dt, t.hour'''

In [22]:
# Run the query currently held in `sql` and display the resulting frame.
select_postgresql(sql)

Unnamed: 0,dt,hour,number_new_customers
0,2023-01-01,12,3
1,2023-01-01,23,1
2,2023-01-04,20,3
3,2023-01-05,12,1
4,2023-01-07,12,3
