In [1]:
import pandas as pd
import sqlite3
from datetime import datetime
from sqlite3 import Error

In [2]:
df = pd.read_csv('final_transactions.csv', parse_dates=['TX_DATETIME'])
df.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT
0,0,2023-01-01 00:00:31,596,3156,533.07
1,1,2023-01-01 00:02:10,4961,3412,808.56
2,2,2023-01-01 00:07:56,2,1365,1442.94
3,3,2023-01-01 00:09:29,4128,8737,620.65
4,4,2023-01-01 00:10:34,927,9906,490.66


In [3]:
df_transaction_bd = df[['TX_DATETIME', 'CUSTOMER_ID', 'TX_AMOUNT']]
df_transaction_bd.head()

Unnamed: 0,TX_DATETIME,CUSTOMER_ID,TX_AMOUNT
0,2023-01-01 00:00:31,596,533.07
1,2023-01-01 00:02:10,4961,808.56
2,2023-01-01 00:07:56,2,1442.94
3,2023-01-01 00:09:29,4128,620.65
4,2023-01-01 00:10:34,927,490.66


In [4]:
df_transaction_bd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   TX_DATETIME  1048575 non-null  datetime64[ns]
 1   CUSTOMER_ID  1048575 non-null  int64         
 2   TX_AMOUNT    1048575 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 24.0 MB


In [5]:
def create_connection(path):
    connection = None
    try:
        connection = sqlite3.connect(path)
        print("Connection to SQLite DB successful")
    except Error as e:
        print(f"The error '{e}' occurred")

    return connection

In [6]:
def execute_query(connection, query):
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
        print("Query executed successfully")
    except Error as e:
        print(f"The error '{e}' occurred")

In [7]:
connection = create_connection('database.db')

Connection to SQLite DB successful


In [8]:
cursor = sqlite3.Cursor(connection)

In [9]:
sql_create_table = '''
                        CREATE TABLE IF NOT EXISTS transaction_bd (
                                            "TX_DATETIME" TEXT,
                                            "CUSTOMER_ID" INTEGER,
                                            "TX_AMOUNT" REAL
                                                        )
                     '''

In [10]:
execute_query(connection, sql_create_table)

Query executed successfully


In [11]:
df_transaction_bd.to_sql('transaction_bd', connection, if_exists='replace', index=False)

1048575

---

•	A. Вывести всех клиентов, у которых сумма транзакций больше 700000 за весь период (сортируя клиентов по возрастанию);

In [12]:
df_task_1 = df_transaction_bd.groupby('CUSTOMER_ID', as_index=False)['TX_AMOUNT'].sum()
df_task_1.query('TX_AMOUNT > 700000').sort_values('TX_AMOUNT').reset_index(drop=True).rename(columns={'TX_AMOUNT': 'TOTAL_AMOUNT'})

Unnamed: 0,CUSTOMER_ID,TOTAL_AMOUNT
0,2249,707478.64
1,3116,721980.69
2,389,753411.9
3,4163,765153.63
4,2891,786115.87


In [13]:
sql_task_a = '''
            SELECT
                CUSTOMER_ID,
                sum(TX_AMOUNT) as TOTAL_AMOUNT
            FROM
                transaction_bd
            GROUP by
                CUSTOMER_ID
            HAVING TOTAL_AMOUNT > 700000
            ORDER by TOTAL_AMOUNT
'''

pd.read_sql(sql_task_a, connection)

Unnamed: 0,CUSTOMER_ID,TOTAL_AMOUNT
0,2249,707478.64
1,3116,721980.69
2,389,753411.9
3,4163,765153.63
4,2891,786115.87


---

•	B. Вывести всех клиентов, у которых сумма транзакций больше 200000 за период 01.01.2023 - 13.01.2023 (сортируя клиентов по возрастанию);

In [14]:
df_task_b = df_transaction_bd[(datetime(2023, 1, 1) <= df_transaction_bd['TX_DATETIME']) & (df_transaction_bd['TX_DATETIME'] <= datetime(2023, 1, 13, 23, 59, 59))]
df_task_b.groupby('CUSTOMER_ID', as_index=False)['TX_AMOUNT'].sum().query('TX_AMOUNT > 200000').sort_values('TX_AMOUNT').reset_index(drop=True).rename(columns={'TX_AMOUNT': 'TOTAL_AMOUNT'})

Unnamed: 0,CUSTOMER_ID,TOTAL_AMOUNT
0,3406,207733.48
1,4252,220650.36
2,1918,241299.96
3,3833,269107.4


In [15]:
sql_task_b = '''
                    SELECT
                        CUSTOMER_ID,
                        sum(TX_AMOUNT) as TOTAL_AMOUNT
                    FROM
                        transaction_bd
                    WHERE TX_DATETIME BETWEEN '2023-01-01 00:00:00' AND '2023-01-13 23:59:59'
                    GROUP by CUSTOMER_ID
                    HAVING TOTAL_AMOUNT > 200000
                    ORDER by TOTAL_AMOUNT
'''

pd.read_sql(sql_task_b, connection)

Unnamed: 0,CUSTOMER_ID,TOTAL_AMOUNT
0,3406,207733.48
1,4252,220650.36
2,1918,241299.96
3,3833,269107.4


---

•	C. Вывести тех клиентов, у которых id начинается с 4 и количество транзакций за весь период более 444;

In [23]:
df_task_c = (df_transaction_bd[df_transaction_bd['CUSTOMER_ID']
                               .apply(lambda x: str(x).startswith('4'))])
(df_task_c.groupby('CUSTOMER_ID', as_index=False)['TX_AMOUNT'].count()
                                 .query('TX_AMOUNT > 444').sort_values('TX_AMOUNT'))

Unnamed: 0,CUSTOMER_ID,TX_AMOUNT
648,4539,445
770,4661,449
342,4231,451


In [24]:
sql_task_c = '''
                    SELECT
                        CUSTOMER_ID,
                        count(TX_AMOUNT) as COUNT_AMOUNT
                    FROM
                        (SELECT
                            *
                        FROM
                            transaction_bd
                        WHERE substr(CUSTOMER_ID, 1, 1) = '4')
                    GROUP by CUSTOMER_ID
                    HAVING COUNT_AMOUNT > 444
                    ORDER by COUNT_AMOUNT
                '''

pd.read_sql(sql_task_c, connection)

Unnamed: 0,CUSTOMER_ID,COUNT_AMOUNT
0,4539,445
1,4661,449
2,4231,451


In [25]:
# Добавление столбца 'INCOME_LEVEL' с флагом доходности клиентов
cursor.execute('ALTER TABLE transaction_bd ADD COLUMN INCOME_LEVEL TEXT')

<sqlite3.Cursor at 0x2bd09ae8a40>

In [26]:
# Заполнение столбца 'INCOME_LEVEL' согласно заданию
cursor.execute('''UPDATE transaction_bd SET INCOME_LEVEL = t.TOTAL_AMOUNT
                    FROM
                        (SELECT
                            CUSTOMER_ID,
                            CASE
                                WHEN sum(TX_AMOUNT) <= 50000 THEN 'низкая доходность'
                                WHEN sum(TX_AMOUNT) <= 100000 THEN 'средняя доходность'
                                ELSE 'высокая доходность'
                            END  as TOTAL_AMOUNT
                        FROM
                            transaction_bd
                        GROUP BY CUSTOMER_ID) as t
                    WHERE transaction_bd.CUSTOMER_ID=t.CUSTOMER_ID''')

<sqlite3.Cursor at 0x2bd09ae8a40>

In [27]:
connection.commit()

In [29]:
df_type_amount = df_transaction_bd.groupby('CUSTOMER_ID', as_index=False)['TX_AMOUNT'].sum().rename(columns={'TX_AMOUNT': 'TOTAL_AMOUNT'})
df_type_amount.head()

Unnamed: 0,CUSTOMER_ID,TOTAL_AMOUNT
0,0,119568.47
1,1,323510.82
2,2,162541.9
3,3,2406.56
4,4,300589.33


In [30]:
df_type_amount['INCOME_LEVEL'] = df_type_amount['TOTAL_AMOUNT'].apply(lambda x: 'низкая доходность' if x <= 50000 else 'средняя доходность' if x <= 100000 else 'высокая доходность')
df_type_amount.head()

Unnamed: 0,CUSTOMER_ID,TOTAL_AMOUNT,INCOME_LEVEL
0,0,119568.47,высокая доходность
1,1,323510.82,высокая доходность
2,2,162541.9,высокая доходность
3,3,2406.56,низкая доходность
4,4,300589.33,высокая доходность


In [34]:
df_transaction_bd = df_transaction_bd.merge(df_type_amount, how='inner', on='CUSTOMER_ID')[['TX_DATETIME', 'CUSTOMER_ID', 'TX_AMOUNT', 'INCOME_LEVEL']]
df_transaction_bd.head()

Unnamed: 0,TX_DATETIME,CUSTOMER_ID,TX_AMOUNT,INCOME_LEVEL
0,2023-01-01 00:00:31,596,533.07,высокая доходность
1,2023-01-01 14:32:25,596,558.99,высокая доходность
2,2023-01-01 18:33:13,596,589.15,высокая доходность
3,2023-01-01 21:28:48,596,612.09,высокая доходность
4,2023-01-02 13:19:16,596,166.47,высокая доходность


In [39]:
df_transaction_bd.groupby(['INCOME_LEVEL', 'CUSTOMER_ID'], as_index=False)['TX_AMOUNT'].count().groupby('INCOME_LEVEL', as_index=False)['CUSTOMER_ID'].count()

Unnamed: 0,INCOME_LEVEL,CUSTOMER_ID
0,высокая доходность,2167
1,низкая доходность,1812
2,средняя доходность,1007


In [29]:
sql_task_e = '''
                SELECT
                    INCOME_LEVEL,
                    count(CUSTOMER_ID) as COUNT_CUSTOMER
                FROM
                    (SELECT
                        CUSTOMER_ID,
                        INCOME_LEVEL
                    FROM
                        transaction_bd
                    GROUP by
                        CUSTOMER_ID,
                        INCOME_LEVEL)
                GROUP by INCOME_LEVEL
'''
pd.read_sql(sql_task_e, connection)

Unnamed: 0,INCOME_LEVEL,COUNT_CUSTOMER
0,высокая доходность,2167
1,низкая доходность,1812
2,средняя доходность,1007


---

•	F. Вывести сумму транзакций за каждый день (сортируя дни по возрастанию).

In [40]:
df_transaction_bd['DATE'] = df_transaction_bd.TX_DATETIME.apply(datetime.date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transaction_bd['DATE'] = df_transaction_bd.TX_DATETIME.apply(datetime.date)


In [42]:
df_transaction_bd.head()

Unnamed: 0,TX_DATETIME,CUSTOMER_ID,TX_AMOUNT,DATE
0,2023-01-01 00:00:31,596,533.07,2023-01-01
1,2023-01-01 00:02:10,4961,808.56,2023-01-01
2,2023-01-01 00:07:56,2,1442.94,2023-01-01
3,2023-01-01 00:09:29,4128,620.65,2023-01-01
4,2023-01-01 00:10:34,927,490.66,2023-01-01


In [48]:
df_transaction_bd.groupby('DATE', as_index=False)['TX_AMOUNT'].sum().sort_values('DATE').rename(columns={'TX_AMOUNT': 'TOTAL_AMOUNT'})

Unnamed: 0,DATE,TOTAL_AMOUNT
0,2023-01-01,4827656.26
1,2023-01-02,4862551.41
2,2023-01-03,5058973.71
3,2023-01-04,4938142.47
4,2023-01-05,5002954.23
...,...,...
105,2023-04-16,5299386.81
106,2023-04-17,5100973.31
107,2023-04-18,5233557.39
108,2023-04-19,5194846.44


In [49]:
sql_task_f = '''
                SELECT
                    date(TX_DATETIME) as DATE,
                    sum(TX_AMOUNT) as TOTAL_AMOUNT
                FROM
                    transaction_bd
                GROUP by DATE
                ORDER by DAtE
            '''
pd.read_sql(sql_task_f, connection)

Unnamed: 0,DATE,TOTAL_AMOUNT
0,2023-01-01,4827656.26
1,2023-01-02,4862551.41
2,2023-01-03,5058973.71
3,2023-01-04,4938142.47
4,2023-01-05,5002954.23
...,...,...
105,2023-04-16,5299386.81
106,2023-04-17,5100973.31
107,2023-04-18,5233557.39
108,2023-04-19,5194846.44


## Подготовка дашборда

In [None]:
from dash import Dash, html, dcc
import plotly.express as px
import pandas as pd

app = Dash()

fig1 = px.bar(data, x="date", y="num_orders", title = 'Зависимость числа бронирований от даты')

fig2 = px.pie(data_apart, values='num_orders', names='repeated_guest')

app.layout = html.Div(children=[
    html.H1(children='Данные по сервису бронирования'),

    html.Div(children='''
        Анализ данных по бронированию апартаментов.
    '''),

    dcc.Graph(
        id='example-graph',
        figure=fig1
    ),
     dcc.Graph(
        id='example-graph1',
        figure=fig2
    )
])

app.run_server()

In [12]:
# !pip install dash

In [14]:
# !pip install jupyter-dash

In [20]:
connection.close()