In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
data = pd.read_excel('discount.xlsx')

### 1. Общая информация о данных

In [3]:
# посмотрим на набор данных
data.sample(10)

Unnamed: 0,cid,lid,value,term,before_dicount,after_discount,pc_discount,loyalty_level,ll_discount
18204,2069683,1504421,2600,15,1.0,0.98,,0.0,0.02
2705,4533106,1454894,7000,30,1.0,1.0,,,
8737,4348323,1476096,3500,37,1.0,0.98,,0.0,0.02
18973,4634752,1483374,19000,28,1.0,0.98,,0.0,0.02
7184,4667926,1482355,30000,70,0.99,0.7326,0.26,,
15776,4526396,1498214,11615,39,1.0,1.0,,,
12506,4667295,1493485,3500,10,1.0,0.98,,0.0,0.02
10833,4656166,1477271,30000,168,0.99,0.7326,0.26,,
3765,2887018,1462543,20700,56,0.99,0.8415,0.15,,
7828,4004265,1464936,8050,10,1.0,0.98,,0.0,0.02


- cid - id клиента
- lid - id займа
- value - сумма займа
- term - запрошенный срок займа
- before_dicount - ставка до скидки
- after_discount - ставка после скидки
- pc_discount - скидка по промокоду
- loyalty_level - уровень программы лояльности
- ll_discount - скидка по программе лояльности

In [4]:
# посмотрим на общую информацию таблицы
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cid             30000 non-null  int64  
 1   lid             30000 non-null  int64  
 2   value           30000 non-null  int64  
 3   term            30000 non-null  int64  
 4   before_dicount  30000 non-null  float64
 5   after_discount  30000 non-null  float64
 6   pc_discount     9156 non-null   float64
 7   loyalty_level   18736 non-null  float64
 8   ll_discount     18736 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 2.1 MB


In [5]:
# проверим таблицу на наличие дубликатов
data.duplicated().sum()

0

In [6]:
# проверим таблицу на наличие пропусков
data.isnull().sum()

cid                   0
lid                   0
value                 0
term                  0
before_dicount        0
after_discount        0
pc_discount       20844
loyalty_level     11264
ll_discount       11264
dtype: int64

<div class="alert alert-block alert-info">
<ul>
    <li> в таблице 30 000 строк
    <li> каждая строка - информация об уникальном займе
    <li> в столбцах pc_discount, loyalty_level, ll_discount есть пропуски
</ul>
</div>

### 2. Исследование скидочных промокодов

#### 2.1 Общая информация

In [11]:
data['pc_discount'].value_counts()

0.15    7202
0.50     870
0.26     798
0.10     234
0.16      52
Name: pc_discount, dtype: int64

<div class="alert alert-block alert-info">
<ul>
    <li> 20 844 займа были оформлены без использования скидочных промокодов
    <li> всего с промокодами было оформлено 9 156 займов
    <li> большинство из них со скидкой в 15% (7 202)
</ul>
</div>

In [13]:
data['pc_discount'].describe()

count    9156.000000
mean        0.191623
std         0.105013
min         0.100000
25%         0.150000
50%         0.150000
75%         0.150000
max         0.500000
Name: pc_discount, dtype: float64

<div class="alert alert-block alert-info">
<ul>
    <li> минимальное значение 10%, максимальное значение 50%
    <li> среднее знаение (mean) 19.16%
    <li> медиана (50%) 15%
    <li> большинство из них со скидкой в 15% (7 202)
</ul>
</div>

#### 2.2 Средневзвешенная скидка

In [25]:
data['pcOnlyDiscount'] = data['before_dicount'] - data['pc_discount']
data['pcValue'] = (1 + (data['term'] * data['pcOnlyDiscount']) / 100) * data['value']
data

Unnamed: 0,cid,lid,value,term,before_dicount,after_discount,pc_discount,loyalty_level,ll_discount,pcOnlyDiscount,pcW
0,3407919,1461721,8700,30,1.00,0.8600,,7.0,0.14,,
1,1224622,1461738,13900,30,1.00,0.8000,,10.0,0.20,,
2,4624067,1452680,17250,56,0.99,0.8415,0.15,,,0.84,25364.40
3,4567936,1460449,5000,12,1.00,0.9800,,0.0,0.02,,
4,4646053,1470745,3200,39,1.00,1.0000,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
29995,4583460,1509829,20700,56,0.99,0.8415,0.15,,,0.84,30437.28
29996,4493671,1510229,4000,30,1.00,0.9600,,2.0,0.04,,
29997,110324,1510730,3000,10,1.00,0.8800,,6.0,0.12,,
29998,3537622,1511179,2000,40,1.00,0.9600,,2.0,0.04,,


In [10]:
discount['ll_discount'].fillna(0.0, inplace=True)
discount['d_rate'] = discount['before_dicount'] - discount['ll_discount']

discount.sample(10)

NameError: name 'discount' is not defined

In [None]:
discount['value_wo_discount'] = ((1 + (discount['term'] * discount['before_dicount']) / 100) * discount['value']).astype('int')

discount.sample(10)

In [None]:
discount['value_w_discount'] = ((1 + (discount['term'] * discount['d_rate']) / 100) * discount['value']).astype('int')

discount.sample(10)

In [None]:
(discount['value_wo_discount'].sum() - discount['value_w_discount'].sum()) / discount['value_wo_discount'].sum()

In [None]:
discount.query('ll_discount.notna()')['ll_discount'].mean()

In [None]:
discount.query('ll_discount.notna()')['loyalty_level'].value_counts()

In [None]:
discount.groupby(by='cid').agg({'lid': 'count'}).sort_values(by='lid', ascending=False)

In [None]:
discount.query('cid == 4488303')