# Создание рандомного датасета

In [1]:
# IPython shell escape (not plain Python): installs the Faker package that is imported below.
!pip install faker



In [2]:
import pandas as pd
from faker import Faker
from tqdm import tqdm
import itertools
import random
from datetime import date
from dateutil.relativedelta import relativedelta

## Генерация имен абонентов

In [3]:
# Faker instance used below to generate client names and states.
fake = Faker()

In [4]:
# Column buffers for the generated clients; each name gets its own empty list.
list_id, list_name, list_city, list_state = [], [], [], []

In [5]:
# Generate 1,000,000 clients with consecutive ids starting at 1.
# The id comes straight from the range instead of a hand-maintained counter.
# Name and state are drawn in the same iteration, so Faker is still called in
# the order name, state, name, state, ...
# list_city is not populated by this loop.
for client_id in tqdm(range(1, 1000001)):
    list_id.append(client_id)
    list_name.append(fake.name())
    list_state.append(fake.state())

100%|██████████| 1000000/1000000 [02:08<00:00, 7802.21it/s]


In [6]:
# Assemble the clients table row by row from the three parallel lists.
clients = pd.DataFrame(
    zip(list_id, list_name, list_state),
    columns=['id', 'name', 'state'],
)

In [7]:
# Preview the first 5 rows of the clients table.
clients.head(5)

Unnamed: 0,id,name,state
0,1,Jason Mathews,Wyoming
1,2,Dakota Brown,Hawaii
2,3,Ashley Martinez,Missouri
3,4,Ronald Duarte,Delaware
4,5,James Moore,Alaska


## Генерация даты подключения абонента согласно маркетинговым акциям

In [8]:
# Акция 1 - апрель (продвижение тарифа middle)
# Акция 2 - август (продвижение тарифа junior)
# Акция 3 - декабрь (продвижение тарифа senior)

In [9]:
def generator_value(dict_val):
    """Pick one key of ``dict_val`` at random, weighted by its value.

    A key with weight ``w`` is returned with probability ``w / total``, where
    ``total`` is the sum of all weights. The draw is made directly from the
    weights, so no list of ``total`` elements is built on every call.

    Args:
        dict_val: Mapping of candidate value -> weight. A weight of zero or
            less means the candidate is never chosen.

    Returns:
        One of the keys of ``dict_val``.

    Raises:
        IndexError: If ``dict_val`` is empty or no key has a positive weight.
    """
    candidates = list(dict_val)
    # A negative weight counts as zero: the key can never be drawn.
    weights = [max(dict_val[key], 0) for key in candidates]
    if not any(weights):
        raise IndexError('Cannot choose from an empty sequence')
    return random.choices(candidates, weights=weights)[0]

In [10]:
# Connection-month weights, keyed by month number as a string.
# Every month starts at weight 1; April, August and December are then raised
# to create the peaks.
dict_month_connection = {str(month): 1 for month in range(1, 13)}
dict_month_connection.update({'4': 3, '8': 4, '12': 3})

In [11]:
# Give every client a connection date: the 1st of a weighted-random month of 2020.
# The client id itself is ignored; it only drives one draw per row.
clients['date_con'] = clients['id'].apply(
    lambda _client_id: date(
        year=2020,
        month=int(generator_value(dict_month_connection)),
        day=1,
    )
)

In [12]:
# Preview the first 7 rows of the clients table.
clients.head(7)

Unnamed: 0,id,name,state,date_con
0,1,Jason Mathews,Wyoming,2020-01-01
1,2,Dakota Brown,Hawaii,2020-12-01
2,3,Ashley Martinez,Missouri,2020-07-01
3,4,Ronald Duarte,Delaware,2020-04-01
4,5,James Moore,Alaska,2020-11-01
5,6,Victoria Potter,Arizona,2020-09-01
6,7,Nicole Gordon,Massachusetts,2020-04-01


In [13]:
# Count how many clients were assigned each connection date.
clients.date_con.value_counts()

2020-08-01    210111
2020-04-01    158532
2020-12-01    158139
2020-11-01     52941
2020-02-01     52820
2020-01-01     52796
2020-09-01     52738
2020-07-01     52644
2020-03-01     52584
2020-05-01     52400
2020-06-01     52355
2020-10-01     51940
Name: date_con, dtype: int64

## Установка первоначального тарифа

In [14]:
# Tariff weights outside promo months.
dict_tariff_plan_common = dict(senior=1, middle=2, junior=3)
# Tariff weights for the August promo ("Everyone studies").
dict_tariff_plan_august = dict(senior=1, middle=1, junior=3)
# Tariff weights for the April promo ("You have grown into a middle").
dict_tariff_plan_april = dict(senior=1, middle=2, junior=1)
# Tariff weights for the December promo ("Into winter as a senior").
dict_tariff_plan_december = dict(senior=2, middle=1, junior=1)

In [15]:
def generator_value_tariff(val_date):
    """Draw an initial tariff plan for a client connected on ``val_date``.

    Connection dates equal to 2020-04-01, 2020-08-01 or 2020-12-01 use the
    weights of the matching promo; any other date uses the common weights.

    Args:
        val_date: Connection date, compared by equality with the promo dates.

    Returns:
        The tariff name drawn by ``generator_value``.
    """
    promo_weights = (
        (date(2020, 4, 1), dict_tariff_plan_april),
        (date(2020, 8, 1), dict_tariff_plan_august),
        (date(2020, 12, 1), dict_tariff_plan_december),
    )
    for promo_date, weights in promo_weights:
        if val_date == promo_date:
            return generator_value(weights)
    # No promo on this date.
    return generator_value(dict_tariff_plan_common)

In [16]:
# Draw the initial tariff for every client from its connection date.
clients['tariff_plan_con'] = clients['date_con'].apply(generator_value_tariff)

In [17]:
# Preview the first 7 rows of the clients table.
clients.head(7)

Unnamed: 0,id,name,state,date_con,tariff_plan_con
0,1,Jason Mathews,Wyoming,2020-01-01,senior
1,2,Dakota Brown,Hawaii,2020-12-01,senior
2,3,Ashley Martinez,Missouri,2020-07-01,junior
3,4,Ronald Duarte,Delaware,2020-04-01,middle
4,5,James Moore,Alaska,2020-11-01,senior
5,6,Victoria Potter,Arizona,2020-09-01,junior
6,7,Nicole Gordon,Massachusetts,2020-04-01,middle


In [18]:
# Count how many clients start on each tariff plan.
clients.tariff_plan_con.value_counts()

junior    442449
middle    317916
senior    239635
Name: tariff_plan_con, dtype: int64

## Смена тарифного плана

In [19]:
# Tariff-change weights: key is the number of months after connection,
# the empty-string key is the "no tariff change" outcome.
# NOTE(review): the 'а' in "tаriff" in this identifier appears to be a Cyrillic
# letter rather than Latin 'a'; every reference must use the identical
# spelling - confirm before renaming.
dict_month_change_tаriff_plan = {'':10,'1':3,'2':2,'3':1}

In [20]:
def generator_value_date_change_tariff_plan(val_date):
    """Draw the date on which a client changes tariff plan, if any.

    Args:
        val_date: The client's connection date.

    Returns:
        ``val_date`` shifted forward by the drawn number of months, or
        ``None`` when the empty-string key is drawn.
    """
    months_drawn = generator_value(dict_month_change_tаriff_plan)
    if months_drawn == '':
        # Empty key drawn: this client gets no change date.
        return None
    return val_date + relativedelta(months=int(months_drawn))

In [21]:
# Draw an optional tariff-change date for every client from its connection date.
clients['date_change_tariff_plan'] = clients['date_con'].apply(
    generator_value_date_change_tariff_plan
)

In [22]:
# Preview the first 7 rows of the clients table.
clients.head(7)

Unnamed: 0,id,name,state,date_con,tariff_plan_con,date_change_tariff_plan
0,1,Jason Mathews,Wyoming,2020-01-01,senior,
1,2,Dakota Brown,Hawaii,2020-12-01,senior,2021-03-01
2,3,Ashley Martinez,Missouri,2020-07-01,junior,
3,4,Ronald Duarte,Delaware,2020-04-01,middle,
4,5,James Moore,Alaska,2020-11-01,senior,
5,6,Victoria Potter,Arizona,2020-09-01,junior,2020-10-01
6,7,Nicole Gordon,Massachusetts,2020-04-01,middle,2020-05-01


In [23]:
# Count how many clients were assigned each tariff-change date.
clients.date_change_tariff_plan.value_counts()

2020-09-01    49108
2021-01-01    39803
2020-05-01    39560
2020-10-01    39222
2020-06-01    32780
2020-11-01    29670
2020-07-01    26320
2021-02-01    23155
2020-08-01    19726
2020-04-01    19719
2020-12-01    19676
2020-03-01    16382
2021-03-01     9864
2020-02-01     9854
Name: date_change_tariff_plan, dtype: int64

In [24]:
# Tariff change rules: senior -> senior, middle -> senior, junior -> middle
def generator_value_change_tariff_plan_(name_tariff):
    """Return the tariff a client moves to from ``name_tariff``.

    Args:
        name_tariff: Current tariff name: 'senior', 'middle' or 'junior'.

    Returns:
        'senior' for 'senior' and 'middle', 'middle' for 'junior'.

    Raises:
        ValueError: If ``name_tariff`` is not one of the three known tariffs.
            Without this check the function would fail on an unbound local.
    """
    next_tariff = {
        'senior': 'senior',
        'middle': 'senior',
        'junior': 'middle',
    }
    try:
        return next_tariff[name_tariff]
    except (KeyError, TypeError):
        # TypeError covers unhashable input, which cannot be a tariff name either.
        raise ValueError(
            'unknown tariff plan: {!r}'.format(name_tariff)
        ) from None

In [25]:
def generator_value_change_tariff_plan(row):
    """Return the tariff a client switches to, if a change date is set.

    Args:
        row: Client record supporting ``row[...]`` lookup with the keys
            'date_change_tariff_plan' and 'tariff_plan_con'.

    Returns:
        The new tariff name, or ``None`` when the client has no change date.
    """
    # pd.isna treats None, NaN and NaT alike, so a missing date is recognised
    # even when pandas has replaced None with its own missing-value marker.
    if pd.isna(row['date_change_tariff_plan']):
        return None
    return generator_value_change_tariff_plan_(row['tariff_plan_con'])

In [26]:
# Row-wise: derive the new tariff for every client that has a change date.
clients['tariff_plan_change'] = clients.apply(
    generator_value_change_tariff_plan,
    axis=1,
)

In [27]:
# Preview the first 15 rows of the clients table.
clients.head(15)

Unnamed: 0,id,name,state,date_con,tariff_plan_con,date_change_tariff_plan,tariff_plan_change
0,1,Jason Mathews,Wyoming,2020-01-01,senior,,
1,2,Dakota Brown,Hawaii,2020-12-01,senior,2021-03-01,senior
2,3,Ashley Martinez,Missouri,2020-07-01,junior,,
3,4,Ronald Duarte,Delaware,2020-04-01,middle,,
4,5,James Moore,Alaska,2020-11-01,senior,,
5,6,Victoria Potter,Arizona,2020-09-01,junior,2020-10-01,middle
6,7,Nicole Gordon,Massachusetts,2020-04-01,middle,2020-05-01,senior
7,8,Mr. Kenneth Carrillo,North Dakota,2020-08-01,senior,,
8,9,Michelle Morse,Virginia,2020-12-01,senior,2021-01-01,senior
9,10,Kristin Kim,New Jersey,2020-09-01,junior,,


In [28]:
# Count how many clients switched to each tariff plan.
clients.tariff_plan_change.value_counts()

senior    208956
middle    165883
Name: tariff_plan_change, dtype: int64

## Генерация даты возможного ухода абонента

In [29]:
# Логика ухода абонентов. 
# Больше всего людей "отваливаются" либо сразу после регистрации, либо ближе к году использования сервиса
# Практически нет оттока с 3 по 6 месяц
# Есть абоненты, которые продолжают платить и после года использования сервиса

In [30]:
# Churn weights: key is the number of months of service before the client
# leaves; the empty-string key is the "does not leave" outcome.
dict_count_month_before_discon = {
    # Elevated churn in the first two months.
    '1': 3,
    '2': 2,
    # Lowest weights for months 3 to 6.
    '3': 1,
    '4': 1,
    '5': 1,
    '6': 1,
    # Weights grow steadily towards one year of service.
    '7': 2,
    '8': 3,
    '9': 4,
    '10': 5,
    '11': 6,
    '12': 7,
    # No disconnection date.
    '': 3,
}

In [31]:
def generator_value_date_discon(row):
    """Draw the date on which a client leaves the service, if any.

    The drawn number of months is counted from the tariff-change date when
    the client has one, otherwise from the connection date.

    Args:
        row: Client record supporting ``row[...]`` lookup with the keys
            'date_con' and 'date_change_tariff_plan'.

    Returns:
        The disconnection date, or ``None`` when the empty-string key is drawn.
    """
    months_drawn = generator_value(dict_count_month_before_discon)
    if months_drawn == '':
        # Empty key drawn: this client gets no disconnection date.
        return None
    date_change = row['date_change_tariff_plan']
    # pd.isna treats None, NaN and NaT alike; a bare `is not None` check would
    # let NaN/NaT through and fail when the month offset is added.
    start = row['date_con'] if pd.isna(date_change) else date_change
    return start + relativedelta(months=int(months_drawn))

In [32]:
# Row-wise: draw an optional disconnection date for every client.
clients['date_discon'] = clients.apply(generator_value_date_discon, axis=1)

In [33]:
# Preview the first 7 rows of the clients table.
clients.head(7)

Unnamed: 0,id,name,state,date_con,tariff_plan_con,date_change_tariff_plan,tariff_plan_change,date_discon
0,1,Jason Mathews,Wyoming,2020-01-01,senior,,,2020-02-01
1,2,Dakota Brown,Hawaii,2020-12-01,senior,2021-03-01,senior,2021-11-01
2,3,Ashley Martinez,Missouri,2020-07-01,junior,,,
3,4,Ronald Duarte,Delaware,2020-04-01,middle,,,2020-11-01
4,5,James Moore,Alaska,2020-11-01,senior,,,2020-12-01
5,6,Victoria Potter,Arizona,2020-09-01,junior,2020-10-01,middle,2021-10-01
6,7,Nicole Gordon,Massachusetts,2020-04-01,middle,2020-05-01,senior,2021-04-01


In [34]:
# Count how many clients were assigned each disconnection date.
clients.date_discon.value_counts()

2021-04-01    72658
2021-03-01    67566
2021-08-01    67162
2021-07-01    64695
2021-02-01    64399
2021-05-01    62929
2021-06-01    62073
2021-01-01    61729
2021-09-01    50611
2020-12-01    46278
2021-10-01    43910
2021-11-01    37811
2020-11-01    37224
2021-12-01    31387
2020-10-01    31294
2020-09-01    27257
2020-08-01    16746
2020-07-01    15112
2020-06-01    14730
2020-05-01    13703
2022-01-01    11959
2020-04-01     6822
2022-02-01     5685
2020-03-01     5013
2020-02-01     2597
2022-03-01     1718
Name: date_discon, dtype: int64

In [36]:
# Write the generated dataset to dataset_clients.csv without the DataFrame index column.
clients.to_csv('dataset_clients.csv', index=False)