In [280]:

import math
import os
from datetime import datetime
from datetime import timedelta

import faker
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from faker import Faker
from rdt.transformers.categorical import LabelEncoder
from sdv.constraints import create_custom_constraint_class
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.metadata import MultiTableMetadata
from sdv.metadata import SingleTableMetadata
from sdv.multi_table import HMASynthesizer
from sdv.sampling import Condition
from sdv.single_table import GaussianCopulaSynthesizer
from sqlalchemy import create_engine, MetaData, text


In [281]:
# Let wide frames render up to 40 columns before pandas elides the middle ones.
pd.options.display.max_columns = 40


In [282]:
# Connection settings for the remote `db_mentor` database.
# The password and server address are read from environment variables so
# that no secret is stored in the notebook; os.environ[...] raises KeyError
# immediately when a required variable is not set.
# NOTE(review): the next cell rebinds `db_config`, `connection_string` and
# `engine` to a local database, so this engine is only in effect when that
# cell is skipped.
db_config = {'user': os.environ.get('MENTOR_DB_USER', 'mentor'),
             'pwd': os.environ['MENTOR_DB_PASSWORD'],
             'host': os.environ['MENTOR_DB_HOST'],
             'port': int(os.environ.get('MENTOR_DB_PORT', 5432)),
             'db': os.environ.get('MENTOR_DB_NAME', 'db_mentor')}

connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
                                                         db_config['pwd'],
                                                         db_config['host'],
                                                         db_config['port'],
                                                         db_config['db'])
engine = create_engine(connection_string)


In [283]:
# Connection settings for the local PostgreSQL database the notebook reads
# its source tables from (`engine` is used by every read_sql_query below).
# The password comes from the environment instead of being hardcoded;
# os.environ[...] raises KeyError immediately when it is not set.
db_config = {'user': os.environ.get('LOCAL_PG_USER', 'postgres'),
             'pwd': os.environ['LOCAL_PG_PASSWORD'],
             'host': os.environ.get('LOCAL_PG_HOST', 'localhost'),
             'port': int(os.environ.get('LOCAL_PG_PORT', 5432)),
             'db': os.environ.get('LOCAL_PG_DB', 'test_db')}

connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
                                                         db_config['pwd'],
                                                         db_config['host'],
                                                         db_config['port'],
                                                         db_config['db'])
engine = create_engine(connection_string)


In [284]:
# Load every row and column of the `domain` table through `engine`.
sql_domain = '''
select  *
from    domain
'''

domain = pd.read_sql_query(sql_domain, con=engine)


In [285]:
# Load every row and column of the `region` table through `engine`.
sql_region = '''
select  *
from    region
'''

region = pd.read_sql_query(sql_region, con=engine)


In [286]:
# Load every row and column of the `users` table through `engine`.
# Later cells read its `user_id`, `reg_date`, `role` and `region_id` columns.
sql_users = '''
select  *
from    users
'''

users = pd.read_sql_query(sql_users, con=engine)


In [287]:
# Load every row and column of the `sessions` table through `engine`.
# Later cells read its `mentor_id` and `mentee_id` columns to join to `users`.
sql_sessions = '''
select  *
from    sessions
'''

sessions = pd.read_sql_query(sql_sessions, con=engine)


In [288]:
# Users that never appear in `sessions`, neither as mentor nor as mentee.
# .copy() detaches the result from `users`: later cells assign to its
# `user_id` column, and doing that on a bare boolean-mask selection makes
# pandas emit SettingWithCopyWarning.
users_without_sessions = users[(~users['user_id'].isin(sessions['mentor_id'])) &
                               (~users['user_id'].isin(sessions['mentee_id']))].copy()


In [289]:
# Step 1: attach the mentor's user record to each session (left join keeps
# every session row).
sessions_with_mentor = sessions.copy().merge(
    users.copy(),
    how='left',
    left_on='mentor_id',
    right_on='user_id')

# Step 2: attach the mentee's user record. The user columns now exist on both
# sides, so they receive the `_mentor` (left) / `_mentee` (right) suffixes.
sessions_with_both_users = sessions_with_mentor.merge(
    users.copy(),
    how='left',
    left_on='mentee_id',
    right_on='user_id',
    suffixes=('_mentor', '_mentee'))

# The join keys and roles are redundant once both records are attached.
redundant_columns = ['user_id_mentor', 'user_id_mentee', 'role_mentor', 'role_mentee']
sessions_and_users = sessions_with_both_users.drop(redundant_columns, axis=1)

region_name_renames = {'name_r_mentor': 'mentor_region', 'name_r_mentee': 'mentee_region'}
sessions_and_users = sessions_and_users.rename(columns=region_name_renames)

# Shift the two ID families into separate numeric ranges.
sessions_and_users['mentee_id'] = sessions_and_users['mentee_id'] + 20000
sessions_and_users['mentor_id'] = sessions_and_users['mentor_id'] + 30000

# Keep only the columns the synthesizer is trained on.
kept_columns = [
    'session_id', 'session_date_time', 'session_status', 'mentor_domain_id',
    'mentor_id', 'reg_date_mentor', 'region_id_mentor',
    'mentee_id', 'reg_date_mentee', 'region_id_mentee',
]
sessions_and_users = sessions_and_users[kept_columns]


In [290]:
# Column order used for the training frame: session fields, then mentor,
# mentee, and finally both region IDs.
columns1 = [
    'session_id',
    'session_date_time',
    'session_status',
    'mentor_domain_id',
    'mentor_id',
    'reg_date_mentor',
    'mentee_id',
    'reg_date_mentee',
    'region_id_mentor',
    'region_id_mentee',
]

sessions_and_users1 = sessions_and_users.loc[:, columns1]


In [291]:
# Target dtypes, grouped by kind rather than listed column by column.
date_columns = ['session_date_time', 'reg_date_mentor', 'reg_date_mentee']
user_id_columns = ['mentor_id', 'mentee_id']
small_id_columns = ['mentor_domain_id', 'region_id_mentor', 'region_id_mentee']

target_dtypes = {
    'session_id': 'int64',
    'session_status': 'object',
    **dict.fromkeys(date_columns, 'datetime64[ns]'),
    **dict.fromkeys(user_id_columns, 'int32'),
    **dict.fromkeys(small_id_columns, 'int16'),
}

sessions_and_users1 = sessions_and_users1.astype(target_dtypes)


In [292]:
# Renumber mentors consecutively from 30001 in order of registration date.
unique_mentor_id = sessions_and_users1.sort_values(by='reg_date_mentor')['mentor_id'].unique()
mentor_id_map = {
    old_id: new_id for new_id, old_id in enumerate(unique_mentor_id, 30001)
}
sessions_and_users1['mentor_id'] = sessions_and_users1['mentor_id'].map(mentor_id_map)

# Same renumbering for mentees, starting at 20001.
unique_mentee_id = sessions_and_users1.sort_values(by='reg_date_mentee')['mentee_id'].unique()
mentee_id_map = {
    old_id: new_id for new_id, old_id in enumerate(unique_mentee_id, 20001)
}
sessions_and_users1['mentee_id'] = sessions_and_users1['mentee_id'].map(mentee_id_map)


In [293]:
# Print dtypes and non-null counts of the training frame after the casts and
# the ID renumbering (the recorded run shows the remapped IDs as int64).
sessions_and_users1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6524 entries, 0 to 6523
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   session_id         6524 non-null   int64         
 1   session_date_time  6524 non-null   datetime64[ns]
 2   session_status     6524 non-null   object        
 3   mentor_domain_id   6524 non-null   int16         
 4   mentor_id          6524 non-null   int64         
 5   reg_date_mentor    6524 non-null   datetime64[ns]
 6   mentee_id          6524 non-null   int64         
 7   reg_date_mentee    6524 non-null   datetime64[ns]
 8   region_id_mentor   6524 non-null   int16         
 9   region_id_mentee   6524 non-null   int16         
dtypes: datetime64[ns](3), int16(3), int64(3), object(1)
memory usage: 395.1+ KB


In [294]:
# Per-column flag: True when the column contains at least one missing value.
sessions_and_users1.isna().any(axis=0)


session_id           False
session_date_time    False
session_status       False
mentor_domain_id     False
mentor_id            False
reg_date_mentor      False
mentee_id            False
reg_date_mentee      False
region_id_mentor     False
region_id_mentee     False
dtype: bool

In [295]:
# Start from auto-detected metadata, then override the columns listed below.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=sessions_and_users1)

# column name -> keyword arguments for update_column, applied in this order.
column_overrides = {
    'session_id': {'sdtype': 'id', 'regex_format': '[0-9]{6}'},
    'mentor_domain_id': {'sdtype': 'numerical', 'computer_representation': 'Int64'},
    'mentor_id': {'sdtype': 'numerical', 'computer_representation': 'Int32'},
    'mentee_id': {'sdtype': 'numerical', 'computer_representation': 'Int32'},
    'region_id_mentor': {'sdtype': 'numerical', 'computer_representation': 'Int64'},
    'region_id_mentee': {'sdtype': 'numerical', 'computer_representation': 'Int64'},
    'session_date_time': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
    'reg_date_mentor': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
    'reg_date_mentee': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
}

for overridden_column, column_settings in column_overrides.items():
    metadata.update_column(column_name=overridden_column, **column_settings)


In [296]:
# Sanity-check the metadata assembled above; the recorded run produced no output.
metadata.validate()


In [297]:
# Model `session_date_time` with a uniform marginal; the parameter dump in the
# next cell shows the remaining columns fall back to the default ('beta').
distributions = dict(session_date_time='uniform')

synthesizer_settings = {
    'enforce_min_max_values': True,
    'numerical_distributions': distributions,
}
synthesizer_1 = GaussianCopulaSynthesizer(metadata, **synthesizer_settings)


In [298]:
# Show the effective synthesizer parameters, including defaults not set above.
synthesizer_1.get_parameters()


{'enforce_min_max_values': True,
 'enforce_rounding': True,
 'locales': None,
 'numerical_distributions': {'session_date_time': 'uniform'},
 'default_distribution': 'beta'}

In [299]:
# NOTE(review): disabled experiment. The code below is wrapped in a string
# literal, so none of it runs; the cell only echoes the text. It sketches three
# constraints for synthesizer_1 (an upper bound on session_date_time and
# "registration date <= session date" for mentor and mentee). The next cells
# do not add them: get_constraints() further down returns [].
'''my_constraint_sessions_date_time = {
    'constraint_class': 'ScalarInequality',
    'constraint_parameters': {
        'column_name': 'session_date_time',
        'relation': '<=',
        'value':  '2022-09-15'
    }
}

synthesizer_1.add_constraints(constraints=[
    my_constraint_sessions_date_time
])

my_constraint_dates_mentor = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'reg_date_mentor',
        'high_column_name': 'session_date_time',
        'strict_boundaries': False
    }
}

synthesizer_1.add_constraints(constraints=[
    my_constraint_dates_mentor
])


my_constraint_dates_mentee = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'reg_date_mentee',
        'high_column_name': 'session_date_time',
        'strict_boundaries': False
    }
}

synthesizer_1.add_constraints(constraints=[
    my_constraint_dates_mentee
])
'''


"my_constraint_sessions_date_time = {\n    'constraint_class': 'ScalarInequality',\n    'constraint_parameters': {\n        'column_name': 'session_date_time',\n        'relation': '<=',\n        'value':  '2022-09-15'\n    }\n}\n\nsynthesizer_1.add_constraints(constraints=[\n    my_constraint_sessions_date_time\n])\n\nmy_constraint_dates_mentor = {\n    'constraint_class': 'Inequality',\n    'constraint_parameters': {\n        'low_column_name': 'reg_date_mentor',\n        'high_column_name': 'session_date_time',\n        'strict_boundaries': False\n    }\n}\n\nsynthesizer_1.add_constraints(constraints=[\n    my_constraint_dates_mentor\n])\n\n\nmy_constraint_dates_mentee = {\n    'constraint_class': 'Inequality',\n    'constraint_parameters': {\n        'low_column_name': 'reg_date_mentee',\n        'high_column_name': 'session_date_time',\n        'strict_boundaries': False\n    }\n}\n\nsynthesizer_1.add_constraints(constraints=[\n    my_constraint_dates_mentee\n])\n"

In [300]:
# NOTE(review): disabled earlier draft of the condition-building cell. It is
# wrapped in a string literal, so nothing here runs; the cell only echoes the
# text. It differs from the active version in its third group, which
# conditions on `session_date_time` alone.
'''conditions = []

mentor_conditions = ['mentor_id', 'reg_date_mentor', 'mentor_domain_id']

df2 = pd.DataFrame(sessions_and_users1.value_counts(mentor_conditions, sort=False)).reset_index()

for i, row in df2.iterrows():
    cond = Condition(
        num_rows=row[3],
        column_values={
            mentor_conditions[0]: row[mentor_conditions[0]],
            mentor_conditions[1]: row[mentor_conditions[1]],
            mentor_conditions[2]: row[mentor_conditions[2]]}
    )
    conditions.append(cond)

mentee_conditions = ['mentee_id', 'reg_date_mentee']

df1 = pd.DataFrame(sessions_and_users1.value_counts(mentee_conditions, sort=False)).reset_index()

for i, row in df1.iterrows():
    cond = Condition(
        num_rows=row[2],
        column_values={mentee_conditions[0]: row[mentee_conditions[0]], mentee_conditions[1]: row[mentee_conditions[1]]}
    )
    conditions.append(cond)


df_ses = pd.DataFrame(sessions_and_users1.value_counts(subset=['session_date_time'], sort=False)).reset_index()

for i, row in df_ses.iterrows():
    cond = Condition(
        num_rows=row[1],
        column_values={'session_date_time': row['session_date_time']}
    )
    conditions.append(cond)'''


"conditions = []\n\nmentor_conditions = ['mentor_id', 'reg_date_mentor', 'mentor_domain_id']\n\ndf2 = pd.DataFrame(sessions_and_users1.value_counts(mentor_conditions, sort=False)).reset_index()\n\nfor i, row in df2.iterrows():\n    cond = Condition(\n        num_rows=row[3],\n        column_values={\n            mentor_conditions[0]: row[mentor_conditions[0]],\n            mentor_conditions[1]: row[mentor_conditions[1]],\n            mentor_conditions[2]: row[mentor_conditions[2]]}\n    )\n    conditions.append(cond)\n\nmentee_conditions = ['mentee_id', 'reg_date_mentee']\n\ndf1 = pd.DataFrame(sessions_and_users1.value_counts(mentee_conditions, sort=False)).reset_index()\n\nfor i, row in df1.iterrows():\n    cond = Condition(\n        num_rows=row[2],\n        column_values={mentee_conditions[0]: row[mentee_conditions[0]], mentee_conditions[1]: row[mentee_conditions[1]]}\n    )\n    conditions.append(cond)\n\n\ndf_ses = pd.DataFrame(sessions_and_users1.value_counts(subset=['session_dat

In [301]:
def _conditions_for(frame, columns):
    """Build one SDV ``Condition`` per distinct value combination of ``columns``.

    Args:
        frame: DataFrame whose rows are counted.
        columns: list of column names that together define a combination.

    Returns:
        A list of ``Condition`` objects, one per combination observed in
        ``frame``. Each fixes ``columns`` to that combination and requests as
        many rows as ``frame`` contains for it.
    """
    # Give the count column an explicit name: value_counts() labels it 0 or
    # 'count' depending on the pandas version, and fetching it by position
    # (`row[3]`) from a label-indexed Series is deprecated in pandas 2.x.
    combo_counts = (
        frame.value_counts(columns, sort=False)
        .rename('num_rows')
        .reset_index()
    )
    return [
        Condition(
            num_rows=int(record['num_rows']),
            column_values={column: record[column] for column in columns})
        for record in combo_counts.to_dict('records')
    ]


# Three independent groups of conditions, each built from all rows of the
# real data: mentor attributes, mentee attributes, and the three dates.
mentor_conditions = ['mentor_id', 'reg_date_mentor', 'mentor_domain_id']
mentee_conditions = ['mentee_id', 'reg_date_mentee']
dates_conditions = ['session_date_time', 'reg_date_mentor', 'reg_date_mentee']

# Order matters for the sampled output: mentor group, then mentee, then dates.
conditions = []
for condition_columns in (mentor_conditions, mentee_conditions, dates_conditions):
    conditions.extend(_conditions_for(sessions_and_users1, condition_columns))


In [302]:
# Peek at the first rows of the training frame before fitting.
sessions_and_users1.head(3)


Unnamed: 0,session_id,session_date_time,session_status,mentor_domain_id,mentor_id,reg_date_mentor,mentee_id,reg_date_mentee,region_id_mentor,region_id_mentee
0,1,2021-02-12,finished,5,30011,2021-01-12,20115,2021-02-05,3,3
1,2,2021-02-17,finished,7,30027,2021-01-31,20108,2021-02-02,6,4
2,3,2021-02-19,finished,5,30019,2021-01-21,20130,2021-02-08,1,2


In [303]:
# Fit the copula model on the joined sessions/users frame.
synthesizer_1.fit(sessions_and_users1)


In [304]:
# List the constraints registered on synthesizer_1; the recorded run returned
# [] because the constraint definitions above are disabled.
synthesizer_1.get_constraints()


[]

In [305]:
# NOTE(review): disabled alternative (unconditional sampling of twice the real
# row count). It is a string literal, so it does not run; the cell only echoes it.
'''synthetic_data_with_conditions = synthesizer_1.sample(num_rows=len(sessions_and_users1)*2)'''


'synthetic_data_with_conditions = synthesizer_1.sample(num_rows=len(sessions_and_users1)*2)'

In [306]:
# Sample rows for every Condition built above. Each of the three condition
# groups covers all real rows, so the recorded run requested
# 19572 = 3 x 6524 rows and took about eight minutes.
synthetic_data_with_conditions = synthesizer_1.sample_from_conditions(
    conditions=conditions
)


Sampling conditions: 100%|██████████| 19572/19572 [07:56<00:00, 41.11it/s]


In [307]:


# Drop synthetic sessions dated before the mentor or the mentee registered.
# Vectorized replacement for a row-by-row loop that tagged offending rows in a
# temporary 'Error' column. The mask is phrased as "session < registration" and
# then negated so that rows with a missing date are kept, exactly as before.
session_dates = synthetic_data_with_conditions['session_date_time']
predates_registration = (
    (session_dates < synthetic_data_with_conditions['reg_date_mentor'])
    | (session_dates < synthetic_data_with_conditions['reg_date_mentee'])
)
synthetic_data_with_conditions = synthetic_data_with_conditions[~predates_registration].copy()


In [308]:
# Compare the filtered synthetic sessions with the real training frame.
# Recorded run: overall score 93.78%.
quality_report = evaluate_quality(
    real_data=sessions_and_users1,
    synthetic_data=synthetic_data_with_conditions,
    metadata=metadata)


Creating report: 100%|██████████| 4/4 [00:00<00:00, 24.02it/s]



Overall Quality Score: 93.78%

Properties:
Column Shapes: 91.86%
Column Pair Trends: 95.71%


In [309]:
# Plot real vs. synthetic distribution for every column. Columns for which
# plotting raises ValueError are still skipped, but the skip is now reported
# instead of being swallowed silently.
for col in synthetic_data_with_conditions.columns:
    try:
        fig = get_column_plot(
            real_data=sessions_and_users1,
            synthetic_data=synthetic_data_with_conditions,
            column_name=col, metadata=metadata)

        fig.show()
    except ValueError as plot_error:
        print(f'Skipping plot for column {col!r}: {plot_error}')
        continue


In [310]:
# Session-level columns only, ordered by session date and renumbered 1..N.
session_table_columns = [
    'session_id', 'session_date_time', 'mentor_id', 'mentee_id', 'session_status', 'mentor_domain_id']
sessions_synthetic = (
    synthetic_data_with_conditions[session_table_columns]
    .sort_values(by='session_date_time')
    .assign(session_id=lambda ordered: np.arange(1, len(ordered) + 1))
)


In [311]:
# One row per synthetic mentee, shaped like the `users` table: for each
# mentee_id the row that comes first after sorting by registration date is kept.
mentee_synthetic = (
    synthetic_data_with_conditions[['mentee_id', 'reg_date_mentee', 'region_id_mentee']]
    .assign(role='mentee')
    .loc[:, ['mentee_id', 'reg_date_mentee', 'role', 'region_id_mentee']]
    .sort_values(by='reg_date_mentee')
    .drop_duplicates(subset='mentee_id')
    .rename(columns={
        'mentee_id': 'user_id',
        'reg_date_mentee': 'reg_date',
        'region_id_mentee': 'region_id'})
)


In [312]:
# One row per synthetic mentor, shaped like the `users` table: for each
# mentor_id the row that comes first after sorting by registration date is kept.
mentor_synthetic = (
    synthetic_data_with_conditions[['mentor_id', 'reg_date_mentor', 'region_id_mentor']]
    .assign(role='mentor')
    .loc[:, ['mentor_id', 'reg_date_mentor', 'role', 'region_id_mentor']]
    .sort_values(by='reg_date_mentor')
    .drop_duplicates(subset='mentor_id')
    .rename(columns={
        'mentor_id': 'user_id',
        'reg_date_mentor': 'reg_date',
        'region_id_mentor': 'region_id'})
)


In [313]:
# Stack mentees on top of mentors; ignore_index gives the result a fresh 0..N-1 index.
users_with_sessions_synthetic = pd.concat([mentee_synthetic, mentor_synthetic], axis=0, ignore_index=True)


In [314]:
# Move the IDs of users without sessions into the 10000+ range.
# Re-running this cell shifts the IDs again; the next cell renumbers them
# from 10001 regardless of the offset.
users_without_sessions['user_id'] = users_without_sessions['user_id'].add(10000)


In [315]:
# Renumber users without sessions consecutively from 10001 in order of
# registration date.
unique_user_id = users_without_sessions.sort_values(by='reg_date')['user_id'].unique()

user_id_map = {
    old_id: new_id for new_id, old_id in enumerate(unique_user_id, 10001)
}

users_without_sessions['user_id'] = users_without_sessions['user_id'].map(user_id_map)


In [316]:
# Auto-detect metadata for the users-without-sessions frame, then override
# the three columns below.
metadata_user_without_sessions = SingleTableMetadata()
metadata_user_without_sessions.detect_from_dataframe(data=users_without_sessions)

# column name -> keyword arguments for update_column, applied in this order.
user_column_overrides = {
    'user_id': {'sdtype': 'id', 'regex_format': '1[0-9]{4}'},
    'region_id': {'sdtype': 'numerical', 'computer_representation': 'Int64'},
    'reg_date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
}

for overridden_column, column_settings in user_column_overrides.items():
    metadata_user_without_sessions.update_column(
        column_name=overridden_column, **column_settings)


In [317]:
# Sanity-check the metadata assembled above; the recorded run produced no output.
metadata_user_without_sessions.validate()


In [318]:
# Copula synthesizer for users without sessions; `reg_date` is modelled with a
# uniform marginal, every other setting is left at its default.
reg_date_distribution = {'reg_date': 'uniform'}
synthesizer_users_without_sessions = GaussianCopulaSynthesizer(
    metadata_user_without_sessions,
    numerical_distributions=reg_date_distribution)


In [319]:
# Upper bound on `region_id`. Despite the variable name, this applies to the
# users-without-sessions synthesizer, not to mentors.
my_constraint_region_id_mentor = dict(
    constraint_class='ScalarInequality',
    constraint_parameters=dict(
        column_name='region_id',
        relation='<=',
        value=19),
)
synthesizer_users_without_sessions.add_constraints(
    constraints=[my_constraint_region_id_mentor])

# Upper bound on `reg_date`. Despite the variable name, the constrained column
# is the registration date, not a session date.
my_constraint_sessions_date_time = dict(
    constraint_class='ScalarInequality',
    constraint_parameters=dict(
        column_name='reg_date',
        relation='<=',
        value='2022-09-15'),
)
synthesizer_users_without_sessions.add_constraints(
    constraints=[my_constraint_sessions_date_time])


In [320]:
# Fit the synthesizer (with the two constraints added above) on the real
# users that have no sessions.
synthesizer_users_without_sessions.fit(users_without_sessions)


In [321]:
# Show fit status and the SDV version used (recorded run: fitted with SDV 1.2.0).
synthesizer_users_without_sessions.get_info()


{'class_name': 'GaussianCopulaSynthesizer',
 'creation_date': '2023-07-13',
 'is_fit': True,
 'last_fit_date': '2023-07-13',
 'fitted_sdv_version': '1.2.0'}

In [322]:
# Sample as many synthetic users as there are real users without sessions
# (662 in the recorded run). The later date-alignment cell relies on the two
# frames having the same length.
users_without_sessions_synthetic = synthesizer_users_without_sessions.sample(
    num_rows=len(users_without_sessions))


Sampling rows: 100%|██████████| 662/662 [00:00<00:00, 16108.82it/s]


In [323]:
# Replace the sampled registration dates with the real ones, matched by rank:
# both frames are sorted by date and given a fresh 0..N-1 index, so the k-th
# earliest synthetic user receives the k-th earliest real registration date.
original_dates = (
    users_without_sessions
    .sort_values(by='reg_date')[['reg_date']]
    .reset_index(drop=True)
)
synthetic_by_date = (
    users_without_sessions_synthetic
    .rename(columns={'reg_date': 'reg_date_s'})
    .sort_values(by='reg_date_s')
    .reset_index(drop=True)
)
users_without_sessions_synthetic1 = (
    pd.concat([synthetic_by_date, original_dates], axis=1)
    .drop(columns='reg_date_s')
)
users_without_sessions_synthetic1


Unnamed: 0,user_id,role,region_id,reg_date
0,10661,mentee,5,2021-01-01
1,10199,mentee,9,2021-01-04
2,10513,mentee,2,2021-01-06
3,10206,mentee,1,2021-01-07
4,10207,mentee,1,2021-01-07
...,...,...,...,...
657,10168,mentee,4,2022-09-11
658,10170,mentee,2,2022-09-12
659,10039,mentee,3,2022-09-13
660,10153,mentee,1,2022-09-14


In [324]:
# Compare the date-aligned synthetic users with the real users without
# sessions. Recorded run: overall score 95.39%.
# Note: this rebinds `quality_report`, replacing the sessions report from earlier.
quality_report = evaluate_quality(
    real_data=users_without_sessions,
    synthetic_data=users_without_sessions_synthetic1,
    metadata=metadata_user_without_sessions)


Creating report: 100%|██████████| 4/4 [00:00<00:00, 317.00it/s]


Overall Quality Score: 95.39%

Properties:
Column Shapes: 96.12%
Column Pair Trends: 94.66%





In [325]:
# Plot real vs. synthetic distribution for every column of the frame that is
# actually plotted (the date-aligned one). Columns for which plotting raises
# ValueError are still skipped, but the skip is now reported instead of being
# swallowed silently.
for col in users_without_sessions_synthetic1.columns:
    try:
        fig = get_column_plot(
            real_data=users_without_sessions,
            synthetic_data=users_without_sessions_synthetic1,
            column_name=col,
            metadata=metadata_user_without_sessions
        )

        fig.show()
    except ValueError as plot_error:
        print(f'Skipping plot for column {col!r}: {plot_error}')
        continue


In [326]:
# Full synthetic `users` table: users with sessions followed by users without.
# The two frames list their columns in a different order; concat aligns them
# by column name.
users_synthetic = pd.concat(
    [users_with_sessions_synthetic, users_without_sessions_synthetic1],
    axis=0, ignore_index=True)


In [327]:
# Connection settings for the database that receives the synthetic tables.
# The password comes from the environment instead of being hardcoded;
# os.environ[...] raises KeyError immediately when it is not set.
db_config_copy = {'user': os.environ.get('LOCAL_PG_USER', 'postgres'),
                  'pwd': os.environ['LOCAL_PG_PASSWORD'],
                  'host': os.environ.get('LOCAL_PG_HOST', 'localhost'),
                  'port': int(os.environ.get('LOCAL_PG_PORT', 5432)),
                  'db': os.environ.get('LOCAL_PG_COPY_DB', 'db_mentor_copy_2')}

# The configured port is now part of the URL; previously it was defined in
# the dict but left out of the connection string.
connection_string_copy = 'postgresql://{}:{}@{}:{}/{}'.format(db_config_copy['user'],
                                                              db_config_copy['pwd'],
                                                              db_config_copy['host'],
                                                              db_config_copy['port'],
                                                              db_config_copy['db'])
engine_copy = create_engine(connection_string_copy)


In [328]:
# Target tables, in the order they are truncated and then reloaded below.
database_tables = ['region', 'users', 'domain', 'sessions']
# NOTE(review): `database_csv` is not used by any cell visible in this notebook.
database_csv = []


In [329]:
# Empty every target table before reloading it.
# Engine.execute() with a raw SQL string was removed in SQLAlchemy 2.0, so each
# statement is run through a connection, wrapped in text(). engine.begin()
# commits on success, giving one committed transaction per table.
# The table names come from the fixed list above, not from user input.
for table in database_tables:
    with engine_copy.begin() as connection:
        connection.execute(text(f'truncate {table} CASCADE'))
    print(f'{table} очищена!')


region очищена!
users очищена!
domain очищена!
sessions очищена!


In [330]:
# Frames to load, listed in the same order as `database_tables`; the load loop
# pairs the two lists by position. `region` and `domain` are the real lookup
# tables, `users` and `sessions` are synthetic.
new_db = [region, users_synthetic, domain, sessions_synthetic]


In [331]:
# Append each frame to its target table; frames and table names are paired
# by position.
for name, frame in zip(database_tables, new_db):
    frame.to_sql(name, con=engine_copy, if_exists='append', index=False)
    print(f'таблица {name} заполнена')


таблица region заполнена
таблица users заполнена
таблица domain заполнена
таблица sessions заполнена
