# Create data

## Setup

In [1]:
import sqlite3
import pickle

import pandas as pd
import numpy as np

In [2]:
# Limit how much of a frame pandas prints: at most 10 rows and 10 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10

## Initialise DB for errors and tables

In [3]:
# Open a connection to '00_db.db' and close it again without running any
# statement. NOTE(review): presumably this only makes sure the database file
# exists before later cells query it — confirm.
cnx = sqlite3.connect('00_db.db')
cnx.close()

In [4]:
# Make sure the `df_issues` table exists in '00_db.db'.
#
# The table is created empty, with one column per issue attribute, and only
# when no table of that name is present yet, so re-running the cell leaves an
# existing table untouched.
cnx = sqlite3.connect('00_db.db')

try:
    cursor = cnx.cursor()
    # 'table' is a string literal, so it takes single quotes. Double quotes
    # denote an identifier in SQL and only work as a string through a legacy
    # SQLite fallback that can be disabled.
    cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")

    if 'df_issues' not in {row[0] for row in cursor.fetchall()}:
        # An empty frame is enough to define the schema; `to_sql` is called
        # with its defaults, exactly as before.
        pd.DataFrame(
            columns=[
                "key_1", "key_2", "key_3", "file", "sub_file", "step_number",
                "category", "issue_short_desc", "issue_long_desc", "column",
                "issue_count", "issue_idx", "grouping"
            ]
        ).to_sql('df_issues', cnx)
finally:
    # Release the connection even when the lookup or the table creation fails.
    cnx.close()

## Create tables

### Conversions

In [5]:
# Conversion fixture: five rows in which every value is stored as a string.
# The `int`, `float` and `date` columns hold text that parses as an integer,
# a decimal number and a YYYY-MM-DD calendar date respectively.
df_convert = pd.DataFrame(
    columns=['object', 'int', 'float', 'date'],
    data=[
        ['A', '1', '0.6', '2019-01-01'],
        ['B', '4', '5.2', '2019-02-05'],
        ['C', '1', '5.6', '2018-12-17'],
        ['D', '10', '15.9', '2019-07-18'],
        ['E', '-8', '4.7', '2018-03-09'],
    ],
)

In [6]:
# Conversion fixture with the same four columns but values that do not parse
# cleanly: impossible calendar dates ('2019-02-29', '2019-22-05',
# '2019-09-31'), non-integer text in `int` ('4.5', 'b'), a letter in `float`
# ('A') and a bare integer 5 in the otherwise textual `object` column.
df_convert_issues = pd.DataFrame(
    columns=['object', 'int', 'float', 'date'],
    data=[
        ['A', '1', '0.6', '2019-02-29'],
        ['B', '4.5', 'A', '2019-22-05'],
        ['C', '1', '5.6', '2018-12-17'],
        ['D', 'b', '15.9', '2019-09-31'],
        [5, '-8', '4.7', '2018-03-09'],
    ],
)

### Altering

In [7]:
# Alteration fixture: a text label (`to_map`), an integer (`add_1`) and a
# text key (`merge_key`) per row.
# NOTE(review): the column names suggest a mapping, an increment and a merge
# step downstream — confirm against the consuming notebook.
df_alterations = pd.DataFrame(
    columns=['to_map', 'add_1', 'merge_key'],
    data=[
        ['A', 2, 'key_1'],
        ['B', 199, 'key_2'],
        ['C', -1, 'key_1'],
        ['D', 20, 'key_3'],
        ['E', 6, 'key_2'],
    ],
)

In [8]:
# Alteration fixture with the same three columns but mismatched value types:
# an integer 2 among the text `merge_key` values, a list ['D'] in `to_map`
# and the string 'a' in the otherwise numeric `add_1` column.
df_alterations_issues = pd.DataFrame(
    columns=['to_map', 'add_1', 'merge_key'],
    data=[
        ['A', 2, 'key_1'],
        ['B', 199, 2],
        ['C', -1, 'key_1'],
        [['D'], 'a', 'key_3'],
        ['E', 6, 'key_2'],
    ],
)

### Checks

In [9]:
# Checks fixture: an integer `number` plus an upper-case (`category_1`) and a
# lower-case (`category_2`) single-letter label per row.
df_checks = pd.DataFrame(
    columns=['number', 'category_1', 'category_2'],
    data=[
        [3, 'A', 'a'],
        [10, 'A', 'z'],
        [9, 'B', 'b'],
        [4, 'D', 'd'],
        [7, 'C', 'c'],
    ],
)

In [10]:
# Checks fixture with the same three columns and a different set of values,
# including a negative `number` and the labels 'Z', 'Y' and 'y'.
# NOTE(review): which of these values count as issues depends on the checks
# applied downstream — confirm there.
df_checks_issues = pd.DataFrame(
    columns=['number', 'category_1', 'category_2'],
    data=[
        [1, 'Z', 'y'],
        [10, 'A', 'a'],
        [9, 'Y', 'b'],
        [4, 'B', 'b'],
        [-1, 'C', 'c'],
    ],
)

### For summary tables

In [11]:
# Summary fixture: 50 rows of two single-letter grouping labels (`str`,
# `str_2`), a `count` column that is 1 on every row, and an integer `int_max`.
# The constant `count` value is filled in while the rows are assembled, so the
# literal below only lists (str, str_2, int_max).
df_summary = pd.DataFrame(
    data=[
        (label_1, label_2, 1, max_value)
        for label_1, label_2, max_value in [
            ('b', 'c', 6),
            ('d', 'b', 9),
            ('c', 'b', 0),
            ('d', 'd', 9),
            ('c', 'b', 1),
            ('a', 'd', 3),
            ('c', 'c', 0),
            ('c', 'd', 0),
            ('c', 'c', 0),
            ('a', 'e', 4),
            ('b', 'e', 7),
            ('a', 'd', 4),
            ('b', 'e', 6),
            ('b', 'c', 8),
            ('b', 'c', 7),
            ('d', 'e', 9),
            ('a', 'b', 5),
            ('a', 'd', 5),
            ('a', 'b', 4),
            ('d', 'b', 10),
            ('b', 'c', 6),
            ('b', 'e', 7),
            ('a', 'e', 4),
            ('a', 'c', 3),
            ('c', 'c', 0),
            ('c', 'd', 2),
            ('a', 'b', 3),
            ('a', 'e', 5),
            ('a', 'c', 3),
            ('a', 'e', 4),
            ('b', 'd', 6),
            ('c', 'e', 1),
            ('b', 'e', 7),
            ('c', 'c', 0),
            ('a', 'c', 5),
            ('c', 'b', 0),
            ('d', 'b', 8),
            ('d', 'e', 10),
            ('d', 'c', 8),
            ('a', 'd', 3),
            ('d', 'e', 10),
            ('d', 'c', 8),
            ('d', 'e', 10),
            ('a', 'c', 4),
            ('d', 'b', 8),
            ('d', 'b', 10),
            ('d', 'e', 10),
            ('a', 'c', 5),
            ('a', 'd', 5),
            ('d', 'c', 10),
        ]
    ],
    columns=['str', 'str_2', 'count', 'int_max'],
)

## Write out data

In [12]:
# Persist the fixtures under data/: the conversion and alteration frames as
# tab-separated text without the index, the checks and summary frames as
# pickles.
df_convert.to_csv('data/df_convert.tsv', sep='\t', index=False)
df_convert_issues.to_csv('data/df_convert_issues.tsv', sep='\t', index=False)

df_alterations.to_csv('data/df_alterations.tsv', sep='\t', index=False)
df_alterations_issues.to_csv('data/df_alterations_issues.tsv', sep='\t', index=False)

# Each pickle is written inside a `with` block so the file handle is flushed
# and closed as soon as the dump finishes, instead of being left open.
with open('data/df_checks.pkl', 'wb') as pkl_file:
    pickle.dump(df_checks, pkl_file)
with open('data/df_checks_issues.pkl', 'wb') as pkl_file:
    pickle.dump(df_checks_issues, pkl_file)

with open('data/df_summary.pkl', 'wb') as pkl_file:
    pickle.dump(df_summary, pkl_file)

---

**GigiSR**