In [1]:
from IPython.display import HTML

# Inject the project's stylesheet into the notebook. The context manager
# closes the CSS file handle as soon as its contents have been read.
with open('assets/css/notebook.css') as css_file:
    notebook_css = css_file.read()

# Kept as the cell's last expression so the notebook renders the <style> tag.
HTML('<style type="text/css">' + notebook_css + '</style>')

**GigiSR**
<hr>

# Example notebook

## Contents 
<hr>

+ [Setup](#Setup)
+ [Example](#Example)
+ [Example from scripts](#Example-from-scripts)
+ [General testing](#General-testing)

## Setup

<hr>

In [2]:
import pickle
import os

import pandas as pd
import numpy as np
import xlrd

# Notebook-wide pandas display settings: show every column, cap the number of
# rendered rows at 12, and print floats with thousands separators and 2 decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 12),
    ('display.float_format', '{:,.2f}'.format),
):
    pd.set_option(option_name, option_value)

In [3]:
from data_curation import DataCuration

## Example
<hr>

Initialise the `DataCuration` object; this is the class we will use to do our data curation.

In [13]:
# Create the curation object used throughout this example.
# NOTE(review): the meaning of the 'a' argument is defined by DataCuration — TODO confirm and document it here.
dc = DataCuration('a')

In [14]:
def list_the_files(path):
    """Return the absolute paths of the ``.xlsx`` workbooks directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to list (not searched recursively).

    Returns
    -------
    list of str
        Absolute paths of the entries whose *name* ends in ``.xlsx``
        (case-insensitive), sorted by name so the result is reproducible.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``path`` cannot be listed.
    """
    list_files = [
        os.path.abspath(os.path.join(path, x))
        # os.listdir returns entries in arbitrary order; sort for determinism.
        for x in sorted(os.listdir(path))
        # Test the entry name's suffix, not the whole absolute path: a parent
        # folder containing '.xlsx' or a file such as 'a.xlsx.bak' must not match.
        if x.lower().endswith('.xlsx')
    ]
    return list_files


# Hand the custom lister to DataCuration for the test_scripts_1 input folder.
# NOTE(review): dc.list_files is presumably populated from list_the_files' return value — confirm against DataCuration.find_files.
dc.find_files('data/input/test_scripts_1/', function=list_the_files)

dc.list_files

['C:\\Users\\georg\\Documents\\workspace\\modules\\data_curation\\data\\input\\test_scripts_1\\A.xlsx']

In [15]:
def read_files(list_files, **kwargs):
    """Read every sheet of every Excel workbook into a dictionary of DataFrames.

    Parameters
    ----------
    list_files : list of str
        Paths of the workbooks to read.
    **kwargs
        Accepted for interface compatibility; not used by this reader.

    Returns
    -------
    dict
        Maps ``'<file name> -:- <sheet name>'`` to the parsed sheet, where
        ``<file name>`` is the lower-cased base name with a trailing
        ``.xlsx`` removed. Sheets are parsed with ``dtype=str`` and
        ``keep_default_na=False``.
    """
    dict_files = dict()
    for file in list_files:
        # os.path.basename honours the running platform's path separator,
        # whereas splitting on a backslash only works for Windows-style paths.
        file_name = os.path.basename(file).lower()
        # Strip the extension only when it is the suffix, so '.xlsx' occurring
        # elsewhere in the name is left intact.
        if file_name.endswith('.xlsx'):
            file_name = file_name[:-len('.xlsx')]
        # The context manager closes the workbook handle once all sheets are read.
        with pd.ExcelFile(file) as xl:
            for sheet in xl.sheet_names:
                key = '{} -:- {}'.format(file_name, sheet)
                dict_files[key] = xl.parse(
                    sheet_name=sheet, dtype=str, keep_default_na=False)
    return dict_files


# Hand the reader defined above to DataCuration.
# NOTE(review): presumably it is applied to dc.list_files and its result stored in dc.tables — confirm against DataCuration.reading_in.
dc.reading_in(function=read_files)

# List each table key together with the shape of its DataFrame.
[(x, dc.tables[x].shape) for x in dc.tables.keys()]

[('a -:- Sheet1', (11, 4))]

In [16]:
dc.tables['a -:- Sheet1']

Unnamed: 0,Number,A date,Another date£,StringStringString
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh


In [17]:
dict_dfs = dc.tables

In [18]:
# Replace the column headers; the table displayed below shows these four names, in this order.
dc.set_headers(['number', 'date_1', 'date_2', 'string'])

dc.tables['a -:- Sheet1']

Unnamed: 0,number,date_1,date_2,string
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh


In [19]:
# '' is the only value passed to assert_nulls.
# NOTE(review): presumably empty strings are to be treated as nulls — confirm against DataCuration.assert_nulls.
dc.assert_nulls([''])

dc.tables['a -:- Sheet1']

Unnamed: 0,number,date_1,date_2,string
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh


In [20]:
# Conversion specification: names the 'number' column, lists 'int' and 'float'
# under 'dtype', and maps key '1' to a callable that casts df[col] to int.
# NOTE(review): nothing in the visible part of this notebook passes dict_convert
# to dc, and the table displayed after this cell is unchanged — TODO confirm
# whether a conversion call is missing here.
dict_convert = {
    'columns': ['number'],
    'dtype': ['int', 'float'],
    'functions': {
        '1': lambda df, col: df[col].astype(int)
    }
}

dc.tables['a -:- Sheet1']

Unnamed: 0,number,date_1,date_2,string
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh


## Example from scripts
<hr>

In [21]:
# Load the curation object saved by the scripts run.
# SECURITY: pickle.load can execute arbitrary code, so only load pickles
# produced by this project. The context manager closes the file handle.
with open('pickles/df.pkl', 'rb') as pickle_file:
    dc = pickle.load(pickle_file)

In [22]:
dc.list_files

['C:\\Users\\georg\\Documents\\workspace\\modules\\data_curation\\data\\input\\test_scripts_1\\A.xlsx']

In [23]:
dc.tables['a -:- Sheet1']

Unnamed: 0,number,date_1,date_2,string,number_2,key_1,key_2
0,1,2019-01-01,2018-07-07,A string this is,2,A,
1,1,2019-01-01,2018-04-09,Test,2,A,
2,1,2019-01-01,2018-01-10,testing,2,A,
3,3,2019-01-01,2017-10-13,test test test,6,A,
4,4,2019-01-01,2017-07-16,,8,A,
5,5,2019-01-01,2017-04-18,,10,A,
6,6,2019-01-01,2017-01-19,Blah,12,A,
7,7,2019-01-01,2016-10-22,Dah,14,A,
8,1234,2019-01-01,2016-07-25,Doh,2468,A,
9,3,2019-01-01,2016-04-27,Boh,6,A,


In [24]:
dc.df_issues

Unnamed: 0,key_1,key_2,key_3,file,sub_file,step_number,issue_short_desc,issue_long_desc,column,issue_count,issue_idx


## General testing
<hr>

In [None]:
def test():
    """Return the constant 2; scratch function exercised by the cells below."""
    return 2

In [None]:
test()

In [None]:
type(test).__name__

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 1000 identical rows holding the literal strings 'nan' and 'NA'.
df = pd.DataFrame([('nan', 'NA')] * 1000)

In [None]:
df.replace('nan', np.nan)

In [None]:
import string

In [None]:
# Empty frames keyed 'a'..'e'; each has the first five lowercase letters as
# columns except 'd', which has only the first four.
dict_test = {
    key: pd.DataFrame(columns=list(string.ascii_lowercase[:width]))
    for key, width in zip('abcde', (5, 5, 5, 4, 5))
}

In [None]:
# Column count of every frame in dict_test, in dictionary order.
list_test = [frame.shape[1] for frame in dict_test.values()]

In [None]:
len(set(list_test))

In [None]:
# Display the frame stored under the first key of dict_test.
dict_test[list(dict_test)[0]]