# Example notebook

## Contents 
<hr>

+ [Setup](#Setup)
+ [Example](#Example)
+ [Example from scripts](#Example-from-scripts)
+ [General testing](#General-testing)

## Setup

<hr>

In [1]:
import pickle
import os

import pandas as pd
import numpy as np
import xlrd

# Notebook-wide pandas display settings:
# - never hide columns when a frame is rendered,
pd.set_option('display.max_columns', None)
# - truncate long frames once they exceed 12 rows,
pd.set_option('display.max_rows', 12)
# - show floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [2]:
from data_curation import DataCuration

## Example
<hr>

Initialise the `DataCuration` object; this is the class we will use to do our data curation.

In [3]:
# NOTE(review): the meaning of the two positional 'a' arguments is not visible
# in this notebook -- confirm against the DataCuration constructor.
dc = DataCuration('a', 'a')

In [4]:
def list_the_files(path):
    """Return absolute paths of the ``.xlsx`` workbooks directly inside ``path``.

    A directory entry is kept when its *file name* (not its full path):

    * has the ``.xlsx`` extension, compared case-insensitively;
    * contains no ``~`` (presumably to skip Excel's ``~$`` temporary files);
    * is not exactly ``headers.xlsx``.

    Only the file name is inspected, so a ``~`` or ``.xlsx`` appearing in a
    parent directory of ``path`` cannot affect the result, and the
    ``headers.xlsx`` exclusion works with any path separator.

    Parameters
    ----------
    path : str
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    list of str
        Absolute file paths, in the order returned by ``os.listdir``.
    """
    list_files = []
    for name in os.listdir(path):
        is_workbook = os.path.splitext(name)[1].lower() == '.xlsx'
        if is_workbook and '~' not in name and name != 'headers.xlsx':
            list_files.append(os.path.abspath(os.path.join(path, name)))
    return list_files


# Discover the input workbooks using the custom lister defined in this cell.
dc.find_files('../data/input/test_scripts_1/', function=list_the_files)

# Show the absolute paths that were found.
dc.list_files

['C:\\Users\\georg\\Documents\\workspace\\modules\\data_curation\\data\\input\\test_scripts_1\\A.xlsx']

In [5]:
def read_files(list_files, **kwargs):
    """Read every sheet of every workbook into a dict of DataFrames.

    Each sheet is parsed with ``dtype=str`` and ``keep_default_na=False``, so
    all values arrive as strings and blank cells stay as empty strings rather
    than NaN.

    Parameters
    ----------
    list_files : list of str
        Paths of the Excel workbooks to read.
    **kwargs
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        Maps ``'<file name, lower-cased, without .xlsx> -:- <sheet name>'`` to
        the DataFrame parsed from that sheet.
    """
    dict_files = dict()
    for file in list_files:
        # Use the OS-aware base name so the key never contains directory parts.
        file_label = os.path.basename(file).lower().replace('.xlsx', '')
        # The context manager closes the workbook handle even if a parse fails.
        with pd.ExcelFile(file) as xl:
            for sheet in xl.sheet_names:
                key = '{} -:- {}'.format(file_label, sheet)
                dict_files[key] = xl.parse(
                    sheet_name=sheet, dtype=str, keep_default_na=False)
    return dict_files


# Load the discovered files with the custom reader defined in this cell.
# NOTE(review): overwrite=True presumably replaces any tables already held by
# dc -- confirm against DataCuration.reading_in.
dc.reading_in(function=read_files, overwrite=True)

# Summarise what was loaded as (table key, (rows, columns)) pairs.
[(x, dc.tables[x].shape) for x in dc.tables.keys()]

[('a -:- Sheet1', (11, 6))]

In [6]:
# Display the table as read; read_files parses every sheet with dtype=str, so
# all values here are still strings.
dc.tables['a -:- Sheet1']

Unnamed: 0,Number,A date,Another date£,StringStringString,lat,lng
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is,51.5074,0.1278
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test,51.5084,0.1268
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing,51.50939999999999,0.1258
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test,51.51039999999999,0.1248
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,,51.51139999999999,0.1238
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,,51.51239999999999,0.1227999999999999
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah,51.51339999999998,0.1217999999999999
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah,51.51439999999998,0.1207999999999999
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh,51.51539999999998,0.1197999999999999
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh,51.516399999999976,0.1187999999999999


In [7]:
# Keep a handle on the loaded tables.
# NOTE(review): this binds a second name rather than copying; if dc.tables is a
# plain attribute, later in-place changes made through dc will also be visible
# via dict_dfs -- confirm whether a copy was intended.
dict_dfs = dc.tables

In [8]:
# Replace the sheet's original column names with clean identifiers.
# NOTE(review): the list is presumably applied positionally, one name per
# column -- confirm against DataCuration.set_headers.
dc.set_headers(['number', 'date_1', 'date_2', 'string', 'lat', 'lng'])

# Display the table to check the new headers.
dc.tables['a -:- Sheet1']

Unnamed: 0,number,date_1,date_2,string,lat,lng
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is,51.5074,0.1278
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test,51.5084,0.1268
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing,51.50939999999999,0.1258
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test,51.51039999999999,0.1248
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,,51.51139999999999,0.1238
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,,51.51239999999999,0.1227999999999999
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah,51.51339999999998,0.1217999999999999
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah,51.51439999999998,0.1207999999999999
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh,51.51539999999998,0.1197999999999999
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh,51.516399999999976,0.1187999999999999


In [9]:
# Declare which raw values count as null; here only the empty string.
# NOTE(review): presumably these values are replaced with a null marker in
# every table -- confirm against DataCuration.assert_nulls.
dc.assert_nulls([''])

# Display the table again after the null handling.
dc.tables['a -:- Sheet1']

Unnamed: 0,number,date_1,date_2,string,lat,lng
0,1,2019-01-01 00:00:00,2018-07-07 00:00:00,A string this is,51.5074,0.1278
1,1,2019-01-01 00:00:00,2018-04-09 00:00:00,Test,51.5084,0.1268
2,1,2019-01-01 00:00:00,2018-01-10 00:00:00,testing,51.50939999999999,0.1258
3,3,2019-01-01 00:00:00,2017-10-13 00:00:00,test test test,51.51039999999999,0.1248
4,4,2019-01-01 00:00:00,2017-07-16 00:00:00,,51.51139999999999,0.1238
5,5,2019-01-01 00:00:00,2017-04-18 00:00:00,,51.51239999999999,0.1227999999999999
6,6,2019-01-01 00:00:00,2017-01-19 00:00:00,Blah,51.51339999999998,0.1217999999999999
7,7,2019-01-01 00:00:00,2016-10-22 00:00:00,Dah,51.51439999999998,0.1207999999999999
8,1234,2019-01-01 00:00:00,2016-07-25 00:00:00,Doh,51.51539999999998,0.1197999999999999
9,3,2019-01-01 00:00:00,2016-04-27 00:00:00,Boh,51.516399999999976,0.1187999999999999


In [10]:
# Conversion specification handed to dc.convert_columns. Each top-level entry
# names a target type and gives:
#   'columns'   - the columns to convert,
#   'dtypes'    - dtype labels for that group (semantics defined by
#                 DataCuration; not visible in this notebook),
#   'functions' - callables taking (df, col) and returning the converted column.
dict_convert = {
    'int': {
        'columns': ['number'],
        'dtypes': ['int', 'float'],
        'functions': {
            # NOTE(review): this key is the string '1' while the other groups
            # use the integer 1 -- confirm which form convert_columns expects.
            '1': lambda df, col: df[col].astype(int)
        }
    },
    'float': {
        'columns': ['lat', 'lng'],
        'dtypes': ['float'],
        'functions': {
            1: lambda df, col: df[col].astype(float)
        }
    },
    'date': {
        'columns': ['date_1', 'date_2'],
        'dtypes': ['date'],
        'functions': {
            # The values read from Excel carry a time part
            # (e.g. '2019-01-01 00:00:00'), so the format must include it; a
            # date-only format is rejected by strict parsing in pandas >= 2.0.
            1: lambda df, col: pd.to_datetime(
                df[col], format='%Y-%m-%d %H:%M:%S')
        }
    }
}

# Apply the conversion specification held in dict_convert to the loaded tables.
dc.convert_columns(dictionary=dict_convert)

# Check the resulting dtype of every column.
dc.tables['a -:- Sheet1'].dtypes

number             int32
date_1    datetime64[ns]
date_2    datetime64[ns]
string            object
lat              float64
lng              float64
dtype: object

---
**GigiSR**