# CSV Base Functionality

In [1]:
import importlib
import pandas as pd
import glob

import d6t.stack.combine_csv as d6tc

## Get sample data

In [2]:
import urllib.request
import zipfile

# Download the sample-data archive from the d6t-lib GitHub repository into the
# current working directory, keeping the same file name locally.
cfg_fname_sample = 'test-data.zip'
urllib.request.urlretrieve("https://github.com/d6t/d6t-lib/raw/master/"+cfg_fname_sample, cfg_fname_sample)

# Extract everything next to the notebook. The context manager closes the
# archive even if extraction raises, so the file handle is never leaked.
with zipfile.ZipFile(cfg_fname_sample, 'r') as zip_ref:
    zip_ref.extractall('.')

## Case: Clean files
* All files have all columns
* Commonly used CSV settings: separator ',' and a header row present

In [26]:
# Collect the "clean" monthly sample CSVs. glob.glob already returns a list,
# so no extra list() copy is needed.
cfg_fnames = glob.glob('test-data/input/test-data-input-csv-clean-*.csv')
print(cfg_fnames)

['test-data/input\\test-data-input-csv-clean-jan.csv', 'test-data/input\\test-data-input-csv-clean-feb.csv', 'test-data/input\\test-data-input-csv-clean-mar.csv']


### Detect CSV settings across all files

In [27]:
# Detect the CSV settings shared by every file in cfg_fnames. The printed
# result carries the delimiter, the number of rows to skip and the header info.
cfg_sniff = d6tc.sniff_settings_csv(cfg_fnames)
print(cfg_sniff)


{'delim': ',', 'skiprows': 0, 'has_header': True, 'header': 0}


### Preview and check columns across all files

In [28]:
# Build a combiner over the clean files and preview their columns.
# col_preview is read by key in the next cell: 'is_all_equal',
# 'df_columns_present' and 'df_columns_order'.
c = d6tc.CombinerCSV(cfg_fnames, all_strings=True) # all_strings=True makes reading faster
col_preview = c.preview_columns()

In [29]:
# Report on the column preview: equality flag, which columns each file has,
# and the position of each column per file. end='\n\n' emits the blank
# separator line after each item.
print('all columns equal?', col_preview['is_all_equal'], end='\n\n')
print('which columns are present in which files?', end='\n\n')
print(col_preview['df_columns_present'].reset_index(drop=True), end='\n\n')
print('in what order do columns appear in the files?', end='\n\n')
print(col_preview['df_columns_order'].reset_index(drop=True))

all columns equal? True

which columns are present in which files?

                            filename  cost  date profit sales
0  test-data-input-csv-clean-feb.csv  True  True   True  True
1  test-data-input-csv-clean-jan.csv  True  True   True  True
2  test-data-input-csv-clean-mar.csv  True  True   True  True

in what order do columns appear in the files?

                            filename cost date profit sales
0  test-data-input-csv-clean-feb.csv    0    1      2     3
1  test-data-input-csv-clean-jan.csv    0    1      2     3
2  test-data-input-csv-clean-mar.csv    0    1      2     3


### Preview Combine

In [30]:
# Preview the combined data: the output shows the first three rows taken from
# each input file, with a `filename` column identifying the source file.
c.combine_preview()

Unnamed: 0,cost,date,profit,sales,filename
0,-80,2011-01-01,20,100,test-data-input-csv-clean-jan.csv
1,-80,2011-01-02,20,100,test-data-input-csv-clean-jan.csv
2,-80,2011-01-03,20,100,test-data-input-csv-clean-jan.csv
0,-90,2011-02-01,110,200,test-data-input-csv-clean-feb.csv
1,-90,2011-02-02,110,200,test-data-input-csv-clean-feb.csv
2,-90,2011-02-03,110,200,test-data-input-csv-clean-feb.csv
0,-100,2011-03-01,200,300,test-data-input-csv-clean-mar.csv
1,-100,2011-03-02,200,300,test-data-input-csv-clean-mar.csv
2,-100,2011-03-03,200,300,test-data-input-csv-clean-mar.csv


### Full Combine

In [31]:
# Combine all files in full and display only the first five rows of the result.
c.combine().head()

Unnamed: 0,cost,date,profit,sales,filename
0,-80,2011-01-01,20,100,test-data-input-csv-clean-jan.csv
1,-80,2011-01-02,20,100,test-data-input-csv-clean-jan.csv
2,-80,2011-01-03,20,100,test-data-input-csv-clean-jan.csv
3,-80,2011-01-04,20,100,test-data-input-csv-clean-jan.csv
4,-80,2011-01-05,20,100,test-data-input-csv-clean-jan.csv


## Case: Mismatched columns
* One file has extra columns

In [32]:
# Collect the sample CSVs where one file carries an extra column.
# glob.glob already returns a list, so no extra list() copy is needed.
cfg_fnames = glob.glob('test-data/input/test-data-input-csv-colmismatch-*.csv')
print(cfg_fnames)

['test-data/input\\test-data-input-csv-colmismatch-jan.csv', 'test-data/input\\test-data-input-csv-colmismatch-feb.csv', 'test-data/input\\test-data-input-csv-colmismatch-mar.csv']


In [33]:
# Build a combiner over the column-mismatch files and preview their columns.
# col_preview is read by key in the next cell: 'is_all_equal',
# 'columns_unique' and 'df_columns_present'.
c = d6tc.CombinerCSV(cfg_fnames, all_strings=True) # all_strings=True makes reading faster
col_preview = c.preview_columns()

In [34]:
# Report whether all files share the same columns, list the columns that are
# not shared, and show per file whether each of those columns is present.
# end='\n\n' emits the blank separator line after each item.
print('all columns equal?', col_preview['is_all_equal'], end='\n\n')
print('which columns are unique?', col_preview['columns_unique'], end='\n\n')
print('which files have unique columns?', end='\n\n')
print(col_preview['df_columns_present'][col_preview['columns_unique']])

all columns equal? False

which columns are unique? ['profit2']

which files have unique columns?

                                                   profit2
file_path                                                 
test-data/input\test-data-input-csv-colmismatch...   False
test-data/input\test-data-input-csv-colmismatch...   False
test-data/input\test-data-input-csv-colmismatch...    True


In [35]:
# Default preview keeps every column found in any file; in the output the
# extra `profit2` column is empty for the files that do not have it.
c.combine_preview() # keep all columns

Unnamed: 0,cost,date,filename,profit,profit2,sales
0,-80,2011-01-01,test-data-input-csv-colmismatch-jan.csv,20,,100
1,-80,2011-01-02,test-data-input-csv-colmismatch-jan.csv,20,,100
2,-80,2011-01-03,test-data-input-csv-colmismatch-jan.csv,20,,100
0,-90,2011-02-01,test-data-input-csv-colmismatch-feb.csv,110,,200
1,-90,2011-02-02,test-data-input-csv-colmismatch-feb.csv,110,,200
2,-90,2011-02-03,test-data-input-csv-colmismatch-feb.csv,110,,200
0,-100,2011-03-01,test-data-input-csv-colmismatch-mar.csv,200,400.0,300
1,-100,2011-03-02,test-data-input-csv-colmismatch-mar.csv,200,400.0,300
2,-100,2011-03-03,test-data-input-csv-colmismatch-mar.csv,200,400.0,300


In [36]:
# With is_col_common=True the output drops `profit2`, leaving only the
# columns that every file has.
c.combine_preview(is_col_common=True) # keep only common columns

Unnamed: 0,cost,date,profit,sales,filename
0,-80,2011-01-01,20,100,test-data-input-csv-colmismatch-jan.csv
1,-80,2011-01-02,20,100,test-data-input-csv-colmismatch-jan.csv
2,-80,2011-01-03,20,100,test-data-input-csv-colmismatch-jan.csv
0,-90,2011-02-01,110,200,test-data-input-csv-colmismatch-feb.csv
1,-90,2011-02-02,110,200,test-data-input-csv-colmismatch-feb.csv
2,-90,2011-02-03,110,200,test-data-input-csv-colmismatch-feb.csv
0,-100,2011-03-01,200,300,test-data-input-csv-colmismatch-mar.csv
1,-100,2011-03-02,200,300,test-data-input-csv-colmismatch-mar.csv
2,-100,2011-03-03,200,300,test-data-input-csv-colmismatch-mar.csv


## Case: Wrong order columns
* One file has its columns in a different order; tools like dask or pyspark would put the column values in the wrong order

In [37]:
# Collect the sample CSVs where one file has its columns in a different order.
# glob.glob already returns a list, so no extra list() copy is needed.
cfg_fnames = glob.glob('test-data/input/test-data-input-csv-reorder-*.csv')
print(cfg_fnames)

['test-data/input\\test-data-input-csv-reorder-jan.csv', 'test-data/input\\test-data-input-csv-reorder-feb.csv', 'test-data/input\\test-data-input-csv-reorder-mar.csv']


In [38]:
# Build a combiner over the reordered-column files and preview their columns.
# col_preview is read by key in the next cell: 'is_all_equal' and
# 'df_columns_order'.
c = d6tc.CombinerCSV(cfg_fnames, all_strings=True) # all_strings=True makes reading faster
col_preview = c.preview_columns()

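Here we can see that the columns are not equal across all files: every file has the same columns, but the `mar` file has `cost` and `profit` in different positions.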

In [39]:
# Report the equality flag and the position of each column in each file.
# end='\n\n' emits the blank separator line after each item.
print('all columns equal?', col_preview['is_all_equal'], end='\n\n')
print('in what order do columns appear in the files?', end='\n\n')
print(col_preview['df_columns_order'].reset_index(drop=True))

all columns equal? False

in what order do columns appear in the files?

                              filename cost date profit sales
0  test-data-input-csv-reorder-feb.csv    2    0      3     1
1  test-data-input-csv-reorder-jan.csv    2    0      3     1
2  test-data-input-csv-reorder-mar.csv    3    0      2     1


In [40]:
# In the output the values of every file land under the correct column name,
# even though the `mar` file stores its columns in a different order.
c.combine_preview() # automatically puts it in the right order

Unnamed: 0,cost,date,filename,profit,sales
0,-80,2011-01-01,test-data-input-csv-reorder-jan.csv,20,100
1,-80,2011-01-02,test-data-input-csv-reorder-jan.csv,20,100
2,-80,2011-01-03,test-data-input-csv-reorder-jan.csv,20,100
0,-90,2011-02-01,test-data-input-csv-reorder-feb.csv,110,200
1,-90,2011-02-02,test-data-input-csv-reorder-feb.csv,110,200
2,-90,2011-02-03,test-data-input-csv-reorder-feb.csv,110,200
0,-100,2011-03-01,test-data-input-csv-reorder-mar.csv,200,300
1,-100,2011-03-02,test-data-input-csv-reorder-mar.csv,200,300
2,-100,2011-03-03,test-data-input-csv-reorder-mar.csv,200,300


# CSV Advanced: select and rename columns
Say you want to select only certain columns, or rename them, before combining. `CombinerCSVAdvanced` takes an existing combiner plus the column selection and the rename mapping.

In [50]:
# Columns to keep, listed in the order they should appear in the output.
cfg_col_sel = ['date','sales','cost','profit']
# Rename mapping {old name: new name}; here `sales` becomes `revenue`.
cfg_col_rename = {'sales':'revenue'}

In [52]:
# Wrap the existing combiner `c` (the reordered-column files) with the column
# selection and rename settings, then preview the result. The output shows the
# selected columns with `sales` renamed to `revenue`, plus `filename`.
combiner2 = d6tc.CombinerCSVAdvanced(c, cfg_col_sel, cfg_col_rename)
combiner2.combine_preview()


Unnamed: 0,filename,date,revenue,cost,profit
0,test-data-input-csv-reorder-jan.csv,2011-01-01,100,-80,20
1,test-data-input-csv-reorder-jan.csv,2011-01-02,100,-80,20
2,test-data-input-csv-reorder-jan.csv,2011-01-03,100,-80,20
0,test-data-input-csv-reorder-feb.csv,2011-02-01,200,-90,110
1,test-data-input-csv-reorder-feb.csv,2011-02-02,200,-90,110
2,test-data-input-csv-reorder-feb.csv,2011-02-03,200,-90,110
0,test-data-input-csv-reorder-mar.csv,2011-03-01,300,-100,200
1,test-data-input-csv-reorder-mar.csv,2011-03-02,300,-100,200
2,test-data-input-csv-reorder-mar.csv,2011-03-03,300,-100,200


# CSV out-of-core functionality

If your files are large, you don't want to read them all into memory and then save. Instead, you can write directly to the output file.

In [54]:
# Write the combined data straight to the given CSV path; the call returned
# True in the recorded run.
# NOTE(review): presumably the target directory must already exist - confirm.
combiner2.combine_save('test-data/output/test.csv')

True

# Excel Functionality

In [41]:
import importlib
import pandas as pd
import glob

import d6t.stack.combine_csv as d6tc
from d6t.stack.sniffer import XLSSniffer
from d6t.stack.combine_xls import XLStoCSVMultiFile
from d6t.stack.helpers import PrintLogger

In [42]:
# Collect the multi-sheet sample Excel workbooks.
# glob.glob already returns a list, so no extra list() copy is needed.
cfg_fnames = glob.glob('test-data/input/test-data-input-xls-mult-*.xlsx')
print(cfg_fnames)

['test-data/input\\test-data-input-xls-mult-jan.xlsx', 'test-data/input\\test-data-input-xls-mult-feb.xlsx', 'test-data/input\\test-data-input-xls-mult-mar.xlsx']


### Sniff Excel sheets across files

In [43]:
# finds sheets across all files
sniffer = XLSSniffer(cfg_fnames)


In [44]:
# Report sheet consistency across the workbooks, then show the first rows of
# the per-file sheet summary. end='\n\n' emits the blank separator line after
# each item.
print('all files have same sheet count?', sniffer.all_same_count(), end='\n\n')
print('all files have same sheet names?', sniffer.all_same_names(), end='\n\n')
print('all files contain sheet?', sniffer.all_contain_sheetname('Sheet1'), end='\n\n')
print('detailed dataframe', end='\n\n')
print(sniffer.df_xls_sheets.reset_index(drop=True).head())

all files have same sheet count? True

all files have same sheet names? True

all files contain sheet? True

detailed dataframe

                           file_name sheets_count sheets_idx      sheets_names
0  test-data-input-xls-mult-feb.xlsx            2     [0, 1]  [Sheet1, Sheet2]
1  test-data-input-xls-mult-jan.xlsx            2     [0, 1]  [Sheet1, Sheet2]
2  test-data-input-xls-mult-mar.xlsx            2     [0, 1]  [Sheet1, Sheet2]


### Use the print logger

In [45]:
# Logger handed to the Excel-to-CSV converter in the next section.
# NOTE(review): presumably the source of the "converting file: ..." progress
# lines in that cell's output - confirm.
logger = PrintLogger()

### Convert Excel to CSV

In [46]:
# Convert one sheet from each of (at most) the first three workbooks to CSV.
# NOTE(review): 'idx_global' with 0 presumably selects sheet index 0 in every
# file (the log output reports "sheet: 0") - confirm against XLStoCSVMultiFile.
convertor = XLStoCSVMultiFile(cfg_fnames[:3], 'idx_global', 0, logger)
# convert_all() returns the list of generated CSV paths, printed below.
files_out = convertor.convert_all()
print(files_out)

converting file: test-data-input-xls-mult-jan.xlsx | sheet: 0 ok
converting file: test-data-input-xls-mult-feb.xlsx | sheet: 0 ok
converting file: test-data-input-xls-mult-mar.xlsx | sheet: 0 ok
['test-data/input\\test-data-input-xls-mult-jan.xlsx-0.csv', 'test-data/input\\test-data-input-xls-mult-feb.xlsx-0.csv', 'test-data/input\\test-data-input-xls-mult-mar.xlsx-0.csv']
