In [69]:
import glob
import importlib
import os

import numpy as np
import pandas as pd

import d6tstack.combine_csv
from d6tstack.utils import PrintLogger
# PrintLogger instance from d6tstack.utils; none of the cells visible in this
# notebook pass it to a call.
logger = PrintLogger()

# CombinerCSV

In [35]:
# Gather the column-mismatch sample CSVs. glob.glob already returns a list,
# in whatever order the filesystem yields (no sorting is applied here).
cfg_fnames = glob.glob(
    'test-data/input/test-data-input-csv-colmismatch-*.csv'
)
print(cfg_fnames)

['test-data/input/test-data-input-csv-colmismatch-mar.csv', 'test-data/input/test-data-input-csv-colmismatch-feb.csv', 'test-data/input/test-data-input-csv-colmismatch-jan.csv']


In [36]:
# Build the combiner over the globbed sample files.
# NOTE(review): all_strings=True presumably loads every column as text -- confirm against d6tstack.
c = d6tstack.combine_csv.CombinerCSV(
    cfg_fnames,
    all_strings=True,
)

In [37]:
# Export into test-data/output/ with overwrite enabled; a later cell reads
# test-data-input-csv-colmismatch-jan-matched.csv back from this directory.
c.to_csv(
    output_dir='test-data/output/',
    overwrite=True,
)

In [38]:
# Same export with overwrite=False, run right after the overwrite=True export.
# It doesn't raise any warnings -- thought we had overwrite warnings implemented?
# TODO: confirm whether the overwrite check exists.
c.to_csv(
    output_dir='test-data/output/',
    overwrite=False,
)

In [39]:
# Inspect the first rows of the January per-file output.
# adds 4 columns for filename
# NOTE(review): the recorded output shows a single `filename` column holding
# the "-matched" output name -- confirm what "4 columns" refers to.
pd.read_csv(
    'test-data/output/test-data-input-csv-colmismatch-jan-matched.csv'
).head()

Unnamed: 0,cost,date,profit,profit2,sales,filename
0,-80,2011-01-01,20,,100,test-data-input-csv-colmismatch-jan-matched.csv
1,-80,2011-01-02,20,,100,test-data-input-csv-colmismatch-jan-matched.csv
2,-80,2011-01-03,20,,100,test-data-input-csv-colmismatch-jan-matched.csv
3,-80,2011-01-04,20,,100,test-data-input-csv-colmismatch-jan-matched.csv
4,-80,2011-01-05,20,,100,test-data-input-csv-colmismatch-jan-matched.csv


In [40]:
# Combined mode (separate_files=False) with only output_dir supplied:
# not writing a file / raising error -- the recorded run produced no output at all.
c.to_csv(
    output_dir='test-data/output/',
    separate_files=False,
)

In [41]:
# Repeat of the same combined-mode call (output_dir only): still not writing a file.
c.to_csv(
    output_dir='test-data/output/',
    separate_files=False,
)

In [42]:
# Combined mode with an explicit out_filename; the following cells read this
# file back. The recorded output is a pandas FutureWarning about sorting,
# raised at `df_all = pd.concat(dfl_all)`.
c.to_csv(
    out_filename='test-data/output/test-combined.csv',
    separate_files=False,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  df_all = pd.concat(dfl_all)


In [43]:
# ok -- the combined file exists; show its first rows (March data in the recorded run).
pd.read_csv(
    'test-data/output/test-combined.csv'
).head()

Unnamed: 0,cost,date,filename,profit,profit2,sales
0,-100,2011-03-01,test-data-input-csv-colmismatch-mar.csv,200,400.0,300
1,-100,2011-03-02,test-data-input-csv-colmismatch-mar.csv,200,400.0,300
2,-100,2011-03-03,test-data-input-csv-colmismatch-mar.csv,200,400.0,300
3,-100,2011-03-04,test-data-input-csv-colmismatch-mar.csv,200,400.0,300
4,-100,2011-03-05,test-data-input-csv-colmismatch-mar.csv,200,400.0,300


In [44]:
# Last rows of the combined file (January data in the recorded run; profit2 is empty there).
pd.read_csv(
    'test-data/output/test-combined.csv'
).tail()

Unnamed: 0,cost,date,filename,profit,profit2,sales
25,-80,2011-01-06,test-data-input-csv-colmismatch-jan.csv,20,,100
26,-80,2011-01-07,test-data-input-csv-colmismatch-jan.csv,20,,100
27,-80,2011-01-08,test-data-input-csv-colmismatch-jan.csv,20,,100
28,-80,2011-01-09,test-data-input-csv-colmismatch-jan.csv,20,,100
29,-80,2011-01-10,test-data-input-csv-colmismatch-jan.csv,20,,100


In [45]:
# add is_col_common to pass through
# Recorded result: TypeError -- to_csv() does not accept the keyword 'is_col_common'.
c.to_csv(
    out_filename='test-data/output/test-combined.csv',
    separate_files=False,
    is_col_common=False,
)

TypeError: to_csv() got an unexpected keyword argument 'is_col_common'

In [46]:
# how do I do streaming?
# Recorded result: TypeError -- to_csv() does not accept the keyword 'streaming'.
c.to_csv(
    out_filename='test-data/output/test-combined.csv',
    separate_files=False,
    streaming=True,
)

TypeError: to_csv() got an unexpected keyword argument 'streaming'

In [56]:
# Attempt a parquet export from the basic combiner.
# Recorded result: AttributeError -- 'CombinerCSV' object has no attribute 'to_parquet'.
c.to_parquet(output_dir='test-data/output/')

AttributeError: 'CombinerCSV' object has no attribute 'to_parquet'

In [61]:
# Load the combined data into the SQL table 'testd6tstack' (recorded return value: True).
# The connection URI is read from D6TSTACK_TEST_SQL_URI when that variable is
# set, so credentials do not have to be edited into the notebook; the fallback
# is the original local test URI, so the cell still runs unchanged.
c.to_sql(
    os.environ.get('D6TSTACK_TEST_SQL_URI', 'mysql+mysqlconnector://testusr:testusr@localhost/test'),
    'testd6tstack',
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  df_all = pd.concat(dfl_all)


True

In [62]:
from sqlalchemy.engine import create_engine

# Open a connection used by the following cells to read the table back.
# The URI is read from D6TSTACK_TEST_SQL_URI when set, so credentials do not
# have to be edited into the notebook; the fallback is the original local test URI.
# NOTE(review): the connection is never closed in the cells visible here.
sqlcnxn = create_engine(
    os.environ.get('D6TSTACK_TEST_SQL_URI', 'mysql+mysqlconnector://testusr:testusr@localhost/test')
).connect()

In [66]:
# First rows of the 'testd6tstack' table; the recorded output includes an
# extra 'index' column alongside the data columns.
pd.read_sql_table(
    'testd6tstack',
    sqlcnxn,
).head()

Unnamed: 0,index,cost,date,filename,profit,profit2,sales
0,0,-100,2011-03-01,test-data-input-csv-colmismatch-mar.csv,200,400,300
1,1,-100,2011-03-02,test-data-input-csv-colmismatch-mar.csv,200,400,300
2,2,-100,2011-03-03,test-data-input-csv-colmismatch-mar.csv,200,400,300
3,3,-100,2011-03-04,test-data-input-csv-colmismatch-mar.csv,200,400,300
4,4,-100,2011-03-05,test-data-input-csv-colmismatch-mar.csv,200,400,300


In [67]:
# Last rows of the 'testd6tstack' table; in the recorded output the 'index'
# column reads 5-9 for overall rows 25-29, i.e. it is not a running row number.
pd.read_sql_table(
    'testd6tstack',
    sqlcnxn,
).tail()

Unnamed: 0,index,cost,date,filename,profit,profit2,sales
25,5,-80,2011-01-06,test-data-input-csv-colmismatch-jan.csv,20,,100
26,6,-80,2011-01-07,test-data-input-csv-colmismatch-jan.csv,20,,100
27,7,-80,2011-01-08,test-data-input-csv-colmismatch-jan.csv,20,,100
28,8,-80,2011-01-09,test-data-input-csv-colmismatch-jan.csv,20,,100
29,9,-80,2011-01-10,test-data-input-csv-colmismatch-jan.csv,20,,100


# CombinerCSVAdvanced.to_csv()

In [51]:
# Wrap the basic combiner in the advanced one: pass every column reported by
# preview_columns()['columns_all'] plus a profit2 -> profit3 mapping (the
# recorded preview shows a profit3 column in place of profit2).
combiner2 = d6tstack.combine_csv.CombinerCSVAdvanced(
    c,
    c.preview_columns()['columns_all'],
    {'profit2':'profit3'},
)
combiner2.preview_combine()

Unnamed: 0,date,profit3,sales,cost,profit,filename
0,2011-03-01,400.0,300,-100,200,test-data-input-csv-colmismatch-mar.csv
1,2011-03-02,400.0,300,-100,200,test-data-input-csv-colmismatch-mar.csv
2,2011-03-03,400.0,300,-100,200,test-data-input-csv-colmismatch-mar.csv
0,2011-02-01,,200,-90,110,test-data-input-csv-colmismatch-feb.csv
1,2011-02-02,,200,-90,110,test-data-input-csv-colmismatch-feb.csv
2,2011-02-03,,200,-90,110,test-data-input-csv-colmismatch-feb.csv
0,2011-01-01,,100,-80,20,test-data-input-csv-colmismatch-jan.csv
1,2011-01-02,,100,-80,20,test-data-input-csv-colmismatch-jan.csv
2,2011-01-03,,100,-80,20,test-data-input-csv-colmismatch-jan.csv


In [53]:
# bug?? Combined export through the advanced combiner.
# Recorded result: TypeError -- combine_save() got an unexpected keyword argument 'parquet_output'.
combiner2.to_csv(
    out_filename='test-data/output/test-combined.csv',
    separate_files=False,
)

TypeError: combine_save() got an unexpected keyword argument 'parquet_output'

In [54]:
# bug?? Same export with separate_files=True.
# Recorded result: TypeError -- align_save() got an unexpected keyword argument 'parquet_output'.
combiner2.to_csv(
    out_filename='test-data/output/test-combined.csv',
    separate_files=True,
)

TypeError: align_save() got an unexpected keyword argument 'parquet_output'

In [58]:
# Parquet export through the advanced combiner.
# Recorded result: TypeError -- align_save() got an unexpected keyword argument 'parquet_output'.
combiner2.to_parquet(output_dir='test-data/output/')

TypeError: align_save() got an unexpected keyword argument 'parquet_output'

# DEBUG: large files

In [70]:
# Collect the large US_Factors z-score text files in sorted order.
# sorted() keeps the paths as plain ``str``; the earlier list(np.sort(...))
# produced numpy string scalars, which newer NumPy prints as np.str_('...').
# NOTE(review): machine-specific absolute path -- the cell only finds files
# where this mount exists.
cfg_fnames = sorted(glob.glob('/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_*.txt'))
print(len(cfg_fnames))
print(cfg_fnames)

9
['/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20140401_20140430_D.txt', '/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20140501_20140829_D.txt', '/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20140901_20141231_D.txt', '/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20150101_20150630_D.txt', '/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20150701_20151231_D.txt', '/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20160101_20160630_D.txt', '/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20160701_20161230_D.txt', '/m

In [71]:
# Narrow the file list to the single 2015-01..2015-06 file for debugging.
# sorted() keeps the paths as plain ``str``; the earlier list(np.sort(...))
# produced numpy string scalars, which newer NumPy prints as np.str_('...').
# NOTE(review): machine-specific absolute path -- the cell only finds the file
# where this mount exists.
cfg_fnames = sorted(glob.glob('/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20150101_20150630_D.txt'))
print(len(cfg_fnames))
print(cfg_fnames)

1
['/mnt/data/dev/ubs-alphahack2017-shared/data-raw/ihs/US_Factors_Zscores/US_Factors_TotalCap_Cusip_Zscore_Historical_20150101_20150630_D.txt']


In [72]:
# Rebind `c` to a combiner over the single large file selected above; this
# replaces the combiner built for the sample CSVs earlier in the notebook.
c = d6tstack.combine_csv.CombinerCSV(
    cfg_fnames,
    all_strings=True,
)

In [73]:
# Recorded result: True (only one input file is loaded at this point).
# NOTE(review): presumably reports whether all input files share the same columns -- confirm against d6tstack.
c.is_all_equal()

True

In [31]:
# how do I do streaming? this loads everything into memory...
# The recorded run was stopped by hand (KeyboardInterrupt).
c.to_csv(
    out_filename='test-data/output/test-combined.csv',
    separate_files=False,
)

KeyboardInterrupt: 

In [75]:
# Streamed load of the large file into the 'testd6tstack' table; the recorded
# run was stopped by hand (KeyboardInterrupt).
# The connection URI is read from D6TSTACK_TEST_SQL_URI when that variable is
# set, so credentials do not have to be edited into the notebook; the fallback
# is the original local test URI, so the cell still runs unchanged.
c.to_sql_stream(
    os.environ.get('D6TSTACK_TEST_SQL_URI', 'mysql+mysqlconnector://testusr:testusr@localhost/test'),
    'testd6tstack',
)

KeyboardInterrupt: 