-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #380 from datosgobar/xlsx-dumps
Refactor xlsx dumps
- Loading branch information
Showing
10 changed files
with
202 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
92 changes: 46 additions & 46 deletions
92
series_tiempo_ar_api/apps/dump/generator/xlsx/generator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,68 +1,68 @@ | ||
import csv | ||
import io | ||
import os | ||
|
||
from django.core.files import File | ||
from xlsxwriter.workbook import Workbook | ||
|
||
from series_tiempo_ar_api.apps.dump.generator.generator import remove_old_dumps | ||
from series_tiempo_ar_api.apps.dump.generator.xlsx.workbook import DumpWorkbook | ||
from series_tiempo_ar_api.apps.dump.models import DumpFile, GenerateDumpTask | ||
from series_tiempo_ar_api.utils import read_file_as_csv | ||
|
||
|
||
def read_file_as_csv(file): | ||
ios = io.StringIO() | ||
ios.write(file.read().decode('utf-8')) | ||
class XLSXWriter: | ||
multiple_sheets = { | ||
DumpFile.FILENAME_VALUES: True, | ||
DumpFile.FILENAME_FULL: True, | ||
DumpFile.FILENAME_METADATA: False, | ||
DumpFile.FILENAME_SOURCES: False, | ||
} | ||
|
||
ios.seek(0) | ||
reader = csv.reader(ios) | ||
return reader | ||
def __init__(self, task: GenerateDumpTask, dump_file: DumpFile, workbook_class=DumpWorkbook): | ||
self.workbook_class = workbook_class | ||
self.task = task | ||
self.csv_dump_file = dump_file | ||
self.frequency_column_index = None | ||
self.worksheets = {} | ||
|
||
def write(self): | ||
try: | ||
self.csv_to_xlsx() | ||
except IOError as e: | ||
catalog = self.csv_dump_file.node or 'global' | ||
msg = f"Error escribiendo dump XLSX de dump {catalog} {self.csv_dump_file.file_name}: {e.__class__}: {e}" | ||
GenerateDumpTask.info(self.task, msg) | ||
|
||
def csv_to_xlsx(dump_file: DumpFile): | ||
xlsx = f'{dump_file.file_name}-{dump_file.id}.xlsx' | ||
workbook = Workbook(xlsx) | ||
worksheet = workbook.add_worksheet() | ||
sheet_count = 0 | ||
with dump_file.file as f: | ||
reader = read_file_as_csv(f) | ||
for r, row in enumerate(reader): | ||
if sheet_count > 1000000: | ||
worksheet = workbook.add_worksheet() | ||
sheet_count = 0 | ||
def csv_to_xlsx(self): | ||
"""Escribe el dump en XLSX en un archivo temporal, luego lo guarda en el storage, | ||
por último borra el archivo temporal. Se debe hacer así porque """ | ||
xlsx = self.xlsx_file_name() | ||
with self.csv_dump_file.file as f: | ||
reader = read_file_as_csv(f) | ||
header_row = next(reader) | ||
|
||
for c, col in enumerate(row): | ||
worksheet.write(sheet_count, c, col) | ||
workbook = self.workbook_class(xlsx, | ||
header_row=header_row, | ||
split_by_frequency=self.multiple_sheets[self.csv_dump_file.file_name]) | ||
|
||
sheet_count += 1 | ||
workbook.close() | ||
for row in reader: | ||
workbook.write_row(row) | ||
|
||
with open(xlsx, 'rb') as f: | ||
dump_file.task.dumpfile_set.create(file_name=dump_file.file_name, | ||
file_type=DumpFile.TYPE_XLSX, | ||
node=dump_file.node, | ||
file=File(f)) | ||
workbook.close() | ||
|
||
os.remove(xlsx) | ||
with open(xlsx, 'rb') as f: | ||
self.task.dumpfile_set.create(file_name=self.csv_dump_file.file_name, | ||
file_type=DumpFile.TYPE_XLSX, | ||
node=self.csv_dump_file.node, | ||
file=File(f)) | ||
|
||
os.remove(xlsx) | ||
|
||
def generate(task: GenerateDumpTask, node: str = None): | ||
dumps_qs = DumpFile.objects.all() | ||
if node: | ||
dumps_qs = dumps_qs.filter(node__catalog_id=node) | ||
def xlsx_file_name(self): | ||
return f'{self.csv_dump_file.file_name}-{self.csv_dump_file.id}.{DumpFile.TYPE_XLSX}' | ||
|
||
dumps = [] | ||
for dump_name, _ in DumpFile.FILENAME_CHOICES: | ||
dump_file = dumps_qs.filter(file_name=dump_name, file_type=DumpFile.TYPE_CSV).last() | ||
if dump_file is not None: | ||
dumps.append(dump_file) | ||
|
||
for dump in dumps: | ||
try: | ||
csv_to_xlsx(dump) | ||
except Exception as e: | ||
catalog = node or 'global' | ||
msg = f"Error escribiendo dump XLSX de dump {catalog} {dump.file_name}: {e.__class__}: {e}" | ||
GenerateDumpTask.info(task, msg) | ||
def generate(task: GenerateDumpTask, node: str = None, workbook_class=DumpWorkbook): | ||
for dump in DumpFile.get_last_of_type(DumpFile.TYPE_CSV, node): | ||
XLSXWriter(task, dump, workbook_class).write() | ||
|
||
for filename, _ in DumpFile.FILENAME_CHOICES: | ||
remove_old_dumps(filename, DumpFile.TYPE_CSV, node) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from xlsxwriter import Workbook | ||
|
||
from series_tiempo_ar_api.apps.dump.generator.xlsx.worksheet import DumpWorksheet, SingleWorksheet | ||
|
||
|
||
class DumpWorkbook:
    """XLSX workbook for a dump, written row by row.

    Wraps an ``xlsxwriter`` ``Workbook``. When ``split_by_frequency`` is true,
    each row is routed to a worksheet chosen by the value found in the
    ``indice_tiempo_frecuencia`` column of that row; otherwise every row goes
    to one single worksheet. Worksheets are created on first use and always
    receive ``header_row`` as their first row.
    """

    frequency_col_name = 'indice_tiempo_frecuencia'

    # Worksheet base name for each value of the frequency column.
    frequency_sheet_names = {
        'R/P1Y': 'anual',
        'R/P6M': 'semestral',
        'R/P3M': 'trimestral',
        'R/P1M': 'mensual',
        'R/P1D': 'diaria',
    }

    def __init__(self, filename: str, header_row: list, split_by_frequency=False):
        """Create the workbook.

        Args:
            filename: path handed to the underlying ``Workbook``.
            header_row: column names, written on top of every worksheet.
            split_by_frequency: write one worksheet per frequency value
                instead of a single worksheet.
        """
        self.workbook = Workbook(filename)
        self.split_by_frequency = split_by_frequency

        self.header_row = header_row
        # Always defined (None when the header lacks the frequency column), so
        # write_row can report the problem instead of hitting AttributeError.
        self.frequency_column_index = self._find_frequency_column(header_row)

        self.sheets = {}
        self.single_sheet = None

    def _find_frequency_column(self, header_row):
        """Return the index of the first frequency column, or None if absent."""
        for i, col_name in enumerate(header_row):
            if col_name == self.frequency_col_name:
                return i
        return None

    def add_worksheet(self, sheet_name, frequency):
        """Register a worksheet for ``frequency`` and write the header to it."""
        self.sheets[frequency] = DumpWorksheet(self.workbook, sheet_name)
        self.sheets[frequency].write_row(self.header_row)

    def write_row(self, row):
        """Append ``row`` to the worksheet it belongs to.

        Raises:
            ValueError: splitting by frequency was requested but the header
                row has no ``indice_tiempo_frecuencia`` column.
            KeyError: the row's frequency has no known worksheet name.
        """
        if self.split_by_frequency:
            if self.frequency_column_index is None:
                raise ValueError(
                    f"Cannot split by frequency: header row has no "
                    f"'{self.frequency_col_name}' column"
                )
            frequency = row[self.frequency_column_index]
            if frequency not in self.sheets:
                self.init_worksheet(frequency)
            self.sheets[frequency].write_row(row)
            return

        # The single worksheet is created lazily, on the first data row.
        if self.single_sheet is None:
            self.single_sheet = SingleWorksheet(self.workbook)
            self.single_sheet.write_row(self.header_row)

        self.single_sheet.write_row(row)

    def close(self):
        """Close the underlying workbook."""
        self.workbook.close()

    def init_worksheet(self, frequency: str):
        """Create the worksheet for ``frequency`` (KeyError if it is unknown)."""
        sheet_name = self.frequency_sheet_names[frequency]
        self.add_worksheet(sheet_name, frequency)
36 changes: 36 additions & 0 deletions
36
series_tiempo_ar_api/apps/dump/generator/xlsx/worksheet.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from xlsxwriter import Workbook | ||
|
||
|
||
class DumpWorksheet:
    """Logical worksheet that spills over into several numbered sheets.

    Rows are appended one at a time. The underlying sheets are named
    ``<name>-<n>`` with ``n`` starting at 1; a new one is started whenever the
    current sheet already holds ``MAX_ROWS_PER_SHEET`` rows.
    """

    MAX_ROWS_PER_SHEET = 1000000

    def __init__(self, workbook: Workbook, name: str):
        """Bind to ``workbook`` and create the first sheet, ``<name>-1``."""
        self.name = name
        self.workbook = workbook
        self.sheet_count = 0
        self.current_row = 0
        self.current_sheet = None

        self.init_worksheet()

    def write_row(self, row: list):
        """Write ``row`` from column 0 of the next free row, rolling over if full."""
        # Roll over lazily, right before writing, so a sheet never exceeds
        # MAX_ROWS_PER_SHEET rows and no empty trailing sheet is created.
        if self.current_row >= self.MAX_ROWS_PER_SHEET:
            self.init_worksheet()

        self.current_sheet.write_row(self.current_row, 0, row)
        self.current_row += 1

    def init_worksheet(self):
        """Start the next numbered sheet and make it the current one."""
        self.sheet_count += 1
        # Restart the row index: without this, rows after a rollover would be
        # written at ever-growing offsets and each one would open a new sheet.
        self.current_row = 0
        sheet_name = f'{self.name}-{self.sheet_count}'
        self.current_sheet = self.workbook.add_worksheet(sheet_name)
|
||
|
||
class SingleWorksheet:
    """Append-only writer over one worksheet of a workbook."""

    def __init__(self, workbook: Workbook, name: str = None):
        """Add a worksheet to ``workbook``; ``name`` is passed through as given."""
        self.current_row = 0
        sheet = workbook.add_worksheet(name)
        self.current_sheet = sheet

    def write_row(self, row: list):
        """Write ``row`` from column 0 of the next free row, then advance."""
        row_index = self.current_row
        self.current_sheet.write_row(row_index, 0, row)
        self.current_row = row_index + 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6 changes: 4 additions & 2 deletions
6
series_tiempo_ar_api/apps/dump/management/commands/generate_xlsx_dump.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
#! coding: utf-8 | ||
from django.core.management import BaseCommand | ||
|
||
from series_tiempo_ar_api.apps.dump.tasks import enqueue_xlsx_dump_task | ||
from series_tiempo_ar_api.apps.dump.models import GenerateDumpTask | ||
from series_tiempo_ar_api.apps.dump.tasks import enqueue_dump_task | ||
|
||
|
||
class Command(BaseCommand): | ||
def handle(self, *args, **options): | ||
enqueue_xlsx_dump_task() | ||
task = GenerateDumpTask.objects.create(file_type=GenerateDumpTask.TYPE_XLSX) | ||
enqueue_dump_task(task) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters