Skip to content

Commit

Permalink
Merge pull request #777 from cwida/codecleanup
Browse files Browse the repository at this point in the history
Code Cleanup and re-add Unity Builds
  • Loading branch information
Mytherin committed Jul 22, 2020
2 parents 989c1b3 + bc4c6c8 commit 981e7cd
Show file tree
Hide file tree
Showing 275 changed files with 666 additions and 393 deletions.
72 changes: 72 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,78 @@ include_directories(third_party/utf8proc/include)
include_directories(third_party/miniparquet)
include_directories(third_party/concurrentqueue)

# TODO: only regenerate the unity-build (ub) file if one of the input files changed — hack alert
# Generates a "unity build" translation unit (ub_<suffix>.cpp) that #includes
# every file in the given source list, and marks the individual sources as
# HEADER_FILE_ONLY so that only the generated unit is compiled.
#
# Arguments:
#   UB_SUFFIX            - unique suffix for the generated ub_<suffix>.cpp
#   SOURCE_VARIABLE_NAME - name of the caller's list variable holding the
#                          sources; the generated file is appended to it
#                          (via PARENT_SCOPE) on return.
function(enable_unity_build UB_SUFFIX SOURCE_VARIABLE_NAME)
  set(files ${${SOURCE_VARIABLE_NAME}})

  # Unique filename for the unity build translation unit, plus a scratch file
  # used to detect whether the generated content actually changed.
  set(unit_build_file ${CMAKE_CURRENT_BINARY_DIR}/ub_${UB_SUFFIX}.cpp)
  set(temp_unit_build_file ${CMAKE_CURRENT_BINARY_DIR}/ub_${UB_SUFFIX}.cpp.tmp)

  # Exclude the individual translation units from compilation; they are only
  # compiled through the generated unity file.
  set_source_files_properties(${files}
                              PROPERTIES
                              HEADER_FILE_ONLY
                              true)

  # Rebuild if any source file is newer than the existing unity file, so a
  # touched source bumps the unity file's timestamp even when its text is
  # unchanged.
  set(rebuild FALSE)
  foreach(source_file ${files})
    if(${CMAKE_CURRENT_SOURCE_DIR}/${source_file} IS_NEWER_THAN
       ${unit_build_file})
      set(rebuild TRUE)
    endif()
  endforeach()

  # Generate the unity content into the scratch file first.
  # NOTE(review): "#line 0" is out of range per the C/C++ standard (line
  # numbers start at 1); compilers accept it but may warn under -pedantic.
  file(WRITE ${temp_unit_build_file} "// Unity Build generated by CMake\n")
  foreach(source_file ${files})
    file(
      APPEND ${temp_unit_build_file}
      "#line 0 \"${source_file}\"\n#include <${CMAKE_CURRENT_SOURCE_DIR}/${source_file}>\n"
    )
  endforeach()

  # compare_files returns 0 when the files are byte-identical; any other
  # result (different content, or the unity file does not exist yet) means
  # we must rebuild.
  execute_process(COMMAND ${CMAKE_COMMAND}
                          -E
                          compare_files
                          ${unit_build_file}
                          ${temp_unit_build_file}
                  RESULT_VARIABLE compare_result
                  OUTPUT_QUIET
                  ERROR_QUIET)
  if(NOT compare_result EQUAL 0)
    set(rebuild TRUE)
  endif()

  if(rebuild)
    # Replace the unity file with the freshly generated content instead of
    # writing the identical content a second time (as the old code did).
    file(RENAME ${temp_unit_build_file} ${unit_build_file})
  else()
    # Content unchanged and no source is newer: keep the old file (and its
    # timestamp) and clean up the scratch file instead of leaving it behind.
    file(REMOVE ${temp_unit_build_file})
  endif()

  # Append the generated unity file to the caller's source list.
  set(${SOURCE_VARIABLE_NAME}
      ${${SOURCE_VARIABLE_NAME}} ${unit_build_file}
      PARENT_SCOPE)
endfunction()

# Adds a library whose sources are optionally funnelled through a unity build
# (see enable_unity_build) unless DISABLE_UNITY is set.
#
# Arguments:
#   NAME - target name
#   MODE - library mode (callers pass OBJECT); previously this parameter was
#          accepted but ignored and OBJECT was hard-coded — now it is honored
#   ARGN - source files
function(add_library_unity NAME MODE)
  set(SRCS ${ARGN})
  if(NOT DISABLE_UNITY)
    # Rewrites SRCS: sources become header-only, the generated ub file is added.
    enable_unity_build(${NAME} SRCS)
  endif()
  add_library(${NAME} ${MODE} ${SRCS})
endfunction()

function(disable_target_warnings NAME)
if(MSVC)
target_compile_options(${NAME} PRIVATE "/W0")
Expand Down
22 changes: 12 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,32 @@ imdb: third_party/imdb/data
GENERATOR=
FORCE_COLOR=
WARNINGS_AS_ERRORS=
DISABLE_UNITY_FLAG=
ifeq ($(GEN),ninja)
GENERATOR=-G "Ninja"
FORCE_COLOR=-DFORCE_COLORED_OUTPUT=1
endif
ifeq (${TREAT_WARNINGS_AS_ERRORS}, 1)
WARNINGS_AS_ERRORS=-DTREAT_WARNINGS_AS_ERRORS=1
endif

EXTENSIONS="-DBUILD_PARQUET_EXTENSION=TRUE -DBUILD_ICU_EXTENSION=TRUE"

ifeq (${DISABLE_UNITY}, 1)
DISABLE_UNITY_FLAG=-DDISABLE_UNITY=1
endif
EXTENSIONS=-DBUILD_PARQUET_EXTENSION=TRUE -DBUILD_ICU_EXTENSION=TRUE

clean:
rm -rf build

debug:
mkdir -p build/debug && \
cd build/debug && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake --build .

release_expanded:
mkdir -p build/release_expanded && \
cd build/release_expanded && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release ../.. && \
cmake --build .

unittest: debug
Expand All @@ -52,34 +54,34 @@ release:
mkdir -p build/release && \
python scripts/amalgamation.py && \
cd build/release && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 ../.. && \
cmake --build .

reldebug:
mkdir -p build/reldebug && \
cd build/reldebug && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=RelWithDebInfo ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=RelWithDebInfo ../.. && \
cmake --build .

amaldebug:
mkdir -p build/amaldebug && \
python scripts/amalgamation.py && \
cd build/amaldebug && \
cmake $(GENERATOR) $(FORCE_COLOR) -DAMALGAMATION_BUILD=1 ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${EXTENSIONS} -DAMALGAMATION_BUILD=1 -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake --build .

jdbc:
mkdir -p build/release && \
python scripts/amalgamation.py && \
cd build/release && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 -DJDBC_DRIVER=1 ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 -DJDBC_DRIVER=1 ../.. && \
cmake --build .


jdbcdebug:
mkdir -p build/jdbcdebug && \
cd build/jdbcdebug && \
cmake $(GENERATOR) $(FORCE_COLOR) -DJDBC_DRIVER=1 -DENABLE_SANITIZER=FALSE ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${EXTENSIONS} -DJDBC_DRIVER=1 -DENABLE_SANITIZER=FALSE -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake --build .

test_compile: # test compilation of individual cpp files
Expand Down
9 changes: 5 additions & 4 deletions extension/parquet/parquet-extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <fstream>
#include <cstring>
#include <iostream>
#include <sstream>

#include "parquet-extension.hpp"

Expand Down Expand Up @@ -422,7 +423,7 @@ class ParquetScanFunction : public TableFunction {
for (idx_t i = 0; i < count; i++) {
if (col_data.defined_buf.ptr[i]) {
auto offset = col_data.offset_buf.read<int32_t>();
if (offset > col_data.dict_size) {
if ((idx_t) offset > col_data.dict_size) {
throw runtime_error("Offset " + to_string(offset) + " greater than dictionary size " +
to_string(col_data.dict_size) + " at " + to_string(i + target_offset) +
". Corrupt file?");
Expand Down Expand Up @@ -685,7 +686,7 @@ class ParquetScanFunction : public TableFunction {
data.current_group++;
data.group_offset = 0;

if (data.current_group == data.file_meta_data.row_groups.size()) {
if ((idx_t) data.current_group == data.file_meta_data.row_groups.size()) {
data.finished = true;
return;
}
Expand Down Expand Up @@ -789,7 +790,7 @@ class ParquetScanFunction : public TableFunction {
for (idx_t i = 0; i < current_batch_size; i++) {
if (col_data.defined_buf.ptr[i]) {
auto offset = col_data.offset_buf.read<int32_t>();
if (offset >= col_data.string_collection->count) {
if ((idx_t) offset >= col_data.string_collection->count) {
throw runtime_error("string dictionary offset out of bounds");
}
auto &chunk = col_data.string_collection->chunks[offset / STANDARD_VECTOR_SIZE];
Expand All @@ -815,7 +816,7 @@ class ParquetScanFunction : public TableFunction {
// bit packed this
auto target_ptr = FlatVector::GetData<bool>(output.data[out_col_idx]);
int byte_pos = 0;
for (int32_t i = 0; i < current_batch_size; i++) {
for (idx_t i = 0; i < current_batch_size; i++) {
if (!col_data.defined_buf.ptr[i]) {
FlatVector::SetNull(output.data[out_col_idx], i + output_offset, true);
continue;
Expand Down
5 changes: 4 additions & 1 deletion scripts/amalgamation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# this script creates a single header + source file combination out of the DuckDB sources
import os, re, sys, shutil
import os
import re
import sys
import shutil
amal_dir = os.path.join('src', 'amalgamation')
header_file = os.path.join(amal_dir, "duckdb.hpp")
source_file = os.path.join(amal_dir, "duckdb.cpp")
Expand Down
7 changes: 6 additions & 1 deletion scripts/asset-upload.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import json, os, sys, glob, mimetypes, urllib.request
import json
import os
import sys
import glob
import mimetypes
import urllib.request

api_url = 'https://api.github.com/repos/cwida/duckdb/'

Expand Down
4 changes: 3 additions & 1 deletion scripts/generate_grammar.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# use bison to generate the parser files
# the following version of bison is used:
# bison (GNU Bison) 2.3
import os, subprocess, re
import os
import subprocess
import re

bison_location = "bison"
base_dir = 'third_party/libpg_query/grammar'
Expand Down
75 changes: 75 additions & 0 deletions scripts/include_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import amalgamation
import os
import re
import sys
import shutil

include_counts = {}
include_chains = {}
cached_includes = {}

def analyze_include_file(fpath, already_included_files, prev_include = ""):
    # Recursively tally include statistics for one file.
    #   fpath                  - path of the file being analyzed
    #   already_included_files - files already visited in this traversal
    #   prev_include           - header that pulled this file in ("" = reached
    #                            directly from a translation unit)
    # Skip files already seen on this traversal, and files the amalgamation
    # always excludes.
    if fpath in already_included_files:
        return
    if fpath in amalgamation.always_excluded:
        return

    # Resolve this file's include list, parsing each file at most once.
    includes = cached_includes.get(fpath)
    if includes is None:
        with open(fpath, 'r') as f:
            text = f.read()
        (statements, includes) = amalgamation.get_includes(fpath, text)
        cached_includes[fpath] = includes

    # Count how many times this file is (transitively) reached.
    include_counts[fpath] = include_counts.get(fpath, 0) + 1

    # Record which header included us, keyed by the includer's path.
    chain = include_chains.setdefault(fpath, {})
    chain[prev_include] = chain.get(prev_include, 0) + 1

    already_included_files.append(fpath)
    # Headers become the "previous include" for everything they pull in;
    # non-headers keep whatever includer was passed down.
    if fpath.endswith('.h') or fpath.endswith('.hpp'):
        prev_include = fpath
    for include in includes:
        analyze_include_file(include, already_included_files, prev_include)

def analyze_includes(dir):
    # Walk the directory tree rooted at `dir`, feeding every C/C++ translation
    # unit (.cpp/.c/.cc) to analyze_include_file; excluded files are skipped.
    for fname in sorted(os.listdir(dir)):
        if fname in amalgamation.excluded_files:
            continue
        fpath = os.path.join(dir, fname)
        if os.path.isdir(fpath):
            analyze_includes(fpath)
        elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
            analyze_include_file(fpath, [])

# Analyze every compile directory, then report the most frequently included
# files together with their most common direct includers.
for compile_dir in amalgamation.compile_directories:
    analyze_includes(compile_dir)

# Rank files by how often they were (transitively) included, most first.
ranked = sorted(include_counts.items(), key=lambda item: -item[1])

for include_file, include_count in ranked:
    print("------------------------------------------------------------")
    print(include_file + " (" + str(include_count) + ")")
    print("------------------------------------------------------------")
    print("FILE INCLUDED FROM:")
    # Show at most the five most common direct includers of this file.
    chains = sorted(include_chains[include_file].items(),
                    key=lambda item: -item[1])
    for includer, count in chains[:5]:
        print([includer, count])

2 changes: 1 addition & 1 deletion scripts/test_vector_sizes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def replace_in_file(fname, regex, replace):

for vector_size in vector_sizes:
print("TESTING STANDARD_VECTOR_SIZE=%d" % (vector_size,))
replace_in_file('src/include/duckdb/common/constants.hpp', '#define STANDARD_VECTOR_SIZE \d+', '#define STANDARD_VECTOR_SIZE %d' % (vector_size,))
replace_in_file('src/include/duckdb/common/vector_size.hpp', '#define STANDARD_VECTOR_SIZE \d+', '#define STANDARD_VECTOR_SIZE %d' % (vector_size,))
execute_system_command('rm -rf build')
execute_system_command('mkdir -p build/release')
os.chdir(build_dir)
Expand Down
2 changes: 1 addition & 1 deletion src/catalog/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
add_subdirectory(catalog_entry)

add_library(duckdb_catalog
add_library_unity(duckdb_catalog
OBJECT
catalog_entry.cpp
catalog.cpp
Expand Down
11 changes: 7 additions & 4 deletions src/catalog/catalog.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "duckdb/catalog/catalog.hpp"
#include "duckdb/catalog/catalog_set.hpp"

#include "duckdb/catalog/catalog_entry/list.hpp"
#include "duckdb/common/exception.hpp"
Expand All @@ -17,12 +18,14 @@
#include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
#include "duckdb/storage/storage_manager.hpp"
#include "duckdb/main/database.hpp"
#include "duckdb/catalog/dependency_manager.hpp"

using namespace duckdb;
using namespace std;

Catalog::Catalog(StorageManager &storage) : storage(storage), schemas(*this), dependency_manager(*this) {
Catalog::Catalog(StorageManager &storage) : storage(storage), schemas(make_unique<CatalogSet>(*this)), dependency_manager(make_unique<DependencyManager>(*this)) {
}
Catalog::~Catalog(){}

Catalog &Catalog::GetCatalog(ClientContext &context) {
return context.catalog;
Expand Down Expand Up @@ -69,7 +72,7 @@ CatalogEntry *Catalog::CreateSchema(ClientContext &context, CreateSchemaInfo *in
unordered_set<CatalogEntry *> dependencies;
auto entry = make_unique<SchemaCatalogEntry>(this, info->schema);
auto result = entry.get();
if (!schemas.CreateEntry(context.ActiveTransaction(), info->schema, move(entry), dependencies)) {
if (!schemas->CreateEntry(context.ActiveTransaction(), info->schema, move(entry), dependencies)) {
if (info->on_conflict == OnCreateConflict::ERROR) {
throw CatalogException("Schema with name %s already exists!", info->schema.c_str());
} else {
Expand All @@ -89,7 +92,7 @@ void Catalog::DropSchema(ClientContext &context, DropInfo *info) {
info->name.c_str());
}

if (!schemas.DropEntry(context.ActiveTransaction(), info->name, info->cascade)) {
if (!schemas->DropEntry(context.ActiveTransaction(), info->name, info->cascade)) {
if (!info->if_exists) {
throw CatalogException("Schema with name \"%s\" does not exist!", info->name.c_str());
}
Expand Down Expand Up @@ -118,7 +121,7 @@ SchemaCatalogEntry *Catalog::GetSchema(ClientContext &context, const string &sch
if (schema_name == TEMP_SCHEMA) {
return context.temporary_objects.get();
}
auto entry = schemas.GetEntry(context.ActiveTransaction(), schema_name);
auto entry = schemas->GetEntry(context.ActiveTransaction(), schema_name);
if (!entry) {
throw CatalogException("Schema with name %s does not exist!", schema_name.c_str());
}
Expand Down
2 changes: 1 addition & 1 deletion src/catalog/catalog_entry/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
add_library(duckdb_catalog_entries
add_library_unity(duckdb_catalog_entries
OBJECT
index_catalog_entry.cpp
schema_catalog_entry.cpp
Expand Down
Loading

0 comments on commit 981e7cd

Please sign in to comment.