DuckDB treats column names case-insensitively, so "Value" and "VALUE" are considered the same name. When both appear in one table, the second one ("VALUE") is renamed to "VALUE_1".

An example of this is productId 38100105.

In [1]:
from collections import Counter
import glob
import pprint

import duckdb
import pyarrow.parquet as pq

In [2]:
# Connection opened with default settings (no database path given).
con = duckdb.connect()

# Query the Parquet file for product 38100105 directly by path and
# fetch the full result as a DataFrame.
issue = con.execute(
    "SELECT * FROM '/data/tables/output/en/june_20_2025/38100105.parquet'"
).df()

# Show the last rows; the header reveals the renamed "VALUE_1" column.
issue.tail()

Unnamed: 0,REF_DATE,REF_START_DATE,REF_END_DATE,GEO,DGUID,Value,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE_1,STATUS,SYMBOL,TERMINATED,DECIMALS
753,2010,2010-01-01,2010-12-31,Canada,,"Present value calculation, timber stocks, meth...",Dollars,81,millions,6,v3822241,1.2,124971.2,,,,1
754,2011,2011-01-01,2011-12-31,Canada,,"Present value calculation, timber stocks, meth...",Dollars,81,millions,6,v3822240,1.1,120498.5,,,,1
755,2011,2011-01-01,2011-12-31,Canada,,"Present value calculation, timber stocks, meth...",Dollars,81,millions,6,v3822241,1.2,120498.5,,,,1
756,2012,2012-01-01,2012-12-31,Canada,,"Present value calculation, timber stocks, meth...",Dollars,81,millions,6,v3822240,1.1,113132.6,,,,1
757,2012,2012-01-01,2012-12-31,Canada,,"Present value calculation, timber stocks, meth...",Dollars,81,millions,6,v3822241,1.2,113132.6,,,,1


In [3]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# every later cell iterates and prints the files in a reproducible order.
files = sorted(glob.glob("/data/tables/output/en/june_20_2025/*.parquet"))

In [4]:
# Maps product id -> sorted list of upper-cased column names that occur more
# than once in that file once letter case is ignored.
duplicate_column_names = {}
for file in files:
    # Open the Parquet file metadata
    dataset = pq.ParquetFile(file)
    # Upper-case every name so columns differing only by case collide,
    # then count each name in a single pass.
    name_counts = Counter(name.upper() for name in dataset.schema.names)
    repeated_names = sorted(name for name, count in name_counts.items() if count > 1)
    if repeated_names:
        # The product id is the file name without the ".parquet" suffix.
        product_id = file.split('/')[-1].split('.parquet')[0]
        duplicate_column_names[product_id] = repeated_names

pprint.pprint(duplicate_column_names)

{'10100164': ['VALUE'],
 '13100904': ['STATUS'],
 '23100049': ['VALUE'],
 '23100050': ['VALUE'],
 '23100268': ['STATUS'],
 '36100374': ['VALUE'],
 '36100396': ['VALUE'],
 '36100397': ['VALUE'],
 '36100658': ['VALUE'],
 '38100104': ['VALUE'],
 '38100105': ['VALUE']}


In [5]:
# Report every file that has at least one column name containing a '.'.
for file in files:
    # Open the Parquet file metadata
    # (was `pq.VALUEParquetFile`, which does not exist and raised AttributeError)
    dataset = pq.ParquetFile(file)
    # Get the column names, keeping their original case
    column_names = dataset.schema.names
    # True as soon as any column name contains a dot
    has_dot = any('.' in name for name in column_names)
    if has_dot:
        print(file)
        print(column_names)

/data/tables/output/en/june_20_2025/11100235.parquet
['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.1', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']
/data/tables/output/en/june_20_2025/11100236.parquet
['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.1', 'North American Industry Classification System (NAICS)', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']
/data/tables/output/en/june_20_2025/13100902.parquet
['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'Age group in 2019', 'Gender in 2019', 'Status.1', 'Domains', 'Characteristics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'T