In [None]:
import pandas as pd
import glob

In [None]:
# Set the path to the directory where your CSV files are located
csv_directory = "./Data/CSV/"

# Get a list of all CSV files in the directory.
# glob does not guarantee any particular ordering, so the paths are sorted to
# make the merge order (and which file counts as "first") reproducible.
csv_files = sorted(glob.glob(csv_directory + "*.csv"))

In [None]:
# Merge CSVs that have no column headers, in the order given by csv_files
# Create an empty list to store the individual DataFrames
dataframes = []

# Define the column names applied to every file (the files carry no header row)
column_names = [
    "Postcode",
    "Positional_quality_indicator",
    "Eastings",
    "Northings",
    "Country_code",
    "NHS_regional_HA_code",
    "NHS_HA_code",
    "Admin_county_code",
    "Admin_district_code",
    "Admin_ward_code"
]

# Read each CSV file and append its DataFrame to the list.
# Passing names= makes pandas treat the first line as data, not as a header.
for csv_file in csv_files:
    df = pd.read_csv(csv_file, names=column_names)
    dataframes.append(df)

if dataframes:
    # Merge all DataFrames into a single DataFrame with a fresh 0..n-1 index
    merged_df = pd.concat(dataframes, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list, so report the situation
    # and fall back to an empty frame that still has the expected columns.
    print("No CSV files found in the directory.")
    merged_df = pd.DataFrame(columns=column_names)

# Print the merged DataFrame
print(merged_df)

In [None]:
# Use the first CSV's header as the initial reference for column names
# Ensure the CSV files list is not empty
if csv_files:
    # Use the first CSV file in the list as the reference
    first_csv_file = csv_files[0]

    # Only the header row is needed to detect the column names, so nrows=0
    # avoids parsing the whole reference file just to read its columns.
    reference_columns = pd.read_csv(first_csv_file, nrows=0).columns.tolist()

    # Read every CSV (the reference file included) and align it to the
    # reference column order: reindex drops columns that are not in the
    # reference and adds reference columns the file lacks as missing values.
    dataframes = []
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        df = df.reindex(columns=reference_columns, fill_value=None)
        dataframes.append(df)

    # Stack all aligned frames and renumber the rows 0..n-1
    merged_df = pd.concat(dataframes, ignore_index=True)
    print(merged_df)
else:
    print("No CSV files found in the directory.")

In [None]:
# Handling columns that are not in the same order in each file:
# Create an empty list to store the individual DataFrames
dataframes = []

# Find common columns among all CSV files.
# Only the header row is read here (nrows=0); the data is loaded in the
# second pass, so each file's contents are parsed once instead of twice.
common_columns = None
for csv_file in csv_files:
    header = pd.read_csv(csv_file, nrows=0).columns
    if common_columns is None:
        common_columns = header
    else:
        common_columns = common_columns.intersection(header)

if csv_files:
    # Read each CSV file, filter columns, and append DataFrame
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        # Selecting by common_columns both drops file-specific columns and
        # puts the shared columns in one consistent order.
        df = df[common_columns]
        dataframes.append(df)

    # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
else:
    # With no files, common_columns is still None and pd.concat would raise
    # ValueError on the empty list; report it and use an empty frame instead.
    print("No CSV files found in the directory.")
    merged_df = pd.DataFrame()

# Print the merged DataFrame
print(merged_df)

In [None]:
# Handling extra and missing columns:
# Create an empty list to store the individual DataFrames
dataframes = []

# Dictionary to store the set of column names for each CSV file
csv_columns = {}

# Find the columns common to all CSV files, and the union of every column seen.
# Only the header row is read here (nrows=0); the data is loaded in the
# second pass.
common_columns = None
all_columns = set()
for csv_file in csv_files:
    header = pd.read_csv(csv_file, nrows=0).columns
    csv_columns[csv_file] = set(header)  # Store columns for each CSV
    all_columns |= csv_columns[csv_file]
    if common_columns is None:
        common_columns = header
    else:
        common_columns = common_columns.intersection(header)

if csv_files:
    # common_columns is a pandas Index, on which `-` is arithmetic rather than
    # set difference; convert to a plain set before doing set operations.
    common_set = set(common_columns)

    # Read each CSV file, report its column differences, filter, and append
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        # Missing: columns that appear in at least one file but not in this
        # one. (Measured against the common columns this would always be
        # empty, since every file contains the common columns by definition.)
        missing_columns = sorted(all_columns - csv_columns[csv_file])
        # Extra: columns this file has beyond the common set; these are the
        # ones dropped by the filter below.
        extra_columns = sorted(csv_columns[csv_file] - common_set)
        print(f"CSV File: {csv_file}")
        print(f"Missing Columns: {missing_columns}")
        print(f"Extra Columns: {extra_columns}")

        df = df[common_columns]  # Filter columns to include only common columns
        dataframes.append(df)

    # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
else:
    # With no files, common_columns is still None and pd.concat would raise
    # ValueError on the empty list; report it and use an empty frame instead.
    print("No CSV files found in the directory.")
    merged_df = pd.DataFrame()

# Print the merged DataFrame
print(merged_df)

# JSON Merge

In [None]:
import pandas as pd
import glob, os, json


# Directory containing the JSON files to merge.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable before sharing the notebook.
json_dir = '/home/lohitd@nextbillion.ai/Downloads/sorted-melbourne-pizza/sorted/6-false_positive'

# Collect every *.json file in the directory. glob does not guarantee any
# ordering, so the paths are sorted to keep the merged row order reproducible.
json_pattern = os.path.join(json_dir, '*.json')
file_list = sorted(glob.glob(json_pattern))

# Flatten each JSON document into a DataFrame (nested keys become columns)
dfs = []
for file in file_list:
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale-dependent default encoding.
    with open(file, encoding="utf-8") as f:
        json_data = pd.json_normalize(json.load(f))
    dfs.append(json_data)

if dfs:
    # Row labels from each per-file frame are kept as-is (no ignore_index),
    # so labels may repeat across files in the combined frame.
    df = pd.concat(dfs)
else:
    # pd.concat raises ValueError on an empty list; report the situation and
    # fall back to an empty frame.
    print("No JSON files found in the directory.")
    df = pd.DataFrame()