Join all the different datasets into one big dataset and save it into the desired folder

In [1]:
import pandas as pd
import geopandas as gpd
import os
import glob

In [3]:
environmental_path_root_folder = "/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters"

## Craw into every folder and look for .csv files
# This will collect full paths to every .csv under the root folder
all_csvs = []
train_files = []
test_files = []

for dirpath, dirnames, filenames in os.walk(environmental_path_root_folder):
    # iterate files in the current directory
    for fname in filenames:
        if not fname.lower().endswith('.csv'):
            continue
        fullpath = os.path.join(dirpath, fname)
        all_csvs.append(fullpath)
        name_lower = fname.lower()
        # classify by filename first, then by parent folder name as fallback
        if 'train' in name_lower:
            train_files.append(fullpath)
        elif 'test' in name_lower:
            test_files.append(fullpath)
        else:
            parent = os.path.basename(dirpath).lower()
            if 'train' in parent:
                train_files.append(fullpath)
            elif 'test' in parent:
                test_files.append(fullpath)

# quick sanity: remove duplicates (if any) while preserving order
def dedupe_keep_order(seq):
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

all_csvs = dedupe_keep_order(all_csvs)
train_files = dedupe_keep_order(train_files)
test_files = dedupe_keep_order(test_files)

print(f'Found {len(all_csvs)} CSV files total: {len(train_files)} train, {len(test_files)} test')

Found 12 CSV files total: 6 train, 6 test


In [4]:
## Checking if all the files have SurveyID as its column.
for file in train_files:
    df = pd.read_csv(file,nrows=0)
    #print(df.columns)
    if "surveyId" in df.columns:
        print(f"The file {os.path.basename(file)} is okay")
    else:
        print(f"NOT OKAY: {os.path.basename(file)}")
        
## Test
for file in test_files:
    df = pd.read_csv(file,nrows=0)
    #print(df.columns)
    if "surveyId" in df.columns:
        print(f"The file {os.path.basename(file)} is okay")

The file GLC24-PA-train-bioclimatic-average.csv is okay
The file GLC24-PA-train-bioclimatic-monthly.csv is okay
The file GLC24-PA-train-elevation.csv is okay
The file GLC24-PA-train-human-footprint.csv is okay
The file GLC24-PA-train-landcover.csv is okay
The file GLC24-PA-train-soilgrids.csv is okay
The file GLC24-PA-test-bioclimatic-average.csv is okay
The file GLC24-PA-test-bioclimatic-monthly.csv is okay
The file GLC24-PA-test-elevation.csv is okay
The file GLC24-PA-test-human-footprint.csv is okay
The file GLC24-PA-test-landcover.csv is okay
The file GLC24-PA-test-soilgrids.csv is okay


In [None]:
## Pop the monthly bioclimatic cause its too big
train_files.pop(1)
test_files.pop(1)

'/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters/Climate/GLC24-PA-test-bioclimatic-monthly.csv'

In [24]:
test_files

['/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters/Climate/GLC24-PA-test-bioclimatic-average.csv',
 '/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters/Elevation/GLC24-PA-test-elevation.csv',
 '/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters/HumanFootprint/GLC24-PA-test-human-footprint.csv',
 '/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters/LandCover/GLC24-PA-test-landcover.csv',
 '/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output/EnvironmentalRasters/Soilgrids/GLC24-PA-test-soilgrids.csv']

###  Train

In [21]:
out_folder = "/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output"
out_file = "train_join.parquet"

result = None

for file_path in train_files:
    # get basename 
    basename = os.path.splitext(os.path.basename(file_path))[0].split('-')[-1]
    df = pd.read_csv(file_path)
    
    # Set surveyID as index temporarily
    df.set_index('surveyId', inplace=True)
    
    # Create MultiIndex columns (basename, column_name)
    df.columns = pd.MultiIndex.from_product([[basename], df.columns])
    
    # Concatenate incrementally
    if result is None:
        result = df
        print(f"Shape: {result.shape}")
    else:
        result = pd.concat([result, df], axis=1, join='outer')
        print(f"Shape:{result.shape}")
    
    # Clean up to free memory
    del df

# Reset index to bring surveyID back as a column
result = result.reset_index()

print(f"\nShape: {result.shape}")

# Flatten the MultiIndex columns
if isinstance(result.columns, pd.MultiIndex):
    result.columns = ['_'.join(col).strip() if col[1] else col[0] 
                      for col in result.columns.values]

Shape: (88987, 19)
Shape:(88987, 20)
Shape:(88987, 36)
Shape:(88987, 37)
Shape:(88987, 46)

Shape: (88987, 47)


In [22]:
## Save as a parquet file
result.to_parquet(
    os.path.join(out_folder, out_file), 
    engine='fastparquet'
)

### Test

In [25]:
out_folder = "/mnt/d/desktop/COPERNICUS/Classes/3-semester/ml/species/output"
out_file = "test_join.parquet"

result = None

for file_path in test_files:
    # get basename 
    basename = os.path.splitext(os.path.basename(file_path))[0].split('-')[-1]
    df = pd.read_csv(file_path)
    
    # Set surveyID as index temporarily
    df.set_index('surveyId', inplace=True)
    
    # Create MultiIndex columns (basename, column_name)
    df.columns = pd.MultiIndex.from_product([[basename], df.columns])
    
    # Concatenate incrementally
    if result is None:
        result = df
        print(f"Shape: {result.shape}")
    else:
        result = pd.concat([result, df], axis=1, join='outer')
        print(f"Shape:{result.shape}")
    
    # Clean up to free memory
    del df

# Reset index to bring surveyID back as a column
result = result.reset_index()

print(f"\nShape: {result.shape}")

# Flatten the MultiIndex columns
if isinstance(result.columns, pd.MultiIndex):
    result.columns = ['_'.join(col).strip() if col[1] else col[0] 
                      for col in result.columns.values]

Shape: (4716, 19)
Shape:(4716, 20)
Shape:(4716, 36)
Shape:(4716, 37)
Shape:(4716, 46)

Shape: (4716, 47)


In [26]:
## Save as a parquet file
result.to_parquet(
    os.path.join(out_folder, out_file), 
    engine='fastparquet'
)