In [1]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [2]:
def combine_parquet(folder_path, output_file_path, output_file_name):
    folder_path = Path(folder_path)
    output_file_path = Path(output_file_path)

    folder_name = folder_path.name

    # Get a list of all folders in the main folder
    subfolders = [f for f in folder_path.iterdir() if f.is_dir()]

    # Initialize lists to store folder information
    folders_with_files = []
    folders_without_files = []

    # Initialize a list to store the DataFrames
    dfs = []

    # Iterate over each subfolder
    for subfolder in subfolders:
        # Get a list of all Parquet files in the subfolder
        parquet_files = list(subfolder.glob("*.parquet"))

        if parquet_files:
            # Subfolder contains parquet files
            folders_with_files.append(subfolder.name)
            for file in parquet_files:
                # Append the DataFrame to the list
                df = pd.read_parquet(file)
                df["filename"] = file.stem
                df["filepath"] = str(file)

                # Convert the 'quotedTweet' column to nullable integer type
                df["quotedTweet"] = (
                    df["quotedTweet"]
                    .apply(lambda x: x if isinstance(x, int) else pd.NA)
                    .astype(pd.Int64Dtype())
                )
                df["inReplyToUser"] = (
                    df["inReplyToUser"]
                    .apply(lambda x: x if isinstance(x, int) else pd.NA)
                    .astype(pd.Int64Dtype())
                )
                dfs.append(df)

        else:
            # Empty subfolder
            folders_without_files.append(subfolder.name)

    if dfs:
        # Concatenate all DataFrames into a single DataFrame
        combined_data = pd.concat(dfs, ignore_index=True)

        # Write the combined data to a Parquet file
        combined_data.to_parquet(output_file_path / output_file_name, index=False)

    else:
        # No parquet files found
        return None

    # Create folder information list
    folder_info = []
    for folder in folders_with_files:
        num_files = sum(folder_name in str(file) for file in parquet_files)
        folder_info.append([folder_name, folder, num_files])

    # Save folder information as a CSV file
    folder_info_df = pd.DataFrame(folder_info, columns=["Main Folder", "Subfolder", "File Count"])
    folder_info_df.to_csv(output_file_path / "folder_info.csv", index=False)

    return folder_info

In [3]:
# Define the folder path where the Parquet files are located
folder_path = "/Users/fahad/Desktop/Data/More data"

# Define the output file path for the combined Parquet file
output_file_path = "/Users/fahad/Desktop/processed_data"

# Define the output file name for the combined Parquet file
output_file_name = "combined_data9.parquet"

In [4]:
folder = combine_parquet(folder_path, output_file_path, output_file_name)

IsADirectoryError: [Errno 21] Failed to open local file '/Users/fahad/Desktop/processed_data/combined_data9.parquet'. Detail: [errno 21] Is a directory

In [None]:
import os

parent_folder = '/Users/fahad/Desktop/Data/More data'
folders_without_files = []
folders_with_files = {}

# Loop through subfolders
for folder_name in os.listdir(parent_folder):
    folder_path = os.path.join(parent_folder, folder_name)

    # Check if it's a directory
    if os.path.isdir(folder_path):
        files = os.listdir(folder_path)

        # Check if there are any files in the folder
        if len(files) == 0:
            folders_without_files.append(folder_name)
        else:
            folders_with_files[folder_name] = {
                'file_count': len(files),
                'files': files
            }



In [2]:
# Save folders without files to a text file
with open('folders_without_files.txt', 'w') as file:
    file.write("Folders without files:\n")
    for folder_name in folders_without_files:
        file.write(folder_name + '\n')

# Save folders with files to a text file
with open('folders_with_files.txt', 'w') as file:
    file.write("Folders with files:\n")
    for folder_name, folder_info in folders_with_files.items():
        file.write(f"{folder_name} ({folder_info['file_count']} files):\n")
        for file_name in folder_info['files']:
            file.write(file_name + '\n')

In [None]:
import os
import pandas as pd

parent_folder = 'More data'
output_file = 'combined.parquet'

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()

# Loop through the subfolders
for root, dirs, files in os.walk(parent_folder):
    for file in files:
        if file.endswith('.parquet'):
            file_path = os.path.join(root, file)
            
            # Read the parquet file
            df = pd.read_parquet(file_path)
            
            # Add the file path as a column
            df['FilePath'] = file_path
            
            # Append the data to the combined DataFrame
            combined_data = combined_data.append(df)

# Save the combined data as a parquet file
combined_data.to_parquet(output_file, index=False)

print(f"Combined parquet file saved as '{output_file}'.")
