# Check File Type
- Files downloaded from the database at <https://academictorrents.com/details/1614740ac8c94505e4ecb9d88be8bed7b6afddd4/tech&filelist=1> often have no file extension. They can be in either JSON or ZST format.
- Use this script to determine the file type of the raw data (it recognises ZST, gzip, and JSON).

- Then see either `./scripts/zst_to_csv.py` or `./scripts/json_to_csv.py` to convert to CSV format.

In [2]:
# check_file_type.py
# This script inspects a file to determine its type.
# It first checks for the zstandard (zst) magic header and gzip header.
# If those don't match, it attempts to read the file as text and checks
# whether the first non-whitespace character indicates JSON (either '{' or '[').
#
# Usage:
#   python check_file_type.py

import os

# Update this path to the file you want to check
file_path = r"..\data\raw\AITAH_comments"

# Magic numbers that open the compressed formats this script recognises.
ZSTD_MAGIC = b'\x28\xb5\x2f\xfd'
GZIP_MAGIC = b'\x1f\x8b'

# Message printed for each value returned by detect_file_type().
TYPE_MESSAGES = {
    "zst": "This file appears to be a valid zst file (zstandard compressed).",
    "gzip": "This file appears to be a valid gzip file.",
    "json": "This file appears to be a JSON file.",
    "unknown": "This file does not appear to be a zst, gzip, or JSON file.",
}


def read_header(path, size=4) -> bytes:
    """Return the first *size* bytes of *path* (fewer if the file is shorter)."""
    with open(path, 'rb') as f:
        return f.read(size)


def detect_file_type(path, header=None) -> str:
    """Classify *path* as ``"zst"``, ``"gzip"``, ``"json"`` or ``"unknown"``.

    The binary signatures are checked first. If neither matches, the start
    of the file is decoded as UTF-8 text and treated as JSON when its first
    non-whitespace character is ``{`` or ``[``.

    Args:
        path: Location of the file to inspect.
        header: The file's leading bytes, if the caller has already read
            them; when omitted they are read from *path*.

    Raises:
        OSError: The file could not be opened or read.
        UnicodeError: The file matched no binary signature and its leading
            content is not valid UTF-8.
    """
    if header is None:
        header = read_header(path)

    if header == ZSTD_MAGIC:
        return "zst"
    if header.startswith(GZIP_MAGIC):
        return "gzip"

    # 'utf-8-sig' decodes plain UTF-8 as well, but drops a leading byte-order
    # mark. With plain 'utf-8' the BOM stays in the text, lstrip() does not
    # remove it, and a BOM-prefixed JSON file is reported as "not JSON".
    with open(path, 'r', encoding='utf-8-sig') as f:
        # Only a small prefix is needed; strip leading whitespace from it.
        initial_text = f.read(1024).lstrip()

    if initial_text.startswith(('{', '[')):
        return "json"
    return "unknown"


def report_file_type(path) -> None:
    """Print the header bytes of *path* followed by the detected file type."""
    header = read_header(path)
    print("Header bytes:", header)

    try:
        file_type = detect_file_type(path, header)
    except (OSError, UnicodeError) as e:
        # The text probe failed: most likely binary data in a format that
        # none of the signature checks recognise.
        print("Could not read file as text. It may be a binary file of an unknown format.")
        print("Error:", e)
    else:
        print(TYPE_MESSAGES[file_type])


# True both for `python check_file_type.py` and when run as a notebook cell;
# the guard only keeps the check from firing when the module is imported.
if __name__ == "__main__":
    report_file_type(file_path)


Header bytes: b'{"al'
This file appears to be a JSON file.
