# File Type Analysis
In our analysis, we focus only on files with `.pdf` and `.docx` extensions. This decision is based on the file type distribution observed in the `../hngr-isps` folder, where `.pdf` and `.docx` files make up the vast majority of the dataset (249 and 76 files, respectively). Other file types, such as `.html`, `.mp4`, and `.zip`, occur far less frequently (1, 1, and 2 files, respectively) and are therefore excluded from our main analysis to streamline processing and to concentrate on the most relevant documents.

In [6]:
import os 
from collections import defaultdict

In [9]:
import os
from collections import defaultdict

def get_unique_file_types(folder_path):
    """Return the set of lower-cased file extensions found under ``folder_path``.

    The directory tree is walked recursively with ``os.walk``. Extensions
    include the leading dot (as produced by ``os.path.splitext``); files for
    which ``splitext`` yields an empty extension are ignored.
    """
    found = set()
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        suffixes = (os.path.splitext(name)[1] for name in filenames)
        # Skip empty suffixes so extension-less files do not add "" to the set.
        found.update(suffix.lower() for suffix in suffixes if suffix)
    return found

def get_file_type_counts(folder_path):
    """Count files per lower-cased extension under ``folder_path``.

    Walks the directory tree recursively and returns a ``defaultdict(int)``
    mapping each extension (with its leading dot) to the number of files
    carrying it. Files for which ``os.path.splitext`` yields an empty
    extension are not counted.
    """
    tally = defaultdict(int)
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        for name in filenames:
            suffix = os.path.splitext(name)[1]
            if not suffix:
                # Nothing to count for extension-less files.
                continue
            tally[suffix.lower()] += 1
    return tally

def main(folder_path):
    """Print the file extensions found under ``folder_path`` and their counts.

    Args:
        folder_path: Root directory to scan recursively.

    Two sections are written to standard output: the sorted list of unique
    extensions, followed by each extension with the number of files that
    carry it.
    """
    # Walk the tree only once: the unique extensions are exactly the keys of
    # the count mapping, so a second traversal would repeat the same work and
    # could disagree with the counts if the folder changed between the walks.
    file_type_counts = get_file_type_counts(folder_path)
    extensions = sorted(file_type_counts)

    print("Unique file types found:")
    for file_type in extensions:
        print(f"  {file_type}")

    print("\nFile type counts:")
    for file_type in extensions:
        print(f"  {file_type}: {file_type_counts[file_type]}")

# Report on the dataset folder; the relative path is resolved against the
# current working directory of the running kernel/process.
main("../hngr-isps")

Unique file types found:
  .docx
  .html
  .mp4
  .pdf
  .zip

File type counts:
  .docx: 76
  .html: 1
  .mp4: 1
  .pdf: 249
  .zip: 2
