##### This code extracts data from CSV files and stores it in the Parquet file format, which makes the data easier to load and retrieve in the future.

In [None]:
import os
import pandas as pd

In [None]:
def read_files(directory_path, desired_police_stations):
    """
    Read the CSV files of the desired police stations into one dataframe.

    The police station is derived from each file name: the name is
    lower-cased, hyphens are removed, and the first 6 and last 10 characters
    are dropped. (Presumably file names look like
    ``YYYY-MM-<station>-street.csv`` -- confirm against the data directory.)

    Args:
        directory_path (str): Path to the directory containing the CSV files.
        desired_police_stations (list): List of desired police station names.
            Matching ignores case and hyphens.

    Returns:
        pd.DataFrame: Combined dataframe containing the rows of every matching
            CSV file, concatenated in sorted file-name order with a fresh
            integer index.

    Raises:
        ValueError: If no CSV file in the directory matches any of the
            desired police stations.
    """
    # Normalise the requested names the same way the file names are
    # normalised below; a set makes the membership test cheap.
    wanted_stations = {
        station.lower().replace('-', '') for station in desired_police_stations
    }

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined dataframe reproducible across runs.
    file_names = sorted(
        f for f in os.listdir(directory_path) if f.endswith('.csv')
    )

    dfs = []
    for file_name in file_names:
        # Strip hyphens, then drop the 6-character prefix and the
        # 10-character suffix to isolate the police station name.
        police_station = file_name.lower().replace('-', '')[6:-10]
        if police_station not in wanted_stations:
            continue

        print(f'Reading file: {file_name}')
        dfs.append(pd.read_csv(os.path.join(directory_path, file_name)))

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list; fail with a message that says what was actually looked for.
    if not dfs:
        raise ValueError(
            f'No CSV files in {directory_path!r} match the desired police '
            f'stations: {list(desired_police_stations)!r}'
        )

    return pd.concat(dfs, ignore_index=True)


# Directory that holds the CSV files to load
directory_path = 'Data/street'

# Police stations whose files should be read
police_stations = ['metropolitan']

# Load every matching CSV file into a single combined dataframe
combined_data = read_files(
    directory_path=directory_path,
    desired_police_stations=police_stations,
)

In [None]:
# Preview the first five rows of the combined dataframe
combined_data.head(n=5)

In [None]:
# Keep only the rows whose 'Crime type' is 'Burglary'; copy so that later
# changes to burglary_df do not touch combined_data
burglary_df = combined_data.loc[combined_data['Crime type'].eq('Burglary')].copy()

In [None]:
# Write the burglary rows to a Parquet file in the working directory
burglary_df.to_parquet(path='burglary.parquet')