In [1]:
import altair as alt
import pandas as pd

# ICPSR Data Processing

## Filtering and Exporting Hate Crime Records

In [None]:
import pandas as pd

# Where each survey year's raw TSV extract lives, keyed by year
year_paths = {
    2023: '../data/ICPSR_2023/DS0003/39270-0003-Data.tsv',
    2022: '../data/ICPSR_2022/DS0003/38925-0003-Data.tsv',
    2021: '../data/ICPSR_2021/DS0003/38807-0003-Data.tsv',
    2020: '../data/ICPSR_2020/DS0003/38566-0003-Data.tsv',
    2019: '../data/ICPSR_2019/DS0003/38565-0003-Data.tsv',
    2018: '../data/ICPSR_2018/DS0003/37649-0003-Data.tsv',
    2017: '../data/ICPSR_2017/DS0003/37650-0003-Data.tsv',
    2016: '../data/ICPSR_2016/DS0003/37066-0003-Data.tsv',
    2015: '../data/ICPSR_2015/DS0003/36851-0003-Data.tsv',
}

# Rows held in memory per read; the raw files are too large to load whole
chunk_size = 500_000

# For every year: stream the TSV, keep the rows of interest, export one CSV
for year, tsv_file in year_paths.items():
    print(f"Processing {year}...")

    # V20201 (bias motivation code) is forced to str so the comparison
    # below is a plain string match against "88"
    reader = pd.read_csv(
        tsv_file,
        sep='\t',
        chunksize=chunk_size,
        dtype={'V20201': str},
    )

    # Keep only rows whose bias motivation code differs from "88" ('None')
    filtered_chunks = [
        chunk.loc[chunk['V20201'].ne('88')]
        for chunk in reader
    ]

    # Stitch the surviving rows of this year into one frame with a fresh index
    df_filtered = pd.concat(filtered_chunks, ignore_index=True)

    # Write the year's result next to the notebook as "<year>-03.csv"
    df_filtered.to_csv(f'{year}-03.csv', index=False)
    print(f"{year} data saved to {year}-03.csv")

## Combining Data

In [None]:
# Per-year exports to read back, ordered from 2023 down to 2015
yearly_files = [f'{year}-03.csv' for year in reversed(range(2015, 2024))]

# One DataFrame per yearly CSV, in the same newest-to-oldest order
df_list = [pd.read_csv(csv_path) for csv_path in yearly_files]

# Stack the yearly frames into a single table with a continuous index
combined_df = pd.concat(df_list, ignore_index=True)

# Sanity check: first rows, then the (rows, columns) shape
print(combined_df.head(), combined_df.shape, sep='\n')

# Persist the multi-year table as one CSV
combined_df.to_csv('2015-2023_03.csv', index=False)