In [1]:
import pandas as pd
import numpy as np

In [2]:
#Import the consolidated data set
file_path = "data\\2_staging\\stg_texas_border_report.csv"

border_report = pd.read_csv(
    file_path, 
    parse_dates=['nibrs_start_date', 'report_year'], 
    date_format='%Y-%m-%d')

border_report.head()

Unnamed: 0,agency_name,county,population,nibrs_start_date,murder_and_nonnegligent_manslaughter,negligent_manslaughter,rape,robbery,assault,burglary,larceny_theft,motor_vehicle_theft,arson,human_trafficking_commercial_sex_acts,human_trafficking_involuntary_servitude,months_reported,report_year
0,ALAMO PD,Hidalgo County,19351,2018-08-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
1,ALPINE PD,Brewster County,5983,2020-04-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
2,ALTON PD,Hidalgo County,17432,2019-09-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
3,ANTHONY PD,El Paso County,5681,2020-12-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
4,BREWSTER CO SO,Brewster County,3196,2015-01-01,1,0,2,0,19,4,9,3,1,0,0,11,2017-12-31


In [3]:
#Create column to identify the different agency types within the report
border_report['agency_type'] = ''

# Define conditions for different agency types
police_condition = border_report['agency_name'].str.contains('PD', case=False)
sheriff_condition = border_report['agency_name'].str.contains('SO', case=False)
constable_condition = border_report['agency_name'].str.contains('CONSTABLE', case=False)
marshal_condition = border_report['agency_name'].str.contains('MARSHAL', case=False)
school_condition = border_report['agency_name'].str.contains('ISD', case=False)
university_condition = border_report['agency_name'].str.contains('UNIV|COLLEGE|TSTC', case=False)


# Assign the agency type based on conditions
border_report.loc[police_condition, 'agency_type'] = 'Police Department'
border_report.loc[sheriff_condition, 'agency_type'] = "Sheriff's Office"
border_report.loc[constable_condition, 'agency_type'] = 'Constable'
border_report.loc[marshal_condition, 'agency_type'] = 'Marshal'
border_report.loc[school_condition, 'agency_type'] = 'School Police'
border_report.loc[university_condition, 'agency_type'] = 'University Police'


In [4]:
#Use the newly created column to count the number of agencies for each agency type
agency_names_and_types = border_report[['agency_name', 'agency_type']].drop_duplicates()

# Group by agency_type and count the number of occurrences of agency_name within each group
counts_per_type = agency_names_and_types.groupby('agency_type').size()

# Calculate percentages
total_agencies = len(agency_names_and_types)
percentages = round((counts_per_type / total_agencies) * 100, 2)

# Create a DataFrame to store counts and percentages
result_agency_names_and_types = pd.DataFrame({
    'Count of Agencies': counts_per_type,
    'Percentage': percentages
}).reset_index()

# Reset index to make agency_type a column
result_agency_names_and_types = result_agency_names_and_types.sort_values(by='Count of Agencies', ascending=False).reset_index(drop=True)

# Rename the column 'agency_type' to 'Agency Type'
result_agency_names_and_types = result_agency_names_and_types.rename(columns={'agency_type': 'Agency Type'})


#Code to generate markdown table
print('There are {} agencies in the dataset.'.format(total_agencies))
print(result_agency_names_and_types.to_markdown(index=False))

There are 85 agencies in the dataset.
| Agency Type       |   Count of Agencies |   Percentage |
|:------------------|--------------------:|-------------:|
| Police Department |                  44 |        51.76 |
| Sheriff's Office  |                  17 |        20    |
| School Police     |                  11 |        12.94 |
| Constable         |                   6 |         7.06 |
| University Police |                   6 |         7.06 |
| Marshal           |                   1 |         1.18 |


In [5]:
#Calculate how many months each agency should have reported data
def nibrs_eligible_months(row):
    # Directly use 'nibrs_start_date' as it's already a datetime object
    nibrs_start_date = row['nibrs_start_date']
    
    # Extract the year directly from 'report_year' (assuming it's already a datetime object)
    report_year = row['report_year'].year

    # Calculate the number of eligible months based on conditions
    if nibrs_start_date.year < report_year:
        return 12
    elif nibrs_start_date.year == report_year:
        return (12 - nibrs_start_date.month + 1)  # +1 to include the start month itself
    else:
        return 0

#Add nibrs_eligible_months column    
border_report['nibrs_eligible_months'] = border_report.apply(nibrs_eligible_months, axis=1)

#Add NIBRS preliminary reporting flag
border_report['has_preliminary_nibrs_reporting'] = border_report['nibrs_eligible_months'] < border_report['months_reported']

#Check NIBRS eligible months
print("Check NIBRS eligible months")
print(border_report['nibrs_eligible_months'].describe())

print()
#Check NIBRS preliminary reporting flag
print(border_report['has_preliminary_nibrs_reporting'].value_counts())

Check NIBRS eligible months
count    595.000000
mean       6.433613
std        5.831259
min        0.000000
25%        0.000000
50%       12.000000
75%       12.000000
max       12.000000
Name: nibrs_eligible_months, dtype: float64

has_preliminary_nibrs_reporting
False    571
True      24
Name: count, dtype: int64


In [6]:
# Filter the border_report DataFrame to include only the records with preliminary NIBRS reporting
preliminary_nibrs_reporting = border_report[border_report['has_preliminary_nibrs_reporting'] == True].reset_index(drop=True)

# Count the number of records with preliminary NIBRS reporting
preliminary_reporting_records = sum(preliminary_nibrs_reporting['has_preliminary_nibrs_reporting'])
print('Preliminary_reporting_records:', preliminary_reporting_records)

# Calculate the total number of records in the border_report DataFrame
number_of_records = len(border_report)
print('Total number of records:', number_of_records)

# Calculate the percentage of records with preliminary NIBRS reporting
percent_preliminary_reporting = (preliminary_reporting_records / number_of_records) * 100
print('Percent of records with preliminary reporting:', round(percent_preliminary_reporting, 2))

# Count the number of unique agencies with preliminary NIBRS reporting
agencies_with_preliminary_nibrs_reporting = len(preliminary_nibrs_reporting['agency_name'].unique())
print('Agencies with preliminary NIBRS reporting:', agencies_with_preliminary_nibrs_reporting)

# Count the total number of unique agencies in the border_report DataFrame
count_of_agencies = border_report['agency_name'].nunique()
print('Count of agencies:', count_of_agencies)

# Calculate the percentage of agencies with preliminary NIBRS reporting
percent_agencies_with_preliminary_nibrs_reporting = (agencies_with_preliminary_nibrs_reporting / count_of_agencies) * 100
print('Percent of agencies with preliminary NIBRS reporting:', percent_agencies_with_preliminary_nibrs_reporting)

Preliminary_reporting_records: 24
Total number of records: 595
Percent of records with preliminary reporting: 4.03
Agencies with preliminary NIBRS reporting: 17
Count of agencies: 85
Percent of agencies with preliminary NIBRS reporting: 20.0


In [7]:
# Define the function to calculate the NIBRS contribution percentage
def calculate_nibrs_contribution_percentage(row):
    # Normalizing the cases where agencies are reporting data earlier
    if row['nibrs_eligible_months'] < row['months_reported']:
        return 100  # If the number of eligible months is less than the number of months reported, the contribution is 100%
    # Calculate the contribution percentage based on the ratio of months reported to eligible months
    elif row['nibrs_eligible_months'] > 0:
        return round((row['months_reported'] / row['nibrs_eligible_months']) * 100, 2)
    # If there are no eligible months, the contribution is NaN. This filters out the preliminary reporting cases
    else:
        return np.nan

#Apply logic to calculate the NIBRS contribution percentage
border_report['nibrs_contribution_percentage'] = border_report.apply(calculate_nibrs_contribution_percentage, axis=1)

#Check the NIBRS contribution percentage
print("Check NIBRS contribution percentages")
border_report['nibrs_contribution_percentage'].describe()

Check NIBRS contribution percentages


count    358.000000
mean      81.949804
std       31.035957
min        0.000000
25%       76.250000
50%      100.000000
75%      100.000000
max      100.000000
Name: nibrs_contribution_percentage, dtype: float64

In [8]:
#Convert 'report_year' to to year
border_report['report_year'] = border_report['report_year'].dt.year

# Create 'cohort_year' column
border_report['cohort_year'] = border_report['nibrs_start_date'].dt.year

# Create 'cohort_quarter' column
border_report['cohort_quarter'] = border_report['nibrs_start_date'].dt.quarter

# Create 'cohort' column
border_report['cohort'] = border_report['cohort_year'].astype(str) + 'Q' + border_report['cohort_quarter'].astype(str)


In [9]:
# Define the conditions for NIBRS adoption status
conditions = [
    border_report['cohort_year'] < 2021,
    border_report['cohort_year'] == 2021,
    border_report['cohort_year'] > 2021,
]

# Define the values to assign under each condition for adoption status
adoption_status_choices = ['early', 'on time', 'late']

# Assign the adoption status based on conditions
border_report['adoption_status'] = np.select(conditions, adoption_status_choices, default='unknown')

# Define the values to assign under each condition for adoption status numeric
adoption_status_numeric_choices = [1, 2, 3]

# Assign the numeric adoption status based on conditions
border_report['adoption_status_numeric'] = np.select(conditions, adoption_status_numeric_choices, default=0)

border_report['adoption_status'].value_counts()

adoption_status
early      329
on time    196
late        70
Name: count, dtype: int64

In [10]:
nibrs_cohort_check = border_report[[
                                    'agency_name', 
                                    'report_year', 
                                    'nibrs_start_date',
                                    'adoption_status',
                                    'adoption_status_numeric', 
                                    'cohort',
                                    'nibrs_eligible_months', 
                                    'nibrs_contribution_percentage',
                                    'months_reported',
                                    'has_preliminary_nibrs_reporting'
                                ]]
nibrs_cohort_check.sort_values('nibrs_contribution_percentage', ascending=False).head()

Unnamed: 0,agency_name,report_year,nibrs_start_date,adoption_status,adoption_status_numeric,cohort,nibrs_eligible_months,nibrs_contribution_percentage,months_reported,has_preliminary_nibrs_reporting
409,SOCORRO PD,2021,2021-01-01,on time,2,2021Q1,12,100.0,12,False
426,ALPINE PD,2022,2020-04-01,early,1,2020Q2,12,100.0,12,False
431,BROWNSVILLE PD,2022,2018-03-01,early,1,2018Q1,12,100.0,12,False
434,CAMERON CO SO,2022,2021-01-01,on time,2,2021Q1,12,100.0,12,False
436,COMBES PD,2022,2019-10-01,early,1,2019Q4,12,100.0,12,False


In [11]:
#Store resulting dataframe into 2_staging
border_report.to_csv('data/3_intermediate/int_texas_border_report.csv', index=False)