In [1]:
import pandas as pd

In [2]:
#Import the consolidated data set
file_path = "data\\2_staging\\stg_texas_border_report.csv"

border_report = pd.read_csv(
    file_path, 
    parse_dates=['nibrs_start_date', 'report_year'], 
    date_format='%Y-%m-%d')

border_report.head()

Unnamed: 0,agency_name,county,population,nibrs_start_date,murder_and_nonnegligent_manslaughter,negligent_manslaughter,rape,robbery,assault,burglary,larceny_theft,motor_vehicle_theft,arson,human_trafficking_commercial_sex_acts,human_trafficking_involuntary_servitude,months_reported,report_year
0,ALAMO PD,Hidalgo County,19351,2018-08-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
1,ALPINE PD,Brewster County,5983,2020-04-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
2,ALTON PD,Hidalgo County,17432,2019-09-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
3,ANTHONY PD,El Paso County,5681,2020-12-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
4,BREWSTER CO SO,Brewster County,3196,2015-01-01,1,0,2,0,19,4,9,3,1,0,0,11,2017-12-31


In [3]:
#Create column to identify the different agency types within the report
border_report['agency_type'] = ''

# Define conditions for different agency types
police_condition = border_report['agency_name'].str.contains('PD', case=False)
sheriff_condition = border_report['agency_name'].str.contains('SO', case=False)
constable_condition = border_report['agency_name'].str.contains('CONSTABLE', case=False)
marshal_condition = border_report['agency_name'].str.contains('MARSHAL', case=False)


# Assign the agency type based on conditions
border_report.loc[police_condition, 'agency_type'] = 'Police Department'
border_report.loc[sheriff_condition, 'agency_type'] = "Sheriff's Office"
border_report.loc[constable_condition, 'agency_type'] = 'Constable'
border_report.loc[marshal_condition, 'agency_type'] = 'Marshal'

In [4]:
#Use the newly created column to count the number of agencies for each agency type
agency_names_and_types = border_report[['agency_name', 'agency_type']].drop_duplicates()

# Group by agency_type and count the number of occurrences of agency_name within each group
counts_per_type = agency_names_and_types.groupby('agency_type').size()

# Calculate percentages
total_agencies = len(agency_names_and_types)
percentages = round((counts_per_type / total_agencies) * 100, 2)

# Create a DataFrame to store counts and percentages
result_agency_names_and_types = pd.DataFrame({
    'Count of Agencies': counts_per_type,
    'Percentage': percentages
}).reset_index()

# Reset index to make agency_type a column again
result_agency_names_and_types = result_agency_names_and_types.sort_values(by='Count of Agencies', ascending=False).reset_index(drop=True)

# Rename the column 'agency_type' to 'Agency Type'
result_agency_names_and_types = result_agency_names_and_types.rename(columns={'agency_type': 'Agency Type'})

result_agency_names_and_types

#Code to generate markdown table
print(result_agency_names_and_types.to_markdown(index=False))

| Agency Type       |   Count of Agencies |   Percentage |
|:------------------|--------------------:|-------------:|
| Police Department |                  57 |        67.06 |
| Sheriff's Office  |                  21 |        24.71 |
| Constable         |                   6 |         7.06 |
| Marshal           |                   1 |         1.18 |


In [19]:
#Create column to label the univerity police departments
border_report['university_pd'] = border_report['agency_name'].str.contains('UNIV|COLLEGE|TSTC', case=False, na=False)

# Filter the border_report DataFrame to include only the university police departments
university_pds = border_report[border_report['university_pd'] == True]['agency_name'].unique()

# Create a DataFrame called 'university_pds' to store the unique university police departments
university_pds = pd.DataFrame(university_pds, columns=['university_police_departments'])

num_univ_police_departments = university_pds.shape[0] - 1
print("There are", num_univ_police_departments,"university police departments included in the report.")

percent_of_univ_pds = round(num_univ_police_departments / total_agencies * 100, 2)
print("This represents", percent_of_univ_pds, "% of the total number of agencies.")

# Mapping of original values to new values
mapping = {
    'EL PASO COMM COLLEGE PD': 'El Paso Community College PD',
    'SUL ROSS STATE UNIV PD': 'Sul Ross State University PD',
    'TSTC: HARLINGEN PD': 'Texas State Technical College: Harlingen PD',
    'TX A&M UNIV INTERNATIONAL PD': 'Texas A&M International University PD',
    'UNIV OF TX RIO GRANDE VALLEY PD': 'University of Texas Rio Grande Valley PD',
    'UNIV OF TX: EL PASO PD': 'University of Texas: El Paso PD',
}

# Add a new column with renamed values
university_pds['University Police Departments'] = university_pds['university_police_departments'].replace(mapping)

#Code to generate markdown table
print(university_pds['University Police Departments'].to_markdown(index=False))


There are 5 university police departments included in the report.
This represents 5.88 % of the total number of agencies.
| University Police Departments               |
|:--------------------------------------------|
| El Paso Community College PD                |
| Sul Ross State University PD                |
| Texas State Technical College: Harlingen PD |
| Texas A&M International University PD       |
| University of Texas Rio Grande Valley PD    |
| University of Texas: El Paso PD             |


In [7]:
#Calculate how many months each agency should have reported data
def nibrs_eligible_months(row):
    # Directly use 'nibrs_start_date' as it's already a datetime object
    nibrs_start_date = row['nibrs_start_date']
    
    # Extract the year directly from 'report_year' (assuming it's already a datetime object)
    report_year = row['report_year'].year

    # Calculate the number of eligible months based on conditions
    if nibrs_start_date.year < report_year:
        return 12
    elif nibrs_start_date.year == report_year:
        return (12 - nibrs_start_date.month + 1)  # +1 to include the start month itself
    else:
        return 0

#Add nibrs_eligible_months column    
border_report['nibrs_eligible_months'] = border_report.apply(nibrs_eligible_months, axis=1)

#Add NIBRS preliminary reporting flag
border_report['has_preliminary_nibrs_reporting'] = border_report['nibrs_eligible_months'] < border_report['months_reported']


#Check nibrs_eligible_months logic
nibrs_eligible_months_check = border_report[
        ['agency_name', 
        'report_year', 
        'nibrs_start_date', 
        'nibrs_eligible_months', 
        'months_reported',
        'has_preliminary_nibrs_reporting']
    ].sort_values('months_reported', ascending=False).reset_index(drop=True)

nibrs_eligible_months_check.head()

Unnamed: 0,agency_name,report_year,nibrs_start_date,nibrs_eligible_months,months_reported,has_preliminary_nibrs_reporting
0,ZAPATA CO SO,2023-12-31,2020-01-01,12,12,False
1,LOS FRESNOS PD,2019-12-31,2017-10-01,12,12,False
2,EL PASO ISD PD,2022-12-31,2021-01-01,12,12,False
3,HARLINGEN PD,2019-12-31,2018-01-01,12,12,False
4,EL PASO CO SO,2022-12-31,2019-10-01,12,12,False


In [8]:
preliminary_nibrs_reporting = nibrs_eligible_months_check[nibrs_eligible_months_check['has_preliminary_nibrs_reporting'] == True].reset_index(drop=True)
preliminary_nibrs_reporting.head()

Unnamed: 0,agency_name,report_year,nibrs_start_date,nibrs_eligible_months,months_reported,has_preliminary_nibrs_reporting
0,BROWNSVILLE PD,2018-12-31,2018-03-01,10,12,True
1,DONNA PD,2017-12-31,2017-04-01,9,10,True
2,WEBB CO SO,2019-12-31,2019-09-01,4,6,True
3,EL PASO PD,2019-12-31,2019-08-01,5,6,True
4,EL PASO CO SO,2019-12-31,2019-10-01,3,6,True


In [9]:
preliminary_reporting_records = sum(preliminary_nibrs_reporting['has_preliminary_nibrs_reporting'])
print('Preliminary_reporting_records:', preliminary_reporting_records)

number_of_records = len(nibrs_eligible_months_check)
print('Total number of records:', number_of_records)

percent_preliminary_reporting = (preliminary_reporting_records / number_of_records) * 100
print('Percent of records with preliminary reporting:', round(percent_preliminary_reporting, 2))

agencies_with_preliminary_nibrs_reporting = len(preliminary_nibrs_reporting['agency_name'].unique())
print('Agencies with preliminary NIBRS reporting:', agencies_with_preliminary_nibrs_reporting)

count_of_agencies = nibrs_eligible_months_check['agency_name'].nunique()
print('Count of agencies:', count_of_agencies)

percent_agencies_with_preliminary_nibrs_reporting = (agencies_with_preliminary_nibrs_reporting / count_of_agencies) * 100
print('Percent of agencies with preliminary NIBRS reporting:', percent_agencies_with_preliminary_nibrs_reporting)

Preliminary_reporting_records: 24
Total number of records: 595
Percent of records with preliminary reporting: 4.03
Agencies with preliminary NIBRS reporting: 17
Count of agencies: 85
Percent of agencies with preliminary NIBRS reporting: 20.0


In [10]:
# Define the function to calculate the NIBRS contribution percentage
def calculate_nibrs_contribution_percentage(row):
    # Normalizing the cases where agencies are reporting data earlier
    if row['nibrs_eligible_months'] < row['months_reported']:
        return 100  # If the number of eligible months is less than the number of months reported, the contribution is 100%
     # Calculate the contribution percentage based on the ratio of months reported to eligible months
    elif row['nibrs_eligible_months'] > 0:
        return round((row['months_reported'] / row['nibrs_eligible_months']) * 100, 2) 
   # If there are no eligible months, the contribution is 0%. This filters out the preliminary reporting cases
    else:
        return 0  

# Apply the function to create the new column with rounded values
border_report['nibrs_contribution_percentage'] = border_report.apply(calculate_nibrs_contribution_percentage, axis=1)

# Check logic
border_report[['agency_name', 
               'report_year', 
               'nibrs_start_date', 
               'nibrs_eligible_months', 
               'nibrs_contribution_percentage',
               'months_reported',
               'has_preliminary_nibrs_reporting'
              ]].sort_values('nibrs_contribution_percentage', ascending=False).head()


Unnamed: 0,agency_name,report_year,nibrs_start_date,nibrs_eligible_months,nibrs_contribution_percentage,months_reported,has_preliminary_nibrs_reporting
594,ZAPATA CO SO,2023-12-31,2020-01-01,12,100.0,12,False
428,ANTHONY PD,2022-12-31,2020-12-01,12,100.0,12,False
425,ALAMO PD,2022-12-31,2018-08-01,12,100.0,12,False
221,PHARR PD,2019-12-31,2021-01-01,0,100.0,2,True
220,PENITAS PD,2019-12-31,2019-01-01,12,100.0,12,False


In [11]:
#Store resulting dataframe into 2_staging
border_report.to_csv('data/3_intermediate/int_texas_border_report.csv', index=False)