In [1]:
import pandas as pd

In [2]:
#Import the consolidated data set
# Forward slashes are used as the separator because they resolve on Windows,
# macOS and Linux alike; a backslash-separated path only resolves on Windows.
file_path = "data/2_staging/stg_texas_border_report.csv"

border_report = pd.read_csv(
    file_path,
    # Parse both date columns on load so later cells can read .year / .month from them
    parse_dates=['nibrs_start_date', 'report_year'],
    date_format='%Y-%m-%d')

border_report.head()

Unnamed: 0,agency_name,county,population,nibrs_start_date,murder_and_nonnegligent_manslaughter,negligent_manslaughter,rape,robbery,assault,burglary,larceny_theft,motor_vehicle_theft,arson,human_trafficking_commercial_sex_acts,human_trafficking_involuntary_servitude,months_reported,report_year
0,ALAMO PD,Hidalgo County,19351,2018-08-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
1,ALPINE PD,Brewster County,5983,2020-04-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
2,ALTON PD,Hidalgo County,17432,2019-09-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
3,ANTHONY PD,El Paso County,5681,2020-12-01,0,0,0,0,0,0,0,0,0,0,0,0,2017-12-31
4,BREWSTER CO SO,Brewster County,3196,2015-01-01,1,0,2,0,19,4,9,3,1,0,0,11,2017-12-31


In [3]:
#Create column to identify the different agency types within the report
border_report['agency_type'] = ''

# Define conditions for different agency types.
# The 'PD' and 'SO' abbreviations are matched as whole words (\b...\b): a plain
# substring search for 'SO' also hits names such as 'EL PASO PD', which would
# then be relabelled as a sheriff's office by the later assignment below.
# na=False keeps every mask strictly boolean, so rows with a missing agency
# name are left unlabelled instead of breaking the .loc assignments.
police_condition = border_report['agency_name'].str.contains(r'\bPD\b', case=False, na=False)
sheriff_condition = border_report['agency_name'].str.contains(r'\bSO\b', case=False, na=False)
constable_condition = border_report['agency_name'].str.contains('CONSTABLE', case=False, na=False)

# Assign the agency type based on conditions
# (later assignments win if a name satisfies more than one condition)
border_report.loc[police_condition, 'agency_type'] = 'Police Department'
border_report.loc[sheriff_condition, 'agency_type'] = "Sheriff's Office"
border_report.loc[constable_condition, 'agency_type'] = 'Constable'

#Create column to label the university police departments
border_report['university_pd'] = border_report['agency_name'].str.contains('UNIV|COLLEGE', case=False, na=False)

#Test logic: list the distinct agency names flagged as university police departments
unique_agency_names = list(border_report.loc[border_report['university_pd'], 'agency_name'].unique())
unique_agency_names


['EL PASO COMM COLLEGE PD',
 'SUL ROSS STATE UNIV PD',
 'TX A&M UNIV INTERNATIONAL PD',
 'UNIV OF TX RIO GRANDE VALLEY PD',
 'UNIV OF TX: EL PASO PD']

In [4]:
#Calculate how many months each agency should have reported data
def nibrs_eligible_months(row):
    """Return how many months of the report year fall on or after the NIBRS start.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) providing
            'nibrs_start_date' (must expose ``.year`` and ``.month``) and
            'report_year' (must expose ``.year``).

    Returns:
        12 when the start year precedes the report year, the number of months
        from the start month through December (inclusive) when both years are
        equal, and 0 otherwise.
    """
    start = row['nibrs_start_date']
    reporting_year = row['report_year'].year

    # Started in an earlier year: the whole report year is eligible
    if start.year < reporting_year:
        return 12

    # Started during the report year: count the start month through December
    if start.year == reporting_year:
        return 13 - start.month

    # Anything else (e.g. a start after the report year) has no eligible months
    return 0

#Add nibrs_eligible_months column (computed row by row)
border_report['nibrs_eligible_months'] = border_report.apply(nibrs_eligible_months, axis=1)

#Add NIBRS preliminary reporting flag: True when more months were reported than were eligible
border_report['has_preliminary_nibrs_reporting'] = (
    border_report['months_reported'] > border_report['nibrs_eligible_months']
)

#Check nibrs_eligible_months logic on the rows with the most reported months
(
    border_report.loc[:, [
        'agency_name',
        'report_year',
        'nibrs_start_date',
        'nibrs_eligible_months',
        'months_reported',
        'has_preliminary_nibrs_reporting',
    ]]
    .sort_values('months_reported', ascending=False)
    .head()
)

Unnamed: 0,agency_name,report_year,nibrs_start_date,nibrs_eligible_months,months_reported,has_preliminary_nibrs_reporting
594,ZAPATA CO SO,2023-12-31,2020-01-01,12,12,False
210,LOS FRESNOS PD,2019-12-31,2017-10-01,12,12,False
445,EL PASO ISD PD,2022-12-31,2021-01-01,12,12,False
193,HARLINGEN PD,2019-12-31,2018-01-01,12,12,False
443,EL PASO CO SO,2022-12-31,2019-10-01,12,12,False


In [5]:
# Define the function to calculate the NIBRS contribution percentage
def calculate_nibrs_contribution_percentage(row):
    """Return months reported as a percentage of NIBRS-eligible months.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) providing
            'nibrs_eligible_months' and 'months_reported'.

    Returns:
        100 when more months were reported than were eligible; otherwise the
        reported/eligible ratio as a percentage rounded to two decimals when
        there is at least one eligible month; otherwise 0.
    """
    eligible = row['nibrs_eligible_months']
    reported = row['months_reported']

    # Normalize agencies that report earlier than their eligibility window to 100
    if reported > eligible:
        return 100

    if eligible > 0:
        return round((reported / eligible) * 100, 2)

    return 0


# Apply the function row by row to create the new column with rounded values
border_report['nibrs_contribution_percentage'] = border_report.apply(
    calculate_nibrs_contribution_percentage,
    axis=1,
)

#Check logic on the rows with the highest contribution percentage
(
    border_report.loc[:, [
        'agency_name',
        'report_year',
        'nibrs_start_date',
        'nibrs_eligible_months',
        'nibrs_contribution_percentage',
        'months_reported',
        'has_preliminary_nibrs_reporting',
    ]]
    .sort_values('nibrs_contribution_percentage', ascending=False)
    .head()
)

Unnamed: 0,agency_name,report_year,nibrs_start_date,nibrs_eligible_months,nibrs_contribution_percentage,months_reported,has_preliminary_nibrs_reporting
594,ZAPATA CO SO,2023-12-31,2020-01-01,12,100.0,12,False
428,ANTHONY PD,2022-12-31,2020-12-01,12,100.0,12,False
425,ALAMO PD,2022-12-31,2018-08-01,12,100.0,12,False
221,PHARR PD,2019-12-31,2021-01-01,0,100.0,2,True
220,PENITAS PD,2019-12-31,2019-01-01,12,100.0,12,False


In [6]:
#Store resulting dataframe into 3_intermediate (row index is not written)
border_report.to_csv(
    'data/3_intermediate/int_texas_border_report.csv',
    index=False,
)