# <center> Data enriching

- The goal of this notebook is to work on enriching the data.
- The final output should be a new dataset (or a function that generates the new dataset).

# Step 1 : Getting the base dataset

In [None]:
!pip install geopandas

In [8]:
# Data manipulation
import pandas as pd

import os

# Data viz
import plotly.express as px
import plotly.io as pio
# Make 'notebook' the default plotly renderer for every figure shown below
pio.renderers.default='notebook'

# Geospatial data and interactive maps
import geopandas as gpd
import folium
# NOTE(review): pandas is already imported at the top of this cell; this repeat is redundant
import pandas as pd
import branca.colormap as cm

In [None]:
# Locate the raw_data folder relative to the notebook's working directory.
# `current_dir` is reused further down when the shapefile is loaded.
current_dir = os.getcwd()
raw_data_dir = os.path.join(current_dir, '..', 'raw_data')

# Read the raw crime records
file_path = os.path.join(raw_data_dir, 'data.csv')
df = pd.read_csv(file_path)

# Filtering the columns to use

In [None]:
# Columns retained for the enrichment steps, in the order they appear in the output.
# Deliberately left out (kept here for reference): division_number, area,
# reporting_district, part, crime_code, modus_operandi, premise_code,
# weapon_code, status, crime_code_1..crime_code_4, cross_street.
columns_keep = [
    # When
    'date_reported', 'date_occurred',
    # Where (police area) and what
    'area_name', 'crime_description',
    # Victim
    'victim_age', 'victim_sex', 'victim_descent',
    # Circumstances
    'premise_description', 'weapon_description', 'status_description',
    # Position
    'location', 'latitude', 'longitude',
]

# Adding the date information

In [None]:
# Keep only the selected columns. The explicit .copy() makes `df` an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
df = df[columns_keep].copy()

# Constant column of ones, one per crime record
df['counter'] = 1

# Dates: values that cannot be parsed become NaT (errors='coerce'), and the
# derived year / month / hour columns are missing for those rows.
df['date_occurred'] = pd.to_datetime(df['date_occurred'], errors='coerce')
df['year_occurred'] = df['date_occurred'].dt.year
df['month_occurred'] = df['date_occurred'].dt.month
df['hour_occurred'] = df['date_occurred'].dt.hour

df.head(3)

In [None]:
# Load the shapefile stored under raw_data/geo_data.
# Relies on `current_dir`, set when the crime data was loaded.
shapefile_name = 'cfbcc20d-2c5d-4c30-9dfa-627d46ec1a742020328-1-9ulknm.pzqsm.shp'
file_path = os.path.join(current_dir, '..', 'raw_data', 'geo_data', shapefile_name)

neighborhoods = gpd.read_file(file_path)


In [None]:
# Build one point geometry per crime record from its longitude / latitude
crime_points = gpd.points_from_xy(df['longitude'], df['latitude'])
crime_data_gdf = gpd.GeoDataFrame(df, geometry=crime_points)

# Label the points with the CRS of the neighborhood layer.
# NOTE(review): set_crs only declares the CRS, it does not reproject; this
# assumes the shapefile uses the same lon/lat CRS as the crime coordinates — TODO confirm.
crime_data_gdf.set_crs(neighborhoods.crs, inplace=True)
crime_data_gdf.head(1)


In [None]:
# Spatial join: attach to every crime point the attributes of the neighborhood
# polygon it lies within. how='left' keeps crimes that fall in no polygon.
# `predicate` replaces the deprecated `op` keyword of gpd.sjoin.
joined_gdf = gpd.sjoin(crime_data_gdf, neighborhoods, how='left', predicate='within')
joined_gdf.head(1)

# The "name" column now replaces "area_name" as the neighborhood name

# Let's add a score per crime type

In [None]:
# One row per distinct crime description found in the joined data
unique_descriptions = joined_gdf['crime_description'].unique()
crimes_df = pd.DataFrame({'crime_description': unique_descriptions})
crimes_df

In [None]:
# Assigning gravity scores based on the type of crime
# This is a subjective assignment and should ideally be based on more detailed criteria

def assign_gravity(crime_description):
    """Return a subjective gravity score for a crime description.

    The description is lower-cased and searched for keyword substrings.
    Keyword groups are tried in the order low (1), medium (2), high (3) and
    the first group with a match decides the score, so a description that
    matches several groups gets the score of the earliest one.

    Args:
        crime_description: Free-text crime label (a string).

    Returns:
        1 (low), 2 (medium) or 3 (high). Descriptions matching no keyword
        default to 1.
    """
    # (score, keywords) pairs, listed in the order they are checked
    keyword_tiers = (
        (1, ('petty theft', 'vandalism', 'minor fraud', 'trespass', 'stole')),
        (2, ('burglary', 'serious fraud', 'aggravated assault', 'robbery')),
        (3, ('homicide', 'rape', 'kidnapping', 'arson', 'dead', 'penetration', 'penis', 'child pornography')),
    )

    text = crime_description.lower()
    for score, keywords in keyword_tiers:
        for keyword in keywords:
            if keyword in text:
                return score

    # Nothing matched: fall back to low gravity
    return 1

# Score every distinct crime description with assign_gravity
gravity_scores = crimes_df['crime_description'].map(assign_gravity)
crimes_df['gravity_for_tourist'] = gravity_scores

crimes_df.head()


In [None]:
# Score every crime record from its OWN description.
# The scores must be computed from joined_gdf, not from crimes_df: crimes_df
# has one row per unique description with a 0..k-1 index, so assigning its
# Series here would align on index, give the first k crime rows unrelated
# scores and leave every other row NaN.
joined_gdf['gravity_for_tourist'] = joined_gdf['crime_description'].apply(assign_gravity)

joined_gdf.head()

In [6]:
import os
import sys

# Make the project's helper module importable: resolve the sibling "library"
# folder (relative to the current working directory) and put it first on
# sys.path so it takes precedence during import.
folder_path = os.path.abspath(os.path.join('..', 'library'))
sys.path.insert(0, folder_path)

# With the folder on sys.path, the helper module can be imported by name
import la_functions as la

In [12]:
# Build the enriched dataset with the helper from la_functions, passing the
# name of the raw CSV file.
# NOTE(review): presumably this runs the same enrichment steps developed in
# the cells above — verify against library/la_functions.py.
df_enriched = la.data_enriching('data.csv')


The `op` parameter is deprecated and will be removed in a future release. Please use the `predicate` parameter instead.



In [13]:
# Display the enriched dataset (the notebook shows a truncated view of it)
df_enriched

Unnamed: 0,date_reported,date_occurred,area_name,crime_description,victim_age,victim_sex,victim_descent,premise_description,weapon_description,status_description,...,longitude,counter,year_occurred,month_occurred,hour_occurred,geometry,index_right,OBJECTID,name,gravity_for_tourist
0,2020-01-08,2020-01-08 22:30:00,Southwest,BATTERY - SIMPLE ASSAULT,36,F,B,SINGLE FAMILY DWELLING,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Adult Other,...,-118.2978,1,2020,1,22,POINT (-118.29780 34.01410),31.0,32.0,Exposition Park,1
1,2020-01-02,2020-01-01 03:30:00,Central,BATTERY - SIMPLE ASSAULT,25,M,H,SIDEWALK,UNKNOWN WEAPON/OTHER WEAPON,Invest Cont,...,-118.2545,1,2020,1,3,POINT (-118.25450 34.04590),23.0,24.0,Downtown,1
2,2020-04-14,2020-02-13 12:00:00,Central,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,0,X,X,POLICE FACILITY,,Adult Arrest,...,-118.2474,1,2020,2,12,POINT (-118.24740 34.04480),23.0,24.0,Downtown,1
3,2020-01-01,2020-01-01 17:30:00,N Hollywood,VANDALISM - MISDEAMEANOR ($399 OR UNDER),76,F,W,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,Invest Cont,...,-118.4019,1,2020,1,17,POINT (-118.40190 34.16850),95.0,96.0,Valley Village,1
4,2020-01-01,2020-01-01 04:15:00,Mission,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",31,X,X,BEAUTY SUPPLY STORE,,Invest Cont,...,-118.4468,1,2020,1,4,POINT (-118.44680 34.21980),71.0,72.0,Panorama City,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852945,2023-03-22,2023-03-22 10:00:00,Foothill,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",25,F,H,SIDEWALK,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Invest Cont,...,-118.4116,1,2023,3,10,POINT (-118.41160 34.27900),69.0,70.0,Pacoima,2
852946,2023-04-12,2023-04-12 16:30:00,77th Street,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",29,M,B,LAUNDROMAT,UNKNOWN WEAPON/OTHER WEAPON,Invest Cont,...,-118.2915,1,2023,4,16,POINT (-118.29150 33.98410),99.0,100.0,Vermont-Slauson,2
852947,2023-07-02,2023-07-01 00:01:00,Central,PICKPOCKET,24,F,H,NIGHT CLUB (OPEN EVENINGS ONLY),,Invest Cont,...,-118.2485,1,2023,7,0,POINT (-118.24850 34.04670),23.0,24.0,Downtown,1
852948,2023-03-05,2023-03-05 09:00:00,Van Nuys,VANDALISM - MISDEAMEANOR ($399 OR UNDER),53,F,H,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,Invest Cont,...,-118.4487,1,2023,3,9,POINT (-118.44870 34.19510),96.0,97.0,Van Nuys,1


Prompt given to ChatGPT:

User
Here is a series of crimes. I want you to had a column "gravity" and a column "probability to happen to a toursit" and grade the proba from 0 to 1 and the gravity from 1 to 3 with 1 being low and high biing 3
