# Bike Thefts Berlin - EDA

In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [62]:
PATH = '../data'

In [63]:
time_parseable = ['start_date_delict', 'end_date_delict']

## Prep data inspection

In [64]:
class BikeThefts():
    '''Read, inspect and transform initial data.'''

    def __init__(self, path = PATH) -> None:
        self.path = path

    def read_initial_data(self, file) -> pd.DataFrame:
        '''Return DataFrame with feature matrix and labels as values.'''
        df = pd.read_csv(f'{self.path}/{file}', index_col=0, parse_dates=True, encoding = 'unicode_escape')
        return df

    def read_extracted_data(self, file) -> pd.DataFrame:
        '''Return DataFrame extracted from initial data.'''
        df = pd.read_csv(f'{self.path}/{file}', encoding = 'unicode_escape')
        return df
    
    def check_unique(self, serie) -> pd.Series:
        '''Return unique values of selected columns.'''
        return serie.unique()

    def include_timestamps(self, df) -> pd.DataFrame:
        '''Return DataFrame with time-stamps.'''
        df['year'] = df.index.year
        df['month'] = df.index.month
        return df

    def time_parser(self, df, time_parsables) -> pd.DataFrame:
        '''Parse columns encoded as strings to datetime-objects.'''
        for col in time_parsables:
            df[col] = pd.to_datetime(df[col])
        return df

    def crime_duration_days(self, df, start, end) -> pd.DataFrame: #use for plotting?
        '''Calculate duration of crime in days.'''
        df['crime_duration_days'] = end - start
        return df
    
    def crime_duration_hours(self, df, start, end) -> pd.DataFrame: #use for plotting?
        '''Calculate duration of crime in hours.'''
        df['crime_duration_hours'] = abs(start - end)
        return df
    
    def fill_ints(self, df) -> pd.DataFrame:
        '''Reencode LOR into 8-digit values.'''
        df['LOR'] = df['LOR'].apply(lambda x: str(x))
        df['LOR'] = df['LOR'].apply(lambda x: f'0{(x)}'[-8:])
        return df
    
    def fill_ints_grouped(self, df) -> pd.DataFrame:
        '''Group by LOR. Return df including number of bike thefts per LOR-group.'''
        df_grouped = pd.DataFrame(df.groupby('LOR').size(),
                          columns=['bike_thefts']).reset_index()
        return df_grouped
        
    def rename_cols(self, df) -> pd.DataFrame:
        return df.rename(columns={ 
            'ANGELEGT_AM' : 'track_date',
            'TATZEIT_ANFANG_DATUM' : 'start_date_delict',
            'TATZEIT_ANFANG_STUNDE' : 'start_time_delict',
            'TATZEIT_ENDE_DATUM' : 'end_date_delict',
            'TATZEIT_ENDE_STUNDE' : 'end_time_delict',
            'SCHADENSHOEHE' : 'damage_amount',
            'VERSUCH' : 'intent_delict',
            'ART_DES_FAHRRADS' : 'bike_type',
            'DELIKT' : 'delict',
            'ERFASSUNGSGRUND' : 'description'
            })
    
    def reencode_LOR(self, df, dictionary) -> pd.DataFrame:
        '''Reencode LOR into name of district.'''
        for idx, row in df['LOR'].items():
            for key in dictionary.keys():
                #print(f'comparing {row}, with {key}')
                if key in row[0:2]:
                    #print(f'found key {key} in row {row[0:2]}')
                    district = dictionary[key]
                    #print(f'value to be inserted into row {row} is {district}')
                    row = district
                    #print(f"row has now value {row}")
                else:
                    continue
            return df
    
    def save_intermediate_data(self, df, file: str):
        '''Save extracted data locally as csv-file.'''
        return df.to_csv(f'{self.path}/{file}.csv')

    def save_LOR_bike_thefts(self, df, group_by: str, col_names: str, file: str):
        '''Save extracted LOR-bike thefts-data locally as csv-file.'''
        bike_thefts_LOR = pd.DataFrame(df.groupby(group_by).size(),
                        columns = [col_names]).reset_index()
        return bike_thefts_LOR.to_csv(f'{self.path}/{file}.csv')

In [65]:
# class PlotBikeThefts():
#     '''Plot data with seaborn.'''

#     def plot_categoricals(self, df, ordinate) -> sns:
#             return sns.catplot(
#             data=df, y=ordinate, kind="count",
#             palette="pastel", edgecolor=".6",
#         )

#     def plot_correlations(self, df) -> sns:
#         corr = df.corr()
#         mask = np.triu(np.ones_like(corr, dtype=bool))
#         f, ax = plt.subplots(figsize=(8, 6))
#         plt.xticks(rotation=45)
#         cmap = sns.diverging_palette(230, 20, as_cmap=True)
#         return sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
#                     square=True, linewidths=.5, cbar_kws={"shrink": .5})
    

## Create transformed dataset

In [66]:
bike_thefts = BikeThefts()

In [67]:
df = bike_thefts.read_initial_data('Fahrraddiebstahl.csv')
df.head()

Unnamed: 0_level_0,TATZEIT_ANFANG_DATUM,TATZEIT_ANFANG_STUNDE,TATZEIT_ENDE_DATUM,TATZEIT_ENDE_STUNDE,LOR,SCHADENSHOEHE,VERSUCH,ART_DES_FAHRRADS,DELIKT,ERFASSUNGSGRUND
ANGELEGT_AM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-02-19,18.02.2023,18,19.02.2023,7,3701658,3472,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2023-02-19,18.02.2023,20,19.02.2023,9,1100308,2277,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2023-02-19,18.02.2023,16,19.02.2023,0,6100102,746,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2023-02-19,18.02.2023,17,19.02.2023,0,7601544,1849,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2023-02-19,17.02.2023,17,17.02.2023,17,3601346,1863,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern


In [68]:
df = bike_thefts.rename_cols(df)
df.columns

Index(['start_date_delict', 'start_time_delict', 'end_date_delict',
       'end_time_delict', 'LOR', 'damage_amount', 'intent_delict', 'bike_type',
       'delict', 'description'],
      dtype='object')

## Parse time

In [69]:
df_transformed = bike_thefts.time_parser(df, time_parseable)
df_transformed.head(3)

Unnamed: 0_level_0,start_date_delict,start_time_delict,end_date_delict,end_time_delict,LOR,damage_amount,intent_delict,bike_type,delict,description
ANGELEGT_AM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-02-19,2023-02-18,18,2023-02-19,7,3701658,3472,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2023-02-19,2023-02-18,20,2023-02-19,9,1100308,2277,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2023-02-19,2023-02-18,16,2023-02-19,0,6100102,746,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern


In [70]:
df_transformed = bike_thefts.crime_duration_days(df_transformed, 
                                            start=df_transformed['start_date_delict'], 
                                            end = df_transformed['end_date_delict'])

In [71]:
df_transformed = bike_thefts.crime_duration_hours(df_transformed, 
                                            start=df_transformed['start_time_delict'], 
                                            end = df_transformed['end_time_delict'])
df_transformed.columns

Index(['start_date_delict', 'start_time_delict', 'end_date_delict',
       'end_time_delict', 'LOR', 'damage_amount', 'intent_delict', 'bike_type',
       'delict', 'description', 'crime_duration_days', 'crime_duration_hours'],
      dtype='object')

In [72]:
df_transformed = bike_thefts.include_timestamps(df_transformed)
df_transformed.columns

Index(['start_date_delict', 'start_time_delict', 'end_date_delict',
       'end_time_delict', 'LOR', 'damage_amount', 'intent_delict', 'bike_type',
       'delict', 'description', 'crime_duration_days', 'crime_duration_hours',
       'year', 'month'],
      dtype='object')

In [73]:
df_transformed = bike_thefts.fill_ints(df_transformed)
df_transformed.columns

Index(['start_date_delict', 'start_time_delict', 'end_date_delict',
       'end_time_delict', 'LOR', 'damage_amount', 'intent_delict', 'bike_type',
       'delict', 'description', 'crime_duration_days', 'crime_duration_hours',
       'year', 'month'],
      dtype='object')

In [74]:
df_transformed.head(2)

Unnamed: 0_level_0,start_date_delict,start_time_delict,end_date_delict,end_time_delict,LOR,damage_amount,intent_delict,bike_type,delict,description,crime_duration_days,crime_duration_hours,year,month
ANGELEGT_AM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-02-19,2023-02-18,18,2023-02-19,7,3701658,3472,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern,1 days,11,2023,2
2023-02-19,2023-02-18,20,2023-02-19,9,1100308,2277,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern,1 days,11,2023,2


In [75]:
district_dict = {
    '01': 'Mitte',
    '02': 'Friedrichshain-Kreuzberg',
    '03': 'Pankow',
    '04': 'Charlottenburg-Wilmersdorf',
    '05': 'Spandau',
    '06': 'Steglitz-Zehlendorf',
    '07': 'Tempelhof-Schöneberg',
    '08': 'Neukölln',
    '09': 'Treptow-Köpenick',
    '10': 'Marzahn-Hellersdorf',
    '11': 'Lichtenberg',
    '12': 'Reinickendorf',
    }

In [76]:
# df_transformed['districts'] = df_transformed['LOR']
# assert len(df_transformed['districts'].unique()) == len(df_transformed['LOR'].unique())

In [81]:
def insert_district(df, dictionary):
    for idx, row in df['LOR'].items():
        for key in dictionary.keys():
            print(f'comparing {row}, with {key}')
            if key in row[0:2]:
                print(f'found key {key} in value {row}')
                district = dictionary[key]
                print(f'new value to be inserted into value {row} is {district}')
                row = district
                print(f"new value is {district}")
                df_transformed['districts'] = district
            else:
                break
    return df

In [82]:
test = insert_district(df_transformed, district_dict)

comparing 03701658, with 01
comparing 01100308, with 01
found key 01 in value 01100308
new value to be inserted into value 01100308 is Mitte
new value is Mitte
comparing Mitte, with 02
comparing 06100102, with 01
comparing 07601544, with 01
comparing 03601346, with 01
comparing 03400723, with 01
comparing 01200522, with 01
found key 01 in value 01200522
new value to be inserted into value 01200522 is Mitte
new value is Mitte
comparing Mitte, with 02
comparing 05300737, with 01
comparing 03701554, with 01
comparing 03400620, with 01
comparing 01300834, with 01
found key 01 in value 01300834
new value to be inserted into value 01300834 is Mitte
new value is Mitte
comparing Mitte, with 02
comparing 02500835, with 01
comparing 03601245, with 01
comparing 02200208, with 01
comparing 02100104, with 01
comparing 03601141, with 01
comparing 08100312, with 01
comparing 09501940, with 01
comparing 02200211, with 01
comparing 12400720, with 01
comparing 06400839, with 01
comparing 09100304, with 

In [83]:
test.districts.unique()

array(['Mitte'], dtype=object)

In [None]:
df_transformed = bike_thefts.reencode_LOR(df_transformed, district_dict)

In [None]:
df_transformed.head()

In [None]:
df_transformed

In [None]:
test_frame = bike_thefts.reencode_LOR(df_transformed, district_dict)

In [None]:
test_frame['LOR'].unique()

In [None]:
bike_thefts.save_intermediate_data(df_transformed, 'bike_thefts_transformed') # assign additional col with district names

## Create sub DataFrame with `LOR` and `bike_theft_count`

In [None]:
df_count_bike_thefts = bike_thefts.fill_ints_grouped(df_transformed)
df_count_bike_thefts.head(2)

In [None]:
bike_thefts.save_LOR_bike_thefts(df_count_bike_thefts, 'LOR', 'bike_thefts_count', 'bike_thefts_LOR' )

## Extract dataframes for individual years

In [None]:
extract_2022 = df_transformed.loc['2022-01-01':'2022-12-31']

In [None]:
checkable = ['start_date_delict', 'end_date_delict', 'start_time_delict', 'end_time_delict']

for col in checkable:
    assert 2023 not in extract_2022[col]  # write a test function out of this

In [None]:
bike_thefts.save_intermediate_data(extract_2022, 'bike_thefts_df_2022')

In [None]:
extract_22_resampled = extract_2022.resample('M')
extract_22_resampled = extract_22_resampled['LOR', 'damage_amount'].mean().round(2)
extract_22_resampled.head(2)

In [None]:
bike_thefts.save_intermediate_data(extract_22_resampled, 'extract_22_resampled')

In [None]:
extract_2023 = df_transformed.loc['2023-01-01':'2023-02-19']
extract_2023.head(2)

In [None]:
bike_thefts.save_intermediate_data(extract_2023, 'bike_thefts_df_2023')

In [None]:
checkable = ['start_date_delict', 'end_date_delict', 'start_time_delict', 'end_time_delict']

for col in checkable:
    assert 2022 not in extract_2023[col]  # write a test function out of this

In [None]:
extract_23_resampled = extract_2023.resample('M')
extract_23_resampled = extract_23_resampled['LOR', 'damage_amount'].mean().round(2)
extract_23_resampled.head(2)

In [None]:
bike_thefts.save_intermediate_data(extract_23_resampled, 'extract_23_resampled')

## Extract series for individual years and plot numeric data: "Schadenshöhe"

In [None]:
extract_22_resampled.plot.line(title = 'Mean Values of Monthly Bike Thefts (EUR) - 2022'); #how can this be rendered in app?

In [None]:
extract_22_resampled.plot.hist(bins = 10, title = 'Values of Monthly Bike Thefts - 2022'); #how can this be rendered in app?

## Extract series for individual years and plot numeric data: thefts

In [None]:
extract_2022.reset_index(inplace=True)

In [None]:
bike_thefts_series_22 = pd.DataFrame(extract_2022.groupby(['LOR', 'ANGELEGT_AM']).size(),
                       columns = ['bike_theft_count']).reset_index()
bike_thefts_series_22 = bike_thefts_series_22.set_index(bike_thefts_series_22['ANGELEGT_AM'])
bike_thefts_series_22 = bike_thefts_series_22['bike_theft_count']
bike_thefts_series_22.hist(); #render in app

In [None]:
bike_thefts_series_22.describe() #render in app

In [None]:
(bike_thefts_series_22
.resample('D')  
.mean()
.plot.line(title = '2022 Mean Number of Daily Bike Thefts') #render in app; eventually by changing frequency
);

In [None]:
(bike_thefts_series_22
 .resample('M')
 .sum()
 .div(bike_thefts_series_22.sum())
 .mul(100)
 .rename(lambda idx: idx.month_name())
 .plot.barh(title = '2022 Monthly Percentage of Bike Thefts (over yearly total)') #render in app
 );

In [None]:
bike_thefts.save_intermediate_data(bike_thefts_series_22, 'bike_thefts_series_2022')

In [None]:
extract_2023.reset_index(inplace=True)
extract_2023.head(2)

In [None]:
bike_thefts_series_23 = pd.DataFrame(extract_2023.groupby(['LOR', 'ANGELEGT_AM']).size(),
                       columns = ['bike_theft_count']).reset_index()
bike_thefts_series_23 = bike_thefts_series_23.set_index(bike_thefts_series_23['ANGELEGT_AM'])
bike_thefts_series_23 = bike_thefts_series_23['bike_theft_count']
bike_thefts_series_23.hist();

In [None]:
bike_thefts_series_23.describe() #render in app

In [None]:
(bike_thefts_series_23
.resample('D')
.mean()
.plot.line(title = '2022 Mean Number of Daily Bike Thefts') #ignore
);

In [None]:
(bike_thefts_series_23
.resample('W')
.mean()
.plot.barh() #ignore
);

In [None]:
(bike_thefts_series_23
 .resample('M')
 .sum()
 .div(bike_thefts_series_23.sum())
 .mul(100)
 .rename(lambda idx: idx.month_name())
 .plot.barh(title = '2023 Monthly Percent Bike Thefts')
 );

In [None]:
bike_thefts.save_intermediate_data(bike_thefts_series_23, 'bike_thefts_series_2023')

In [None]:
df_transformed.drop(df_transformed.iloc[:, 0:4], axis=1, inplace=True)
df_transformed.head(2) #check

In [None]:
df_transformed.reset_index(inplace=True)
df_transformed.head(2)

In [None]:
bike_thefts_LOR_year = pd.DataFrame(df_transformed.groupby(['LOR', 'year']).size(),
                       columns = ['bike_theft_count'])
bike_thefts_LOR_year = bike_thefts_LOR_year.reset_index()
bike_thefts_LOR_year = bike_thefts_LOR_year.set_index('year').reset_index()
bike_thefts_LOR_year.head(2)

In [None]:
bike_thefts.save_LOR_bike_thefts(
    df = bike_thefts_LOR_year, 
    group_by =['LOR', 'year'],
    col_names='bike_theft_count',
    file = 'bike_thefts_year_LOR_count'
    ) 