In [2]:
# Load datasets if working on Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials

    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

    file_id = '...'
    downloaded = drive.CreateFile({'id':file_id})
    downloaded.FetchMetadata(fetch_all=True)
    downloaded.GetContentFile(downloaded.metadata['title'])

    f = open("V2data_6mounts2022.csv.zip", "wb")
    f.write(downloaded.content.getbuffer())
    f.close()

    !unzip V2data_6mounts2022.csv.zip

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
import json 
import time
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression

from datetime import datetime, date
import calendar

import plotly.express as px
import plotly.graph_objects as go

import osmnx as ox
import folium

sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

# Dataset Import

### Import both the old and the new datasets recieved

In [4]:
# Assing url deppending if working on Google Colab or outside of it
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    df_main_url = '/content/data.csv'
    df_old_url = '/content/output/all_in_one.csv'
else: 
    df_main_url = '../../data/datav2.csv'
    df_old_url = '../../Information/output/all_in_one.csv'

In [5]:
merge_old_df = False

df_main = pd.read_csv(df_main_url, dtype = {'place.id': object})

if merge_old_df:
    df_old = pd.read_csv(df_old_url)
else:
    print("To merge the old dataframe, change merge_old_df to True. [NOT WORKING FOR THE MOMENT")

To merge the old dataframe, change merge_old_df to True. [NOT WORKING FOR THE MOMENT


### Drop unneeded columns

In [6]:
not_needed_columns = ['value.Vehicle_Mode', 'speed', '_id',]
df_main.drop(not_needed_columns, axis=1, inplace=True, errors='ignore')

if 'df_old' in locals():  # Only run if the old DF was created
    df_old.drop(not_needed_columns, axis=1, inplace=True, errors='ignore')

### Rename columns to naming convention used

In [7]:
df_main.rename(columns = {'suitcase.id':'suitcase_id', 'date.utc':'date_utc', 
                    'edge.id':'edge_id', 'edge.osmid':'edge_osmid', 
                    'place.id':'place_id', 'osm.highway':'osm_highway'}, inplace=True, errors='ignore')

if 'df_old' in locals():  # Only run if the old DF was created
    df_old.rename(columns = {'osm_id':'edge_osmid', 'osm_type':'osm_highway'}, inplace=True, errors='ignore')

### Merge the two DataFrames and rearrange columns

In [8]:
if 'df_old' in locals():  # Only run if the old DF was created
    df_merged = pd.concat([df_old, df_main], axis=0)
    # Remove columns at the tail and add them to a variable, then insert where it makes more sense
    df_merged.insert(7, 'place_id', df_merged.pop('place_id'))
    df_merged.insert(15, '10', df_merged.pop('10'))
    df_merged.insert(16, '13', df_merged.pop('13'))
    df_merged.insert(19, '19', df_merged.pop('19'))
    df_merged.insert(20, '21', df_merged.pop('21'))

### Make a copy of the dataframe to work with

In [9]:
df = df_merged.copy() if 'df_old' in locals() else df_main.copy()
# df_test = df[~df['place.id'].isna()]

# Data Cleaning

### Basic data cleaning

In [10]:
# List of the litter categories columns
litter_columns = []
for item in df.columns.to_list():
    if item.isdigit(): litter_columns.append(item) 

def clean_df(df_arg, mapping):
    df = df_arg

    # Drop empty rows
    df = df.dropna(subset=[mapping]).copy()

    # Drop place_id for now, it's only present in 4% of the data
    #   and it has the same use as edge_id, but with different naming convention
    df.drop('place_id', axis=1, inplace=True)

    # Convert to datetime and remove the time
    df['date_utc'] = pd.to_datetime(df['date_utc']).dt.date

    # Fill NaNs with 0 and convert litter columns to integer
    df[litter_columns] = df[litter_columns].fillna(0)
    df[litter_columns] = df[litter_columns].astype(np.int64) 

    # Clean and sort edge_id
    # df['edge_id'] = df['edge_id'].apply(lambda x: tuple(x[1:-1].split(', ')[0:2]))
    # df['edge_id'] = df['edge_id'].apply(lambda x: sorted(x))  # --> sort it so order doesn't matter
    # df['edge_id'] = df['edge_id'].apply(lambda x: (', ').join(x))  # --> make it a string
    # We might not sort and modify the syntax of the edge_id anymore, since it is needed to compare with the OSM geojson file

    # Add a sum of all the litter
    df['total_litter'] = df[litter_columns].sum(axis=1)

    return df

In [11]:
df = clean_df(df, 'edge_id')

# Data Aggregation

## Aggregate on date and edge

In [12]:
# Aggregate based on unique edge_id, date_utc and suitcase_id
aggregation_type = 'sum'

to_agg = {'edge_osmid' : 'first', 'osm_highway' : 'first', 'total_litter' : aggregation_type}

for litter in litter_columns:
    to_agg[litter] = aggregation_type

df = df.groupby(['date_utc', 'edge_id'], as_index=False).agg(to_agg)

### Small function to prove if there needs to be aggregation or everything is aggregated

In [13]:
# TODO: Change from iterrows to itertuples

check_agg = False

this_date = ''
day_edges = []
log = []

if check_agg:
    for index, row in df.iterrows():
        if row['date_utc'] != this_date:
            this_date = row['date_utc']  # Set the day we will look into for the next iterations
            day_edges = []  # Reset day_edges since it's a new day
        
        if row['edge_id'] not in day_edges:
            day_edges.append(row['edge_id'])
        else:  # This means this edge is repeating in a single day, we want to log this
            log.append([index, this_date, row['edge_id']])
    print(f"There are {len(log)} rows that need to be aggregated")
else:
    print("To check if the Dataframe needs to be aggregated, change the variable 'chack_agg' to True")
    print("Warning: this function takes 2 to 4 minutes to run")

To check if the Dataframe needs to be aggregated, change the variable 'chack_agg' to True


# Feauture Creation

## Splitting the date

In [14]:
# Splitting date to year, month and day to make it a feauture for modeling
df['Year'] = pd.DatetimeIndex(df['date_utc']).year
df['month'] = pd.DatetimeIndex(df['date_utc']).month
df['day'] = pd.DatetimeIndex(df['date_utc']).day

# Converting it to object so it's treated as a categorical data when making the pipelines
df['Year'] = df['Year'].astype(object)
df['month'] = df['month'].astype(object)
df['day'] = df['day'].astype(object)

## Adding weekday

In [15]:
weekdays = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
df['weekday'] = pd.to_datetime(df['date_utc']).dt.date.apply(lambda x: x.weekday())
df['weekday'] = df['weekday'].apply(lambda x: weekdays[x])

## Adding weather features

#### Read the data

In [16]:
# Assing url deppending if working on Google Colab or outside of it
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    df_weather = pd.read_csv("https://raw.githubusercontent.com/dominik117/data-science-toolkit/main/data/weather_basel_2021-2022.csv")
else: 
    df_weather = pd.read_csv("../src/data/weather_basel_2021-2022.csv")
# Data obtained from https://www.meteoblue.com/en/weather/archive/export/basel_switzerland_2661604

#### Clean the weather data

In [17]:
### Rename columns
weather_columns = {'location':'date_utc', 'Basel':'temperature_max', 'Basel.1':'temperature_min', 'Basel.2':'temperature_mean',
                'Basel.3':'precipitation', 'Basel.4':'snowfall', 'Basel.5':'humidity_max', 'Basel.6':'humidity_min', 
                'Basel.7':'humidity_mean', 'Basel.8':'cloud_coverage', 'Basel.9':'wind_speed_max',
                'Basel.10':'wind_speed_min', 'Basel.11':'wind_speed_mean'}
df_weather.rename(columns = weather_columns, inplace = True)

### Drop unneeded columns
df_weather.drop(['Basel.12'], axis=1, inplace=True)

### Drop first rows with metadata
df_weather = df_weather.iloc[9:].copy()

### Normalize date column
df_weather['date_utc'] = pd.to_datetime(df_weather['date_utc']).dt.date

### Convert all columns (except date) to float, and round them
weather_date = df_weather.pop('date_utc') # Pop date so it doesn't get converted
df_weather = df_weather.apply(pd.to_numeric)
df_weather = df_weather.round(decimals = 1)
df_weather.insert(0, 'date_utc', weather_date) # Reassign date to df

#### Merge weather data with main data

In [18]:
### Merge weather dataframe with current dataframe
df = pd.merge(df, df_weather, how="left", on="date_utc")

## Add Latitude and Longitude

#### Make the dataframe with edge_id and it's latitudes based on geojson file

In [19]:
add_coordinates = True

if add_coordinates:
    if IN_COLAB:
        with urllib.request.urlopen('https://raw.githubusercontent.com/dominik117/data-science-toolkit/main/data/edges.geojson') as url:
            data = json.loads(url.read().decode())
    else:
        with open('../references/edges.geojson') as f:
            data = json.load(f)

    # Get the features key, the others have nothing of importance
    df_edges = data['features']
    df_edges = pd.DataFrame(df_edges)

    # Explode properties dictionary
    df_edges = pd.concat([df_edges.drop(['properties'], axis=1), df_edges['properties'].apply(pd.Series)], axis=1)

    # Rename edge column to match main dataframe
    df_edges.rename(columns = {'id':'edge_id'}, inplace = True)

    # Rearrange BBOX (lat_north, lat_south, lon_east, lon_west)
    def sort_bbox(x):
        lat = sorted([x[1], x[3]], key=float, reverse=True)
        lon = sorted([x[0], x[2]], key=float, reverse=True)
        return lat + lon
    df_edges['bbox'] = df_edges['bbox'].apply(sort_bbox)

    # Make dataframe only with edge_id and bbox(exploded)
    df_coordinates = df_edges[['edge_id', 'bbox']].copy()

    # Make columns with lat_north, lat_south, lon_east, lon_west from bbox
    bbox_exploded = pd.DataFrame(df_coordinates["bbox"].to_list(), columns=['lat_north', 'lat_south', 'lon_east', 'lon_west'])
    df_coordinates = pd.concat([df_coordinates, bbox_exploded], axis=1)
    df_coordinates.drop(['bbox'], axis=1, inplace=True)

    # Merge coordinates to main dataframe
    df = pd.merge(df, df_coordinates, how="left", on="edge_id")
else:
    print("To add coordinates change the add_coordinates variable to true")

## Add length of edge (street segment)

### Formula to get distance between two points on Earth

In [20]:
from math import atan, cos, radians, sin, tan, asin, sqrt

def haversine_distance(lat1, lon1, lat2, lon2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    EQUATORIAL_RADIUS = 6378 # Radius of earth in kilometers
    return c * EQUATORIAL_RADIUS

def lamberts_ellipsoidal_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    # Radius of Earth (meters)
    AXIS_A = 6378137.0
    AXIS_B = 6356752.314245
    EQUATORIAL_RADIUS = 6378137

    # Equation Parameters
    flattening = (AXIS_A - AXIS_B) / AXIS_A
    # Parametric latitudes
    b_lat1 = atan((1 - flattening) * tan(radians(lat1)))
    b_lat2 = atan((1 - flattening) * tan(radians(lat2)))

    # Compute central angle between two points
    # using haversine theta. sigma =  haversine_distance / equatorial radius
    sigma = haversine_distance(lat1, lon1, lat2, lon2) / EQUATORIAL_RADIUS

    # Intermediate P and Q values
    P_value = (b_lat1 + b_lat2) / 2
    Q_value = (b_lat2 - b_lat1) / 2

    # Intermediate X value
    # X = (sigma - sin(sigma)) * sin^2Pcos^2Q / cos^2(sigma/2)
    X_numerator = (sin(P_value) ** 2) * (cos(Q_value) ** 2)
    X_demonimator = cos(sigma / 2) ** 2
    X_value = (sigma - sin(sigma)) * (X_numerator / X_demonimator)

    # Intermediate Y value
    # Y = (sigma + sin(sigma)) * cos^2Psin^2Q / sin^2(sigma/2)
    Y_numerator = (cos(P_value) ** 2) * (sin(Q_value) ** 2)
    Y_denominator = sin(sigma / 2) ** 2
    Y_value = (sigma + sin(sigma)) * (Y_numerator / Y_denominator)

    distance = abs(EQUATORIAL_RADIUS * (sigma - ((flattening / 2) * (X_value + Y_value))))

    return int(distance)

In [21]:
df['edge_length'] = df.apply(lambda x: lamberts_ellipsoidal_distance(x.lat_north, x.lon_east, x.lat_south, x.lon_west), axis=1)

## Add features from Open Street Maps (OSM)

#### Make the datasets with points of interest

In [22]:
add_points_of_interest = True

# Since we need longitude and latitude to add OSM features, turn it on when adding points of interest if off
if add_coordinates is False and add_points_of_interest:
    add_points_of_interest = True
    add_coordinates = True
    print("WARNING: add_coordinates was turned on since it's needed to aggregate points of interest")
    print(f"add_coordinates = {add_coordinates},  add_points_of_interest = {add_points_of_interest}")

In [23]:
# Extraction of the features on the dictionary from OSM

basel = 'Basel, Basel, Switzerland'
# Features dictionary at https://wiki.openstreetmap.org/wiki/Map_features

tags = {'amenity': ['vending_machine', 'bench', 'bar', 'fast_food', 'ice_cream', 'kindergarten', 'school', 'hospital', 'cinema', 
                    'fountain', 'dog_toilet', 'recycling', 'waste_basket', 'waste_disposal', 'childcare', 'marketplace',
                    'bus_station', 'fuel', 'taxi', 'parking', 'atm', 'clinic', 'nightclub', 'toilets'
                    ]}
amenity = ox.geometries_from_place(basel, tags=tags)

# tags = {'building': ['hotel', 'kiosk', 'commercial', 'retail', 'supermarket']}
# building = ox.geometries_from_place(basel, tags=tags)

# tags = {'leisure': ['dog_park', 'park']}
# leisure = ox.geometries_from_place(basel, tags=tags)

  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


#### Make dataframe with points of interest and their coordinates

In [24]:
# All this code will return a dataframe with the type of amenity and it's coordinates

df_osm = pd.DataFrame(amenity)  # <-- Convert to DF
df_osm = df_osm[['amenity', 'geometry']].copy()  # <-- Select the only needed columns
df_osm['osm_id'] = df_osm.index.to_numpy()  # <-- Detach the index and assign it to a normal column
df_osm.reset_index(drop=True, inplace=True)  # <-- Drop index

# Explode index since it contains two indices
osm_id_exploded = pd.DataFrame(df_osm["osm_id"].to_list(), columns=['type', 'osm'])  
df_osm = pd.concat([df_osm, osm_id_exploded], axis=1) 

# Fix osm_id column, then drop it for the moment, we don't need it
df_osm.drop(['osm_id'], axis=1, inplace=True)
df_osm.columns = df_osm.columns.str.replace('osm', 'osm_id')
df_osm = df_osm.drop(df_osm[df_osm['type'] == 'relation'].index)  # <-- Drop Multipoligon points
df_osm = df_osm.drop(df_osm[df_osm['type'] == 'way'].index)  # <-- Drop Multipoligon points
df_osm.drop(['osm_id'], axis=1, inplace=True)  # <-- Drop it for the moment

#### Clean the coordinates from the GeoPandas geometry format to latitude and longitude columns
df_osm['lon'] = df_osm[df_osm['type'] == "node"]['geometry'].apply(lambda p: p.x)
df_osm['lat'] = df_osm[df_osm['type'] == "node"]['geometry'].apply(lambda p: p.y)
df_osm.drop(['geometry', 'type'], axis=1, inplace=True)

#### Merge the amenity based on coordinates with the corresponding coordinates from the edge_id

In [25]:
# This code will identify the amenities that exist on each edge_id, not all edge_id will have amenities

if add_points_of_interest:
    # Since the same edge_id always has the same coordiantes, we reduce the dataframe to one with only the edge_id with it's corresponding coordinates
    df_edges_coordinates = df[['edge_id', 'lat_north', 'lat_south', 'lon_east', 'lon_west']].copy()
    df_edges_coordinates = df_edges_coordinates.drop_duplicates(subset='edge_id', keep='first')

    # Make a list of the edges that have an amenity to them based on lat,lon conditional
    def is_between(a, x, b):
        return min(a, b) < x < max(a, b)

    edges_dict = []
    for edges_row in df_edges_coordinates.itertuples():
        for osm_row in df_osm.itertuples():
            if is_between(edges_row.lat_south, osm_row.lat, edges_row.lat_north) and is_between(edges_row.lon_west, osm_row.lon, edges_row.lon_east):
                edges_dict.append([edges_row.edge_id, osm_row.amenity])

    # Group by edge_id and get the value counts per amenity
    df_edges_dict = pd.DataFrame(edges_dict, columns = ['edge_id', 'amenity'])
    df_edges_dict = df_edges_dict.groupby('edge_id')['amenity'].value_counts().unstack(fill_value=0).reset_index()

    ### Merge df with counts per amenity with main DF on edge_id
    df = pd.merge(df, df_edges_dict, how="left", on="edge_id")
    osm_columns = list(df_edges_dict.columns[1:])
    df[osm_columns] = df[osm_columns].fillna(value=0)  # <-- Fill missing values, since not all edges have amenities
    df[osm_columns] = df[osm_columns].astype(int)
else:
    print("To add a count of points of interest from OSM, change add_points_of_interest variable to True")

# Quick EDA

In [66]:
columns_df_litters = ['edge_id', 'date_utc', 'month', 'day', 'weekday', 'total_litter']
columns_df_litters.extend(litter_columns)
df_litters = df[columns_df_litters].copy()
aggregation_type = 'sum'
to_agg = {'total_litter' : aggregation_type}
for litter in litter_columns:
    to_agg[litter] = aggregation_type
df_litters_agg = df_litters.groupby(['weekday'], as_index=False).agg(to_agg)

In [69]:
sum_of_litters = df[litter_columns].sum(axis=0)

# Classification Models

## Make the classification output

In [55]:
#### We want half to be clean half dirty for the moment
df['total_litter_ratio'] = [1 if x>10 else 0 for x in df['total_litter']]
df['total_litter_ratio'].value_counts()

0    357032
1    241775
Name: total_litter_ratio, dtype: int64

## Train / Test and Pipelines for Classification Models

In [27]:
columns_to_drop = ['total_litter_ratio', 'date_utc', 'edge_osmid', 'total_litter']
columns_to_drop.extend(litter_columns)
#columns_to_drop.extend(osm_columns)

X = df.drop(columns=columns_to_drop)
y = df['total_litter_ratio']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(X_train.shape, X_test.shape)
X_train.columns

(538926, 44) (59881, 44)


Index(['edge_id', 'osm_highway', 'Year', 'month', 'day', 'weekday',
       'temperature_max', 'temperature_min', 'temperature_mean',
       'precipitation', 'snowfall', 'humidity_max', 'humidity_min',
       'humidity_mean', 'cloud_coverage', 'wind_speed_max', 'wind_speed_min',
       'wind_speed_mean', 'lat_north', 'lat_south', 'lon_east', 'lon_west',
       'edge_length', 'atm', 'bar', 'bench', 'childcare', 'cinema', 'clinic',
       'fast_food', 'fountain', 'fuel', 'ice_cream', 'kindergarten',
       'marketplace', 'nightclub', 'parking', 'recycling', 'school', 'taxi',
       'toilets', 'vending_machine', 'waste_basket', 'waste_disposal'],
      dtype='object')

In [28]:
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
print(f"Categorical features: {categorical_features}")
print(f"Numeric features: {numeric_features}")

Categorical features: ['edge_id', 'osm_highway', 'Year', 'month', 'day', 'weekday']
Numeric features: ['temperature_max', 'temperature_min', 'temperature_mean', 'precipitation', 'snowfall', 'humidity_max', 'humidity_min', 'humidity_mean', 'cloud_coverage', 'wind_speed_max', 'wind_speed_min', 'wind_speed_mean', 'lat_north', 'lat_south', 'lon_east', 'lon_west', 'edge_length', 'atm', 'bar', 'bench', 'childcare', 'cinema', 'clinic', 'fast_food', 'fountain', 'fuel', 'ice_cream', 'kindergarten', 'marketplace', 'nightclub', 'parking', 'recycling', 'school', 'taxi', 'toilets', 'vending_machine', 'waste_basket', 'waste_disposal']


In [29]:
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())
                                      ])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ])

## Models and predictions

In [30]:
classification_scores = {}

### Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

model_logistic_r = LogisticRegression(random_state=7, solver='liblinear')

pipeline_logistic_r = Pipeline(steps=[("pre_process", preprocessor), 
                              ("model", model_logistic_r)
                              ])

In [32]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_logistic_r.fit(X_train, y_train)
y_pred_logistic_r = pipeline_logistic_r.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")

--- 4.113759617010753 minutes ---


In [33]:
# Prediction scores
class_labels = pipeline_logistic_r.named_steps['model'].classes_

print(classification_report(y_test, y_pred_logistic_r))

pd.DataFrame(confusion_matrix(y_test, y_pred_logistic_r), columns=class_labels, index=class_labels)

# Add scores to dictionary
classification_scores['logistic_r'] = round(f1_score(y_test, y_pred_logistic_r, average='weighted'), 3)

              precision    recall  f1-score   support

           0       0.80      0.85      0.82     35700
           1       0.75      0.68      0.72     24181

    accuracy                           0.78     59881
   macro avg       0.77      0.77      0.77     59881
weighted avg       0.78      0.78      0.78     59881



### SVC

In [34]:
from sklearn.svm import LinearSVC

model_svc = LinearSVC(random_state=42)

pipeline_svc = Pipeline([("pre_process", preprocessor), 
                         ("model", model_svc)])

In [35]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_svc.fit(X_train, y_train)
y_pred_svc = pipeline_svc.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")



--- 4.308692884445191 minutes ---


In [36]:
# Prediction scores
class_labels = pipeline_svc.named_steps['model'].classes_

print(classification_report(y_test, y_pred_svc))

pd.DataFrame(confusion_matrix(y_test, y_pred_svc), columns=class_labels, index=class_labels)

# Add scores to dictionary
classification_scores['svc'] = round(f1_score(y_test, y_pred_svc, average='weighted'), 3)

              precision    recall  f1-score   support

           0       0.80      0.84      0.82     35700
           1       0.75      0.69      0.72     24181

    accuracy                           0.78     59881
   macro avg       0.77      0.76      0.77     59881
weighted avg       0.78      0.78      0.78     59881



## Models Summary

In [43]:
classification_result = pd.DataFrame({
    "Model" : classification_scores.keys(),
    'F1-Score': classification_scores.values()})
classification_result

Unnamed: 0,Model,F1-Score
0,logistic_r,0.779
1,svc,0.778


## XGBoost and Feature Importance

In [44]:
from xgboost import XGBClassifier

categorical_transformer_xgb = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
numeric_transformer_xgb = Pipeline(steps=[("knn_imputer", KNNImputer(n_neighbors=5))
                                      ])
preprocessor_xgb = ColumnTransformer(transformers=[("num", numeric_transformer_xgb, numeric_features),
                                               ("cat", categorical_transformer_xgb, categorical_features)
                                               ])

xgb = XGBClassifier(random_state=42)

pipeline_xgb = Pipeline([("pre_process", preprocessor_xgb),
                         ("model", xgb)])

In [46]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_xgb.fit(X_train, y_train)
y_pred_xgb = pipeline_xgb.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")

--- 1.010558044910431 minutes ---


In [47]:
class_labels = pipeline_xgb.named_steps['model'].classes_

print(classification_report(y_test, y_pred_xgb))

pd.DataFrame(confusion_matrix(y_test, y_pred_xgb),
             columns=class_labels, index=class_labels)

              precision    recall  f1-score   support

           0       0.74      0.84      0.79     35700
           1       0.71      0.56      0.63     24181

    accuracy                           0.73     59881
   macro avg       0.72      0.70      0.71     59881
weighted avg       0.73      0.73      0.72     59881



Unnamed: 0,0,1
0,30044,5656
1,10550,13631


In [56]:
xgb_model = pipeline_xgb['model']
num_features = list(pipeline_xgb['pre_process'].transformers_[0][2])
cat_features = list(pipeline_xgb['pre_process'].transformers_[1][1]['onehot'].get_feature_names(categorical_features))
feature_cols = num_features + cat_features
xgb_importances = pd.DataFrame({"feature": feature_cols, "importance": np.round(xgb_model.feature_importances_, 3)})
xgb_importances = xgb_importances[~xgb_importances['feature'].isin(cat_features)]
xgb_importances = xgb_importances.sort_values("importance", ascending=False).set_index("feature")
xgb_importances.head(5)




Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
temperature_min,0.111
edge_length,0.097
temperature_mean,0.052
wind_speed_mean,0.045
vending_machine,0.03


# Regression Models

## Train / Test and Pipelines for Regression Models

In [56]:
columns_to_drop = ['total_litter_ratio', 'date_utc', 'edge_osmid', 'total_litter', 'lat_north', 'lat_south', 'lon_east', 'lon_west','temperature_max', 'temperature_min', 'temperature_mean']

columns_to_drop.extend(litter_columns)
columns_to_drop.extend(osm_columns)

X = df.drop(columns=columns_to_drop)
y = df['total_litter']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(X_train.shape, X_test.shape)
X_train.head(1)

(538926, 16) (59881, 16)


Unnamed: 0,edge_id,osm_highway,Year,month,day,weekday,precipitation,snowfall,humidity_max,humidity_min,humidity_mean,cloud_coverage,wind_speed_max,wind_speed_min,wind_speed_mean,edge_length
334934,"(273223783, 273223789, 0)",residential,2022,4,12,Tuesday,0.0,0.0,66.0,31.0,46.2,39.2,18.2,6.0,13.9,201


In [57]:
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
print(f"Categorical features: {categorical_features}")
print(f"Numeric features: {numeric_features}")

Categorical features: ['edge_id', 'osm_highway', 'Year', 'month', 'day', 'weekday']
Numeric features: ['precipitation', 'snowfall', 'humidity_max', 'humidity_min', 'humidity_mean', 'cloud_coverage', 'wind_speed_max', 'wind_speed_min', 'wind_speed_mean', 'edge_length']


In [58]:
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())
                                      ])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ])

## Models and Predictions

In [59]:
regression_scores = {}

### Linear Regression

In [53]:
model_linear_r =  LinearRegression()

pipeline_linear_r = Pipeline(steps=[("pre_process", preprocessor), 
                              ("model", model_linear_r)
                              ])

In [54]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_linear_r.fit(X_train, y_train)
y_pred_linear_r = pipeline_linear_r.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")

--- 0.6465527057647705 minutes ---


In [55]:
from sklearn import metrics

print('MAE', metrics.mean_absolute_error(y_test, y_pred_linear_r))
print('MSE', metrics.mean_squared_error(y_test, y_pred_linear_r))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred_linear_r)))
print('R2 Score', metrics.r2_score(y_test, y_pred_linear_r))

# Add scores to dictionary
regression_scores['linear_r'] = round(metrics.r2_score(y_test, y_pred_linear_r), 3)

MAE 32.68097113239258
MSE 6890.85798237662
RMSE 83.01119191034797
R2 Score 0.28299151720442406


### Poisson Regression

In [60]:
from sklearn import linear_model

model_poisson = linear_model.PoissonRegressor()

pipeline_poisson = Pipeline(steps=[
                              ("pre_process", preprocessor),
                              ("poisson_model", model_poisson)
                              ])

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, classification_report

grid_search_poisson = {'poisson_model__alpha' : [1e-10],
                       'poisson_model__max_iter' : [1000],
                        }

model_poisson = GridSearchCV(estimator=pipeline_poisson, param_grid=grid_search_poisson,
                             scoring='neg_mean_poisson_deviance', verbose=5, n_jobs=-1)

In [62]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

model_poisson.fit(X_train,y_train)
predictionforest = model_poisson.best_estimator_.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")

Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


[CV 2/5] END poisson_model__alpha=1e-10, poisson_model__max_iter=1000;, score=-34.722 total time= 2.2min
[CV 1/5] END poisson_model__alpha=1e-10, poisson_model__max_iter=1000;, score=-35.363 total time= 2.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


[CV 3/5] END poisson_model__alpha=1e-10, poisson_model__max_iter=1000;, score=-35.725 total time= 2.2min
[CV 4/5] END poisson_model__alpha=1e-10, poisson_model__max_iter=1000;, score=-35.707 total time= 2.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


[CV 5/5] END poisson_model__alpha=1e-10, poisson_model__max_iter=1000;, score=-36.683 total time= 2.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


--- 3.6699963172276813 minutes ---


In [61]:
best_model_poisson = model_poisson.best_estimator_
y_pred_poisson = best_model_poisson.predict(X_test)
y_pred_poisson = y_pred_poisson.astype(int)
print()




In [62]:
from sklearn import metrics

print('MAE', metrics.mean_absolute_error(y_test, y_pred_poisson))
print('MSE', metrics.mean_squared_error(y_test, y_pred_poisson))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred_poisson)))
print('R2 Score', metrics.r2_score(y_test, y_pred_poisson))
print('D2 Score', best_model_poisson.score(X_test, y_test))

# Add scores to dictionary
regression_scores['poisson'] = round(best_model_poisson.score(X_test, y_test), 3)

MAE 25.103672283362002
MSE 5902.256224845944
RMSE 76.82614284763973
R2 Score 0.38585764041708615
D2 Score 0.6089607957836356


### Random Forest

In [63]:
# TODO: Add grid and parameter tunning

from sklearn.ensemble import RandomForestRegressor

rfr_model =  RandomForestRegressor(n_estimators=20,
                             max_depth=10,
                             criterion='mse',
                            )

pipeline_rfr = Pipeline(steps=[("pre_process", preprocessor), 
                               ("model", rfr_model)
                              ])

In [64]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_rfr.fit(X_train, y_train)
pred_rfr = pipeline_rfr.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")



--- 1.9101166685422262 minutes ---


In [65]:
from sklearn import metrics

print('MAE', metrics.mean_absolute_error(y_test, pred_rfr))
print('MSE', metrics.mean_squared_error(y_test, pred_rfr))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, pred_rfr)))
print('R2 Score', metrics.r2_score(y_test, pred_rfr))

MAE 32.67003320057788
MSE 7988.979951550463
RMSE 89.3810939267945
R2 Score 0.16872958218043888
