In [None]:
# Install OSMNX only if working on Google Colab
# The check below looks for the 'google.colab' module among the modules already
# loaded in this kernel; outside Colab the whole cell is a no-op.
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # NOTE(review): osmnx is installed without a version pin, while matplotlib is
    # pinned to 3.1.1 -- confirm whether osmnx should be pinned as well.
    !pip install osmnx
    !pip install matplotlib==3.1.1

In [2]:
# Load datasets if working on Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    file_id = '...'
    downloaded = drive.CreateFile({'id':file_id})
    downloaded.FetchMetadata(fetch_all=True)
    downloaded.GetContentFile(downloaded.metadata['title'])
    f = open("V2data_6mounts2022.csv.zip", "wb")
    f.write(downloaded.content.getbuffer())
    f.close()
    !unzip V2data_6mounts2022.csv.zip

Archive:  V2data_6mounts2022.csv.zip
  inflating: data.csv                
  inflating: __MACOSX/._data.csv     


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import sys
import osmnx
import requests
import pickle
import cloudpickle as cp
from urllib.request import urlopen

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # On Colab the helper scripts are not available locally: download each one
    # from GitHub and save it in the working directory.
    import importlib

    r = requests.get('https://raw.githubusercontent.com/dominik117/cortexia-darkzones-prediction/main/src/helper_scripts/data_processor.py')
    r.raise_for_status()  # fail loudly instead of saving an HTTP error page as a module
    with open('data_processor.py', 'w', encoding='utf-8') as f:
        f.write(r.text)
    r = requests.get('https://raw.githubusercontent.com/dominik117/cortexia-darkzones-prediction/main/src/helper_scripts/darkzone_creator.py')
    r.raise_for_status()
    with open('darkzone_creator.py', 'w', encoding='utf-8') as f:
        f.write(r.text)

    # Import only after both files are closed: importing inside the `with`
    # block could read a file whose buffered content has not reached disk yet.
    # invalidate_caches() makes the import system notice the new files.
    importlib.invalidate_caches()
    import data_processor
    import darkzone_creator
else:
    # Local checkout: make the repository's src/ directory importable.
    sys.path.insert(1, '../../src/')
    import helper_scripts.data_processor as data_processor
    import helper_scripts.dz_creator as darkzone_creator

# Display configuration for plots and dataframe previews.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

## Dataset Import

In [3]:
# Choose the CSV location for the current environment (Colab vs. local checkout)
# and load it; 'place.id' is read as object dtype rather than being inferred.
IN_COLAB = 'google.colab' in sys.modules
df_main_url = '/content/data.csv' if IN_COLAB else '../../data/datav2.csv'
df_main = pd.read_csv(
    df_main_url,
    dtype={'place.id': object},
)

In [4]:
from random import randint
def encrypt_edge_id(x):
    """Replace the first two numbers of an edge id string with random ones.

    ``x`` is expected to look like ``"(u, v, k)"``. The surrounding parentheses
    are stripped, the text is split on ``', '`` and each of the first two parts
    is replaced by a random integer with the same number of digits. The third
    component of the result is always the literal ``0``.

    A NaN input is returned as ``np.nan``.

    NOTE(review): the replacement values are random draws, and no seed is set in
    the visible code, so results differ between runs and two different edges
    are not guaranteed to receive different ids.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return np.nan

    def same_width_random(digits):
        # Random integer with exactly as many digits as the given text.
        width = len(digits)
        return randint(10 ** (width - 1), 10 ** width - 1)

    node_ids = x[1:-1].split(', ')
    masked_first = same_width_random(node_ids[0])
    masked_second = same_width_random(node_ids[1])
    return f"({masked_first}, {masked_second}, 0)"

# Build one replacement id per distinct edge id, then rewrite the column through
# that lookup so equal original ids end up with the same replacement.
edges = list(df_main["edge.id"].unique())
encrypted_edges = {edge: encrypt_edge_id(edge) for edge in edges}

df_main["edge.id"] = df_main["edge.id"].map(encrypted_edges)

In [None]:
# Derive the darkzones dataframe from df_main using the project's
# darkzone_creator helper (what a darkzone row contains is defined in that
# helper, which is not shown in this notebook).
df_darkzones = darkzone_creator.create_darkzones(df_main)
# Runtime ~2 minutes

In [5]:
df_main

Unnamed: 0,_id,suitcase.id,date.utc,edge.id,edge.osmid,place.id,osm.highway,1,2,3,4,5,7,8,10,13,14,16,19,21,25,26,27,28,29,30,31,32,33,35,36,37,39,49,61,63
0,RLftkn8B6D05RDJPRFzw,101,2022-03-16 13:31:23.556000,"(66861445, 5777577020, 0)",148827821.0,,secondary,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2rftkn8B6D05RDJPg1zA,101,2022-03-16 13:31:40.397000,"(4784490177, 879290314, 0)",249810956.0,,secondary,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,7Lftkn8B6D05RDJPilxI,84,2022-03-16 13:31:42.588000,"(5024071332, 2468498684, 0)",148744643.0,,footway,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,r_Ttkn8BECWPrCgui1Ov,102,2022-03-16 13:18:31.370000,"(131242325, 366524790, 0)",25371931.0,,footway,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Abftkn8B6D05RDJPkl0B,84,2022-03-16 13:31:44.839000,"(4112634135, 6927774285, 0)",680591900.0,,secondary,0,1,0,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1380547,GdQwSIEB6D05RDJPRgFx,110,2022-06-09 11:18:30.202000,"(8859143893, 7271978688, 0)",218421857.0,,residential,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1380548,HdQwSIEB6D05RDJPRwGz,107,2022-06-09 11:18:34.652000,"(392518067, 121526083, 0)",25307428.0,,tertiary,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1380549,i7MwSIEBjBK6gFa3V62V,105,2022-06-09 11:16:05.803000,"(190115287, 9768290867, 0)",182402103.0,,unclassified,12,0,3,22,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0
1380550,5hHYTIEBECWPrCguIrVd,70,2022-06-10 09:00:06.043000,"(6238636487, 231741978, 0)",794466424.0,,secondary,3,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### Clean and Add Features to Main Dataframe

In [5]:
# Run df_main through the data_processor steps, one after another; each step
# receives the previous step's result.
main_steps = (
    data_processor.clean_df,
    data_processor.aggregate_df,
    data_processor.create_date_features,
    data_processor.create_coordinates_features,
    data_processor.create_weather_features,
    data_processor.create_osm_features,
)
for step in main_steps:
    df_main = step(df_main)

# Kept for later, where these columns are dropped from the darkzones output.
osm_columns = data_processor.create_osm_columns()
# Runtime ~4 minutes

### Add Features to Darkzones Dataframe

In [6]:
# Apply the feature-building steps to df_darkzones in sequence.
darkzone_steps = (
    data_processor.create_date_features,
    data_processor.create_coordinates_features,
    data_processor.create_weather_features,
    data_processor.create_osm_features,
)
for step in darkzone_steps:
    df_darkzones = step(df_darkzones)
# Runtime ~3 minutes

# Poisson Prediction

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import GridSearchCV

## Train Model

In [8]:
def train_test(df, output):
    """Split ``df`` into train/test features and the ``output`` target column.

    The feature matrix is ``df`` without 'total_litter', 'total_litter_ratio'
    and every column reported by ``data_processor.get_litter_columns(df)``
    (columns that are absent are ignored). The target is ``df[output]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding both the feature columns and the target column.
    output : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a 90/10 split with
        ``random_state=42``.
    """
    columns_to_drop = ['total_litter', 'total_litter_ratio']
    columns_to_drop.extend(data_processor.get_litter_columns(df))
    # Work on the dataframe that was passed in, not on the notebook-level
    # df_main, so the function honours its own argument.
    X = df.drop(columns=columns_to_drop, errors='ignore')
    y = df[output]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    return X_train, X_test, y_train, y_test

def train_poisson_model(df, output):
    """Fit a Poisson regression pipeline for one target column and report its score.

    The data is split with ``train_test(df, output)``. Numeric columns
    ('int'/'float' dtypes) are scaled with ``RobustScaler`` and
    object/category columns are one-hot encoded (unknown categories ignored).
    The pipeline is fitted through ``GridSearchCV`` with a single candidate
    (alpha=1e-12, max_iter=500), scored by negative mean Poisson deviance.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the features and the target column.
    output : str
        Name of the target column to model.

    Returns
    -------
    tuple
        ``(best_model_poisson, X_test, y_test, time2train)`` where
        ``time2train`` is the elapsed fitting time in minutes, rounded to one
        decimal place.
    """
    start_time = time.time()  # used to report how long the fitting takes
    X_train, X_test, y_train, y_test = train_test(df, output)

    # Columns whose dtype matches neither selection below are not listed in the
    # ColumnTransformer and therefore do not reach the model.
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
    numeric_transformer = Pipeline(steps=[("scaler", RobustScaler())])
    preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                                   ("cat", categorical_transformer, categorical_features)])
    model_poisson = PoissonRegressor()
    pipeline_poisson = Pipeline(steps=[("pre_process", preprocessor), ("poisson_model", model_poisson)])
    grid_search_poisson = {'poisson_model__alpha' : [1e-12],
                           'poisson_model__max_iter' : [500]}
    model_poisson = GridSearchCV(estimator=pipeline_poisson, param_grid=grid_search_poisson,
                                scoring='neg_mean_poisson_deviance', verbose=7, n_jobs=-1)
    model_poisson.fit(X_train, y_train)
    best_model_poisson = model_poisson.best_estimator_

    # The previous version also predicted on X_test here and discarded the
    # result; that dead work was counted as fitting time and has been removed.
    time2train = round((time.time() - start_time)/60, 1)
    print(f"The fitting took: {time2train} minutes")
    score = best_model_poisson.score(X_test, y_test)
    print(f'Litter {output} D2 Score: {score}')
    print('#################################')
    return best_model_poisson, X_test, y_test, time2train

In [9]:
def make_models(df):
    """Train one Poisson model per litter column of ``df`` plus 'total_litter'.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe passed on to ``train_poisson_model`` for every target.

    Returns
    -------
    tuple
        ``(models, X_test)``. ``models`` maps each target name to the list
        ``[model, y_test, score, time2train]`` with ``score`` rounded to four
        decimals. ``X_test`` is the test feature matrix of the last target
        trained.
    """
    models = {}
    # Read the litter columns from the dataframe that was passed in (not from
    # the notebook-level df_main) and build a new list, so the list returned
    # by the helper is not modified.
    litters = [*data_processor.get_litter_columns(df), 'total_litter']
    for litter in litters:
        model, X_test, y_test, time2train = train_poisson_model(df, litter)
        score = round(model.score(X_test, y_test), 4)
        models[litter] = [model, y_test, score, time2train]
    return models, X_test

## Prediction Models

In [10]:
# Pairs of [litter code, human-readable label]. The code (first element) is
# compared against the keys of the models dictionary when labels are attached
# to the trained models.
# NOTE(review): codes '10' and '25' both carry the label 'FF Cup' -- confirm
# that this is intentional.
litter_labels = [['1', 'Cigarette'],['2', 'Leaf'],['3', 'Leaves'],['4', 'Paper/Carton'],['5', 'CAN'],['7', 'Glass bottle'],['8', 'PET'],['9', 'Carton drink'],
                ['10', 'FF Cup'],['11', 'FF Foam Polystrene'],['12', 'Other Foam Polystrene'],['13', 'Food packaging'],['14', 'Newspaper'],['15', 'Small bag'],
                ['16', 'Glass Splinter'],['17', 'Syringe'],['18', 'Organic food littering'],['19', 'Dog fouling'],['21', 'Garbage bags'],['22', 'Sand/Grit/Granulate'],
                ['23', 'Chewing- gum'],['24', 'Vomit'],['25', 'FF Cup'],['26', 'FF Lid'],['27', 'FF Straw'],['28', 'FF Fries cartin'],['29', 'Unclear bottles'],
                ['30', 'FF Burger Box'],['31', 'FF Paper'],['32', 'FF Other Paper'],['33', 'iQos'],['34', 'Confettis (pile)'],['35', 'Medium/big stain'],
                ['36', 'Transparent plastic'],['37', 'Opaque plastic'],['38', 'Fabric'],['39', 'Unrecognizable'],['40', 'Capsule'],['41', 'Carcass'],['42', 'Furniture'],
                ['43', 'Tag'],['44', 'Poster'],['45', 'Waste bin stain'],['46', 'Waste bin tag'],['47', 'Waste bin sticker'],['48', 'Waste bin Ouverture'],['49', 'Waste bin'],
                ['50', 'Cigarette white'],['51', 'Cigarette rolled'],['52', 'Cigarette unknown'],['53', 'Waste container too full'],['54', 'Illegal advertising poster'],
                ['55', 'Illegal advertising poster (influenceable)'],['56', 'Illegal litters'],['57', 'Spray painting, graffiti'],['58', 'Spray painting, graffiti (influenceable)'],
                ['59', 'Feuille mouillée'],['60', 'Poubelles remplies'],['61', 'Robydog'],['62', 'Wooden or plastic crate'],['63', 'Mask'],['total_litter', 'Total Litter']]

In [11]:
models, X_test = make_models(df_main)

# Append the human-readable label to each model entry whose key has one.
label_by_code = dict(litter_labels)
for litter_code, model_entry in models.items():
    if litter_code in label_by_code:
        model_entry.append(label_by_code[litter_code])
# Runtime ~3 minutes per litter and ~60 minutes for all litters

Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.3 minutes
Litter 1 D2 Score: 0.5946284324918871
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.2 minutes
Litter 2 D2 Score: 0.6701196870573678
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.2 minutes
Litter 3 D2 Score: 0.45336086087467586
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.2 minutes
Litter 4 D2 Score: 0.655981500570578
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 5 D2 Score: 0.38622222769595016
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.0 minutes
Litter 7 D2 Score: 0.25838683279754615
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 8 D2 Score: 0.3913179975633959
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.2 minutes


  return 1 - dev / dev_null


Litter 10 D2 Score: -inf
#################################


  return 1 - dev / dev_null


Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.2 minutes
Litter 13 D2 Score: -0.0014397161272337033
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 1.7 minutes
Litter 14 D2 Score: 0.3038886349666077
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.0 minutes
Litter 16 D2 Score: 0.3575343251830517
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 19 D2 Score: 0.47143165426680755
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 21 D2 Score: 0.6725325156429542
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.3 minutes
Litter 25 D2 Score: 0.1069621240851093
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.3 minutes
Litter 26 D2 Score: 0.018310039741767592
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.9 minutes
Litter 27 D2 Score: 0.1820336649710017
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.2 minutes
Litter 28 D2 Score: 0.019667002305008507
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.2 minutes
Litter 29 D2 Score: 0.0031808852106278707
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.3

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 33 D2 Score: 0.46232633801295375
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 35 D2 Score: 0.4244996034567631
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The fitting took: 0.5 minutes
Litter 36 D2 Score: 0.2431812447163354
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 37 D2 Score: 0.2716093643647055
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 39 D2 Score: 0.45425624448094104
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 49 D2 Score: 0.36318917240034054
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.1 minutes
Litter 61 D2 Score: 0.47224833193263804
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.0 minutes
Litter 63 D2 Score: 0.38381506919498276
#################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


The fitting took: 2.3 minutes
Litter total_litter D2 Score: 0.6349679534877858
#################################


In [12]:
from sklearn.model_selection import train_test_split
def train_test(df, output):
    """Split ``df`` into train/test features and the ``output`` target column.

    This repeats the ``train_test`` definition from the model-training section
    of the notebook. The feature matrix is ``df`` without 'total_litter',
    'total_litter_ratio' and every column reported by
    ``data_processor.get_litter_columns(df)``; the target is ``df[output]``.

    Returns ``(X_train, X_test, y_train, y_test)`` from a 90/10 split with
    ``random_state=42``.
    """
    columns_to_drop = ['total_litter', 'total_litter_ratio']
    columns_to_drop.extend(data_processor.get_litter_columns(df))
    # Use the dataframe argument rather than the notebook-level df_main.
    X = df.drop(columns=columns_to_drop, errors='ignore')
    y = df[output]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    return X_train, X_test, y_train, y_test

# Recreate the split for litter '1' so the train/test frames are available as
# notebook-level variables.
X_train, X_test, y_train, y_test = train_test(df_main, '1')

# Predict Darkzones

In [13]:
# Litter columns present in df_darkzones at this point, captured before the
# loop below adds or overwrites any column.
litter_columns = data_processor.get_litter_columns(df_darkzones)

# Each entry of `models` is a list whose first element is the fitted pipeline.
# Its predictions are rounded to the nearest integer and stored in df_darkzones
# under the litter's key.
# NOTE(review): df_darkzones gains (or has overwritten) one column per
# iteration and is passed to predict() as a whole each time; this presumably
# relies on the pipeline selecting its input columns by name -- confirm.
for key, model in models.items():
    predictions = model[0].predict(df_darkzones)
    predictions = np.rint(predictions).astype(int)
    df_darkzones[f"{key}"] = predictions

# Remove the date, coordinate, weather and OSM feature columns from the result;
# names that are not present are ignored.
columns_to_drop = ['Year', 'month', 'day', 'weekday', 'holiday', 'lat_north', 'lat_south', 'lon_east', 'lon_west', 'edge_length', 
        'temperature_max', 'temperature_min', 'temperature_mean', 'precipitation', 'snowfall', 'humidity_max', 'humidity_min', 
        'humidity_mean', 'cloud_coverage', 'wind_speed_max', 'wind_speed_min', 'wind_speed_mean']
columns_to_drop.extend(osm_columns)
df_darkzones.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')
# The column written by the 'total_litter' model becomes 'predicted_total'.
df_darkzones.rename(columns={'total_litter': 'predicted_total'}, inplace=True)
# Row-wise sum over the litter columns captured at the top of this cell.
# NOTE(review): any of those columns that is also a key of `models` was
# overwritten with predictions in the loop above, so confirm that
# 'actual_total' really reflects observed counts.
df_darkzones['actual_total'] = df_darkzones[litter_columns].sum(axis=1)

# Export Files

In [18]:
import pickle

# Serialize the dictionary of trained models to disk.
with open('models_dictionary.pkl', 'wb') as models_file:
    pickle.dump(models, models_file)

# To restore the dictionary later (only unpickle files you trust):
# with open('models_dictionary.pkl', 'rb') as models_file:
#     models = pickle.load(models_file)

In [19]:
# Save the held-out feature rows to X_test.csv in the working directory.
X_test.to_csv("X_test.csv")