In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, date
import calendar
import holidays

import plotly.express as px
import plotly.graph_objects as go

sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)

In [None]:
df_main = pd.read_csv('../../data/datav2.csv')
# df_main = pd.read_csv('/content/data.csv')
df_main.shape

In [45]:
# df = df_main.sample(n=500000)
df = df_main.copy()

In [48]:
from random import randint

def encrypt_edge_id(x):
    if x != x:
        return np.nan
    foo = tuple(x[1:-1].split(', ')[0:2])
    foo1 = randint(10**(len(foo[0])-1), 10**len(foo[0])-1) 
    foo2 = randint(10**(len(foo[1])-1), 10**len(foo[1])-1) 
    return f"({foo1}, {foo2}, 0)"

df["edge.id"] = df["edge.id"].apply(encrypt_edge_id)

## Data Cleaning

### Basic data cleaning

In [49]:
# List of the litter categories columns
categorical_columns = []
for item in df.columns.to_list():
    if item.isdigit(): categorical_columns.append(item) 

# Rename columns
df.rename(columns = {'suitcase.id':'suitcase_id', 'date.utc':'date_utc', 
                     'edge.id':'edge_id', 'edge.osmid':'edge_osmid', 
                     'place.id':'place_id', 'osm.highway':'osm_highway'}, inplace=True)

# Drop empty rows
df = df.dropna(subset=['edge_id']).copy()

# Drop place_id for now
df.drop('place_id', axis=1, inplace=True)

# Convert to datetime and remove the time
df['date_utc'] = pd.to_datetime(df['date_utc']).dt.date

# Drop not needed columns
not_needed_columns = ['value.Vehicle_Mode', 'speed', '_id',]
df.drop(not_needed_columns, axis=1, inplace=True, errors='ignore')

# Fill NaNs with 0 and convert categorical columns to integer
df[categorical_columns] = df[categorical_columns].fillna(0)
df[categorical_columns] = df[categorical_columns].astype(np.int64) 

# Clean and sort edge_id
df['edge_id'] = df['edge_id'].apply(lambda x: tuple(x[1:-1].split(', ')[0:2]))
df['edge_id'] = df['edge_id'].apply(lambda x: sorted(x))  # --> sort it so order doesn't matter
df['edge_id'] = df['edge_id'].apply(lambda x: (', ').join(x))  # --> make it a string

# Add a sum of all the litter
df['total_litter'] = df[categorical_columns].sum(axis=1)

## This next function proves that the original dataframe is not aggregated by edge and date, so we need to do this

In [6]:
this_date = ''
day_edges = []
log = []

for index, row in df.iterrows():
    if row['date_utc'] != this_date:
        this_date = row['date_utc']  # Set the day we will look into for the next iterations
        day_edges = []  # Reset day_edges since it's a new day
    
    if row['edge_id'] not in day_edges:
        day_edges.append(row['edge_id'])
    else:  # This means this edge is repeating in a single day, we want to log this
        log.append([index, this_date, row['edge_id']])

In [7]:
print(f"There are {len(log)} rows that need to be aggregated")

There are 298946 rows that need to be aggregated


### Aggregation based on Edge and Date

In [50]:
# Aggregate based on unique edge_id, date_utc and suitcase_id

to_agg = {'edge_osmid' : 'first',
          #'place_id' : 'first',
          'osm_highway' : 'first',
          'total_litter' : 'sum',
          '1':'sum', '2':'sum', '3':'sum', '4':'sum', '5':'sum', '7':'sum', '8':'sum',
          '10':'sum', '13':'sum', '14':'sum', '16':'sum', '19':'sum', '21':'sum',
          '25':'sum', '26':'sum', '27':'sum', '28':'sum', '29':'sum', '30':'sum', 
          '31':'sum', '32':'sum', '33':'sum', '35':'sum', '36':'sum', '37':'sum',
          '39':'sum', '49':'sum', '61':'sum', '63':'sum',}


df = df.groupby(['date_utc', 'edge_id'], as_index=False).agg(to_agg)

### Correct the naming of the columns
The groupby aggregation causes the names of the colums to have a sub-column, which makes it harder to extract data from them

So the next function merges the resulting 2 columns (if there was aggregation on them) Example, '39''sum' becomes '39_sum'

In [9]:
# # Merge the column names since they have sub-columns because of the groupby aggregation
# column_ravel = df.columns.ravel()
# column_ravel_list = []

# for i, column in enumerate(column_ravel):
#     if column[1] != '': column_ravel_list.append("_".join(column))  # <-- For the ones where there was agg
#     else: column_ravel_list.append(column[0])  # <-- For the ones without agg, just keep the previous name
# df.columns = column_ravel_list  # Rename the DF columns

## Feauture Creation

### Splitting the date

In [51]:
# Splitting date to year, month and day to make it a feauture for modeling
df['Year'] = pd.DatetimeIndex(df['date_utc']).year
df['month'] = pd.DatetimeIndex(df['date_utc']).month
df['day'] = pd.DatetimeIndex(df['date_utc']).day

# Converting it to object so it's treated as a categorical data when I make the pipelines
df['Year'] = df['Year'].astype(object)
df['month'] = df['month'].astype(object)
df['day'] = df['day'].astype(object)

### Adding weekday

In [52]:
weekdays = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
df['weekday'] = df['date_utc'].apply(lambda x: x.weekday())
df['weekday'] = df['weekday'].apply(lambda x: weekdays[x])

### Converting total litter to boolean

We want half to be clean half dirty for the moment

In [53]:
df['total_litter_ratio'] = [1 if x>10 else 0 for x in df['total_litter']]
df['total_litter_ratio'].value_counts()

0    941966
1    375221
Name: total_litter_ratio, dtype: int64

## Train / Test

In [54]:
df.head(1)

Unnamed: 0,date_utc,edge_id,edge_osmid,osm_highway,total_litter,1,2,3,4,5,7,8,10,13,14,16,19,21,25,26,27,28,29,30,31,32,33,35,36,37,39,49,61,63,Year,month,day,weekday,total_litter_ratio
0,2022-01-02,"1026448970, 495182453",766709950.0,secondary,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,1,2,Sunday,0


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer

In [56]:
columns_to_drop = ['total_litter_ratio', 'date_utc', 'edge_osmid', 'total_litter']
columns_to_drop.extend(categorical_columns)

X = df.drop(columns=columns_to_drop)
y = df['total_litter_ratio']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape, X_test.shape

((1185468, 6), (131719, 6))

In [57]:
X_train.head(2)

Unnamed: 0,edge_id,osm_highway,Year,month,day,weekday
117494,"127060096, 547169690",residential,2022,1,20,Thursday
166764,"2452444575, 6310154543",footway,2022,1,27,Thursday


## Pipelines

In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier

In [30]:
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
# categorical_features, numeric_features

In [31]:
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
numeric_transformer = Pipeline(steps=[("scaler", MinMaxScaler())
                                      ])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ])

## PyCaret

In [18]:
# from pycaret.classification import *

# reg = setup(data=pd.concat([X_train, y_train], axis=1),
#             target = 'total_litter',
#             session_id=None, 
#             log_experiment=True,
#             custom_pipeline=preprocessor,
#             fold_shuffle=True
#             # experiment_name='boston1'
# )

## Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=7, solver='liblinear')

pipeline_lr = Pipeline(steps=[("pre_process", preprocessor), 
                              ("model", lr_model)
                              ])

In [33]:
pipeline_lr.fit(X_train, y_train)
y_pred = pipeline_lr.predict(X_test)

In [34]:
class_labels = pipeline_lr.named_steps['model'].classes_

print(classification_report(y_test, y_pred))

pd.DataFrame(confusion_matrix(y_test, y_pred), 
             columns=class_labels, index=class_labels)

              precision    recall  f1-score   support

           0       0.80      0.85      0.82     35700
           1       0.75      0.68      0.71     24019

    accuracy                           0.78     59719
   macro avg       0.77      0.76      0.77     59719
weighted avg       0.78      0.78      0.78     59719



Unnamed: 0,0,1
0,30313,5387
1,7725,16294


## KNN Model

In [36]:
knn = KNeighborsClassifier()

pipeline_knn = Pipeline([("pre_process", preprocessor), 
                         ("model", knn)])

In [37]:
pipeline_knn.fit(X_train, y_train)
y_pred = pipeline_knn.predict(X_test)

In [39]:
class_labels = pipeline_knn.named_steps['model'].classes_

print(classification_report(y_test, y_pred))

pd.DataFrame(confusion_matrix(y_test, y_pred), 
             columns=class_labels, index=class_labels)

              precision    recall  f1-score   support

           0       0.68      0.73      0.71     35700
           1       0.55      0.49      0.52     24019

    accuracy                           0.63     59719
   macro avg       0.62      0.61      0.61     59719
weighted avg       0.63      0.63      0.63     59719



Unnamed: 0,0,1
0,26069,9631
1,12175,11844
