In [1]:
# Load datasets if working on Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials

    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

    file_id = '...'
    downloaded = drive.CreateFile({'id':file_id})
    downloaded.FetchMetadata(fetch_all=True)
    downloaded.GetContentFile(downloaded.metadata['title'])

    f = open("V2data_6mounts2022.csv.zip", "wb")
    f.write(downloaded.content.getbuffer())
    f.close()

    !unzip V2data_6mounts2022.csv.zip

In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
import json 
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

import osmnx as ox

sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

## Dataset Import

In [3]:
if IN_COLAB: df_main_url = '/content/data.csv'
else: df_main_url = '../../data/datav2.csv'
df_main = pd.read_csv(df_main_url, dtype = {'place.id': object})

### Drop unneeded columns and rename columns to naming convention used

In [6]:
not_needed_columns = ['value.Vehicle_Mode', 'speed', '_id',]
df_main.drop(not_needed_columns, axis=1, inplace=True, errors='ignore')
df_main.rename(columns = {'suitcase.id':'suitcase_id', 'date.utc':'date_utc', 
                    'edge.id':'edge_id', 'edge.osmid':'edge_osmid', 
                    'place.id':'place_id', 'osm.highway':'osm_highway'}, inplace=True, errors='ignore')

### Make a copy of the dataframe to work with

In [7]:
df = df_main.copy()
# df_test = df[~df['place.id'].isna()]

## Data Cleaning

In [8]:
litter_columns = []
for item in df.columns.to_list():
    if item.isdigit(): litter_columns.append(item) 

def clean_df(df_arg, mapping):
    df = df_arg
    df = df.dropna(subset=[mapping]).copy()
    df.drop('place_id', axis=1, inplace=True)
    df['date_utc'] = pd.to_datetime(df['date_utc']).dt.date
    df[litter_columns] = df[litter_columns].fillna(0)
    df[litter_columns] = df[litter_columns].astype(np.int64) 
    df['total_litter'] = df[litter_columns].sum(axis=1)
    return df

df = clean_df(df, 'edge_id')

## Data Aggregation

In [9]:
aggregation_type = 'sum'
to_agg = {'edge_osmid' : 'first', 'osm_highway' : 'first', 'total_litter' : aggregation_type}
for litter in litter_columns:
    to_agg[litter] = aggregation_type
df = df.groupby(['date_utc', 'edge_id'], as_index=False).agg(to_agg)

## Feauture Creation

### Splitting the date

In [10]:
df['Year'] = pd.DatetimeIndex(df['date_utc']).year.astype(object)
df['month'] = pd.DatetimeIndex(df['date_utc']).month.astype(object)
df['day'] = pd.DatetimeIndex(df['date_utc']).day.astype(object)

### Add weekday

In [11]:
weekdays = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
df['weekday'] = pd.to_datetime(df['date_utc']).dt.date.apply(lambda x: x.weekday())
df['weekday'] = df['weekday'].apply(lambda x: weekdays[x])

### Add holiday

In [12]:
from datetime import datetime
from datetime import timedelta
from holidays import Switzerland

holiday = [holiday for holiday in Switzerland(years=[2021, 2022]).items()]
for day in set(holiday):
    holiday.append(((day[0] + timedelta(days=1)), day[1]))
    holiday.append(((day[0] + timedelta(days=2)), day[1]))

holidays_df = pd.DataFrame(holiday, columns=["date", "holiday"])
holidays_df['holiday'] = holidays_df['holiday'].astype(str)
df['holiday'] = df['date_utc'].apply(lambda x: 1 if x in holidays_df['date'].values else 0)

### Add latitude and longitude

#### Make a dataframe with edge_id and it's latitudes based on geojson file

In [14]:
add_coordinates = True
if add_coordinates:
    if IN_COLAB:
        with urllib.request.urlopen('https://raw.githubusercontent.com/dominik117/cortexia-darkzones-prediction/main/src/data/edges.geojson') as url:
            data = json.loads(url.read().decode())
    else:
        with open('../../src/data/edges.geojson') as f:
            data = json.load(f)
 
    df_edges = pd.DataFrame(data['features'])  # <-- The only column needed
    df_edges = pd.concat([df_edges.drop(['properties'], axis=1), df_edges['properties'].apply(pd.Series)], axis=1)  # <--Explode properties dictionary inside cells
    df_edges.rename(columns = {'id':'edge_id'}, inplace = True)
    # Rearrange BBOX (lat_north, lat_south, lon_east, lon_west)
    def sort_bbox(x):
        lat = sorted([x[1], x[3]], key=float, reverse=True)
        lon = sorted([x[0], x[2]], key=float, reverse=True)
        return lat + lon
    df_edges['bbox'] = df_edges['bbox'].apply(sort_bbox)
    df_coordinates = df_edges[['edge_id', 'bbox']].copy()
    bbox_exploded = pd.DataFrame(df_coordinates["bbox"].to_list(), columns=['lat_north', 'lat_south', 'lon_east', 'lon_west'])
    df_coordinates = pd.concat([df_coordinates, bbox_exploded], axis=1)
    df_coordinates.drop(['bbox'], axis=1, inplace=True)
    df = pd.merge(df, df_coordinates, how="left", on="edge_id")
else:
    print("To add coordinates change the add_coordinates variable to true")

### Add length of edge (street segment)

#### Formula to get distance between two points on Earth

In [15]:
from math import atan, cos, radians, sin, tan, asin, sqrt

def haversine_distance(lat1, lon1, lat2, lon2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    EQUATORIAL_RADIUS = 6378 # Radius of earth in kilometers
    return c * EQUATORIAL_RADIUS

def lamberts_ellipsoidal_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    AXIS_A = 6378137.0
    AXIS_B = 6356752.314245
    EQUATORIAL_RADIUS = 6378137
    flattening = (AXIS_A - AXIS_B) / AXIS_A
    b_lat1 = atan((1 - flattening) * tan(radians(lat1)))
    b_lat2 = atan((1 - flattening) * tan(radians(lat2)))
    sigma = haversine_distance(lat1, lon1, lat2, lon2) / EQUATORIAL_RADIUS
    P_value = (b_lat1 + b_lat2) / 2
    Q_value = (b_lat2 - b_lat1) / 2
    X_numerator = (sin(P_value) ** 2) * (cos(Q_value) ** 2)
    X_demonimator = cos(sigma / 2) ** 2
    X_value = (sigma - sin(sigma)) * (X_numerator / X_demonimator)
    Y_numerator = (cos(P_value) ** 2) * (sin(Q_value) ** 2)
    Y_denominator = sin(sigma / 2) ** 2
    Y_value = (sigma + sin(sigma)) * (Y_numerator / Y_denominator)
    distance = abs(EQUATORIAL_RADIUS * (sigma - ((flattening / 2) * (X_value + Y_value))))
    return int(distance)

df['edge_length'] = df.apply(lambda x: lamberts_ellipsoidal_distance(x.lat_north, x.lon_east, x.lat_south, x.lon_west), axis=1)

### Add weather features

In [17]:
if IN_COLAB: df_weather = pd.read_csv("https://raw.githubusercontent.com/dominik117/cortexia-darkzones-prediction/main/src/data/weather_basel_2021-2022.csv")
else: df_weather = pd.read_csv("../../src/data/weather_basel_2021-2022.csv")
# Data obtained from https://www.meteoblue.com/en/weather/archive/export/basel_switzerland_2661604
df['date_utc'] = pd.to_datetime(df['date_utc']).dt.date
weather_columns = {'location':'date_utc', 'Basel':'temperature_max', 'Basel.1':'temperature_min', 'Basel.2':'temperature_mean', 'Basel.3':'precipitation',
                   'Basel.4':'snowfall', 'Basel.5':'humidity_max', 'Basel.6':'humidity_min', 'Basel.7':'humidity_mean', 'Basel.8':'cloud_coverage',
                   'Basel.9':'wind_speed_max', 'Basel.10':'wind_speed_min', 'Basel.11':'wind_speed_mean'}
df_weather.rename(columns = weather_columns, inplace = True)
df_weather.drop(['Basel.12'], axis=1, inplace=True)
df_weather = df_weather.iloc[9:].copy()  # <-- Rows with metadata
df_weather['date_utc'] = pd.to_datetime(df_weather['date_utc']).dt.date  # <-- Match main df type and name
weather_date = df_weather.pop('date_utc')  # <-- Pop date so it doesn't get converted
df_weather = df_weather.apply(pd.to_numeric)
df_weather = df_weather.round(decimals = 1)
df_weather.insert(0, 'date_utc', weather_date) # <-- Reassign unmodified date to df
df = pd.merge(df, df_weather, how="left", on="date_utc")
df['date_utc'] = df['date_utc'].astype(str)

### Add features from Open Street Maps (OSM)

#### Make the datasets with points of interest

In [18]:
add_points_of_interest = True
# Since we need longitude and latitude to add OSM features, turn it on when adding points of interest if off
if add_coordinates is False and add_points_of_interest:
    add_points_of_interest = True
    add_coordinates = True
    print("WARNING: add_coordinates was turned on since it's needed to aggregate points of interest")
    print(f"add_coordinates = {add_coordinates},  add_points_of_interest = {add_points_of_interest}")

# Extraction of the features on the dictionary from OSM
basel = 'Basel, Basel, Switzerland'
# Features dictionary at https://wiki.openstreetmap.org/wiki/Map_features

tags = {'amenity': ['vending_machine', 'bench', 'bar', 'fast_food', 'ice_cream', 'kindergarten', 'school', 'hospital', 'cinema', 
                    'fountain', 'dog_toilet', 'recycling', 'waste_basket', 'waste_disposal', 'childcare', 'marketplace',
                    'bus_station', 'fuel', 'taxi', 'parking', 'atm', 'clinic', 'nightclub', 'toilets'
                    ]}
amenity = ox.geometries_from_place(basel, tags=tags)
# tags = {'leisure': 'park'}
# leisure = ox.geometries_from_place(basel, tags=tags)

  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


#### Make dataframe with points of interest and their coordinates

In [19]:
df_osm = pd.DataFrame(amenity)  # <-- Convert to DF
df_osm = df_osm[['amenity', 'geometry']].copy()  # <-- Select the only needed columns
df_osm['osm_id'] = df_osm.index.to_numpy()  # <-- Detach the index and assign it to a normal column
df_osm.reset_index(drop=True, inplace=True)  # <-- Drop index
osm_id_exploded = pd.DataFrame(df_osm["osm_id"].to_list(), columns=['type', 'osm'])  # <-- Explode index since it contains two indices
df_osm = pd.concat([df_osm, osm_id_exploded], axis=1) 
df_osm.drop(['osm_id', 'osm'], axis=1, inplace=True)
df_osm = df_osm[df_osm['type'] == 'node']  # <-- Drop Multipoligon points
#### Clean the coordinates from the GeoPandas geometry format to latitude and longitude columns
df_osm['lon'] = df_osm[df_osm['type'] == "node"]['geometry'].apply(lambda p: p.x)
df_osm['lat'] = df_osm[df_osm['type'] == "node"]['geometry'].apply(lambda p: p.y)
df_osm.drop(['geometry', 'type'], axis=1, inplace=True)

#### Merge the amenity based on coordinates with the corresponding coordinates from the edge_id

In [20]:
if add_points_of_interest:
    df_edges_coordinates = df[['edge_id', 'lat_north', 'lat_south', 'lon_east', 'lon_west']].copy()
    df_edges_coordinates = df_edges_coordinates.drop_duplicates(subset='edge_id', keep='first')

    # Make a list of the edges that have an amenity to them based on lat,lon conditional
    def is_between(a, x, b):
        return min(a, b) < x < max(a, b)
    edges_dict = []
    for edges_row in df_edges_coordinates.itertuples():
        for osm_row in df_osm.itertuples():
            if is_between(edges_row.lat_south, osm_row.lat, edges_row.lat_north) and is_between(edges_row.lon_west, osm_row.lon, edges_row.lon_east):
                edges_dict.append([edges_row.edge_id, osm_row.amenity])

    # Group by edge_id and get the value counts per amenity
    df_edges_dict = pd.DataFrame(edges_dict, columns = ['edge_id', 'amenity'])
    df_edges_dict = df_edges_dict.groupby('edge_id')['amenity'].value_counts().unstack(fill_value=0).reset_index()
    df = pd.merge(df, df_edges_dict, how="left", on="edge_id")
    osm_columns = list(df_edges_dict.columns[1:])
    df[osm_columns] = df[osm_columns].fillna(value=0)  # <-- Fill missing values, since not all edges have amenities
    df[osm_columns] = df[osm_columns].astype(int)
else:
    print("To add a count of points of interest from OSM, change add_points_of_interest variable to True")

## Quick EDA

In [21]:
columns_df_litters = ['edge_id', 'date_utc', 'month', 'day', 'weekday', 'total_litter']
columns_df_litters.extend(litter_columns)
df_litters = df[columns_df_litters].copy()
aggregation_type = 'sum'
to_agg = {'total_litter' : aggregation_type}
for litter in litter_columns:
    to_agg[litter] = aggregation_type
df_litters_agg = df_litters.groupby(['weekday'], as_index=False).agg(to_agg)

In [22]:
sum_of_litters = df[litter_columns].sum(axis=0)

## Regression Models

## Train / Test and Pipelines for Regression Models

In [40]:
# Train/ test split
columns_to_drop = ['total_litter', 'total_litter_ratio']
columns_to_drop.extend(litter_columns)
columns_to_drop.extend(osm_columns)

X = df.drop(columns=columns_to_drop, errors='ignore')
y = df['total_litter']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Pipeline
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())
                                      ])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ])

# print(X_train.shape, X_test.shape)
# print(f"Categorical features: {categorical_features}")
# print(f"Numeric features: {numeric_features}")
# X_train.head(1)

## Models and Predictions

### Poisson Regression

In [41]:
model_poisson = PoissonRegressor()

pipeline_poisson = Pipeline(steps=[("pre_process", preprocessor), ("poisson_model", model_poisson)])

grid_search_poisson = {'poisson_model__alpha' : [1e-10],
                       'poisson_model__max_iter' : [500],
                        }

model_poisson = GridSearchCV(estimator=pipeline_poisson, param_grid=grid_search_poisson,
                             scoring='neg_mean_poisson_deviance', verbose=5, n_jobs=-1)

In [42]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

model_poisson.fit(X_train, y_train)

print(f"--- {(time.time() - start_time)/60} minutes ---")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
--- 2.124316159884135 minutes ---


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [43]:
best_model_poisson = model_poisson.best_estimator_
y_pred_poisson = best_model_poisson.predict(X_test)
y_pred_poisson = y_pred_poisson.astype(int)

print('MAE', metrics.mean_absolute_error(y_test, y_pred_poisson))
print('MSE', metrics.mean_squared_error(y_test, y_pred_poisson))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred_poisson)))
print('R2 Score', metrics.r2_score(y_test, y_pred_poisson))
print('D2 Score', best_model_poisson.score(X_test, y_test))

MAE 24.114209849534912
MSE 5588.227100415825
RMSE 74.75444535554942
R2 Score 0.41853304116357004
D2 Score 0.6393622107772697


In [None]:
poisson_scores = []

for litter in litter_columns:
    X = df.drop(columns=columns_to_drop, errors='ignore')
    y = df[litter]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    model_poisson.fit(X_train,y_train)
    predictionforest = model_poisson.best_estimator_.predict(X_test)

    best_model_poisson = model_poisson.best_estimator_
    y_pred_poisson = best_model_poisson.predict(X_test)
    y_pred_poisson = y_pred_poisson.astype(int)

    score = best_model_poisson.score(X_test, y_test)
    poisson_scores.append([litter, score])

for value in poisson_scores:
    value[1] = round(value[1], 4)
from operator import itemgetter
poisson_scores = sorted(poisson_scores, key=itemgetter(1), reverse=True)
df_poisson_scores = pd.DataFrame(poisson_scores, columns=['litter', 'score'])
df_poisson_scores.head(10)

### Poisson Loop Scores

### Linear Regression

In [None]:
model_linear_r =  LinearRegression()
pipeline_linear_r = Pipeline(steps=[("pre_process", preprocessor), ("model", model_linear_r)])

In [None]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_linear_r.fit(X_train, y_train)
y_pred_linear_r = pipeline_linear_r.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")

--- 0.5773236989974976 minutes ---


In [None]:
from sklearn import metrics

print('MAE', metrics.mean_absolute_error(y_test, y_pred_linear_r))
print('MSE', metrics.mean_squared_error(y_test, y_pred_linear_r))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred_linear_r)))
print('R2 Score', metrics.r2_score(y_test, y_pred_linear_r))

MAE 32.686415362403764
MSE 6887.252300769041
RMSE 82.98947102355238
R2 Score 0.2833666960726443


### Random Forest

In [None]:
# TODO: Add grid and parameter tunning

from sklearn.ensemble import RandomForestRegressor

rfr_model =  RandomForestRegressor(n_estimators=20,
                             max_depth=10,
                             criterion='mse',
                            )

pipeline_rfr = Pipeline(steps=[("pre_process", preprocessor), 
                               ("model", rfr_model)
                              ])

In [None]:
start_time = time.time()  # <-- Just to count how long the model takes to predict

pipeline_rfr.fit(X_train, y_train)
pred_rfr = pipeline_rfr.predict(X_test)

print(f"--- {(time.time() - start_time)/60} minutes ---")



--- 1.9101166685422262 minutes ---


In [None]:
from sklearn import metrics

print('MAE', metrics.mean_absolute_error(y_test, pred_rfr))
print('MSE', metrics.mean_squared_error(y_test, pred_rfr))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, pred_rfr)))
print('R2 Score', metrics.r2_score(y_test, pred_rfr))

MAE 32.67003320057788
MSE 7988.979951550463
RMSE 89.3810939267945
R2 Score 0.16872958218043888


## Classification Model

### Make the classification output

In [None]:
#### We want half to be clean half dirty for the moment
df['total_litter_ratio'] = [1 if x>10 else 0 for x in df['total_litter']]
df['total_litter_ratio'].value_counts()

0    357032
1    241775
Name: total_litter_ratio, dtype: int64

### Train / Test and Pipelines for Classification Models

In [None]:
columns_to_drop = ['total_litter_ratio', 'date_utc', 'edge_osmid', 'total_litter']
columns_to_drop.extend(litter_columns)
#columns_to_drop.extend(osm_columns)

X = df.drop(columns=columns_to_drop, errors='ignore')
y = df['total_litter_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())
                                      ])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ])

print(X_train.shape, X_test.shape)
print(f"Categorical features: {categorical_features}")
print(f"Numeric features: {numeric_features}")
X_train.head(1)

(538926, 44) (59881, 44)
Categorical features: ['edge_id', 'osm_highway', 'Year', 'month', 'day', 'weekday']
Numeric features: ['temperature_max', 'temperature_min', 'temperature_mean', 'precipitation', 'snowfall', 'humidity_max', 'humidity_min', 'humidity_mean', 'cloud_coverage', 'wind_speed_max', 'wind_speed_min', 'wind_speed_mean', 'lat_north', 'lat_south', 'lon_east', 'lon_west', 'edge_length', 'atm', 'bar', 'bench', 'childcare', 'cinema', 'clinic', 'fast_food', 'fountain', 'fuel', 'ice_cream', 'kindergarten', 'marketplace', 'nightclub', 'parking', 'recycling', 'school', 'taxi', 'toilets', 'vending_machine', 'waste_basket', 'waste_disposal']


Unnamed: 0,edge_id,osm_highway,Year,month,day,weekday,temperature_max,temperature_min,temperature_mean,precipitation,snowfall,humidity_max,humidity_min,humidity_mean,cloud_coverage,wind_speed_max,wind_speed_min,wind_speed_mean,lat_north,lat_south,lon_east,lon_west,edge_length,atm,bar,bench,childcare,cinema,clinic,fast_food,fountain,fuel,ice_cream,kindergarten,marketplace,nightclub,parking,recycling,school,taxi,toilets,vending_machine,waste_basket,waste_disposal
334934,"(273223783, 273223789, 0)",residential,2022,4,12,Tuesday,24.0,6.3,15.5,0.0,0.0,66.0,31.0,46.2,39.2,18.2,6.0,13.9,47.561804,47.560409,7.600955,7.599665,201,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### SVC [Classification Model]

In [None]:
from sklearn.svm import LinearSVC

model_svc = LinearSVC(random_state=42)
pipeline_svc = Pipeline([("pre_process", preprocessor), 
                         ("model", model_svc)])

start_time = time.time()  # <-- Just to count how long the model takes to predict
pipeline_svc.fit(X_train, y_train)
y_pred_svc = pipeline_svc.predict(X_test)
print(f"--- {round(((time.time() - start_time)/60), 2)} minutes ---")



--- 9.62 minutes ---


In [None]:
# Prediction scores
class_labels = pipeline_svc.named_steps['model'].classes_
print(classification_report(y_test, y_pred_svc))
# pd.DataFrame(confusion_matrix(y_test, y_pred_svc), columns=class_labels, index=class_labels)
# classification_scores['svc'] = round(f1_score(y_test, y_pred_svc, average='weighted'), 3)

              precision    recall  f1-score   support

           0       0.80      0.84      0.82     35700
           1       0.75      0.69      0.72     24181

    accuracy                           0.78     59881
   macro avg       0.77      0.76      0.77     59881
weighted avg       0.78      0.78      0.78     59881

