## We predict the amount of trash for the next season for each beach.

##Machine Learning


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [2]:
#mount drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
#read the merged file
df = pd.read_csv("/content/cleaned_data.csv")

In [4]:
df.head(5)

Unnamed: 0,Beach_Code,Original_Beach_Name,API_Beach_Name,rating,user_ratings_total,lat,lon,Date of collection,Period,length,Trash amount,Beach was cleaned,The last time that beach has been cleaned,The last time that was cleaned from another group,further from the reference point above 100 meters,Trovato un animale abbandonato o morto,Numero di animali abbandonati o morti,next_season_trash
0,40,Aciddara,Spiaggia Aciddara,3.5,357,38.080366,13.537345,12/05/2025,Spring 2025,14953,1794.0,True,,False,False,True,2.0,236.0
1,40,Aciddara,Spiaggia Aciddara,3.5,357,38.080366,13.537345,13/07/2025,Summer 2025,14953,236.0,True,12/04/2024,False,False,False,,552.0
2,40,Aciddara,Spiaggia Aciddara,3.5,357,38.080366,13.537345,30/12/2025,Winter 2025-2026,14953,552.0,False,13/07/2025,False,True,False,0.0,
3,149,Bagni della Regina Giovanna - Sorrento (Na),Capo di Sorrento,4.7,153,40.633559,14.351224,14/12/2025,Winter 2025-2026,939,738.0,True,,False,False,True,1.0,
4,123,Boschetto Steccato di Cutro,Spiaggia Libera Steccato di Cutro,4.4,279,38.934628,16.931957,09/07/2025,Summer 2025,3521,418.0,True,18/07/2025,False,False,False,,181.0


In [5]:
#Bring the names into the same consistency
df["Period"] = df["Period"].replace({
"Winter 2024-2025" : "Winter 2024",
"Winter 2025-2026" : "Winter 2025"  })

# Strip column names
df.columns = df.columns.str.strip()

In [6]:
# Split the season and the Year so you can have more temporal features
df['Season'] = df["Period"].str.split().str[0]

In [7]:
df["Year"] = df['Period'].str.split().str[1].astype(int)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 62 to 46
Data columns (total 21 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Beach_Code                                         189 non-null    int64  
 1   Original_Beach_Name                                189 non-null    object 
 2   API_Beach_Name                                     189 non-null    object 
 3   rating                                             189 non-null    float64
 4   user_ratings_total                                 189 non-null    int64  
 5   lat                                                189 non-null    float64
 6   lon                                                189 non-null    float64
 7   Date of collection                                 189 non-null    object 
 8   Period                                             189 non-null    object 
 9   length         

In [35]:

# Convert numeric columns that might have comma as decimal separator
# Replace 'Trash amount' with any other numeric columns that might have this issue
if 'Trash amount' in df.columns:
    df['Trash amount'] = df['Trash amount'].astype(str).str.replace(',', '.').astype(float)

df['length'] = df['length'].astype(str).str.replace(',', '.').astype(float)

In [36]:
#define a season order and sort temporally
season_order = {"Winter":0, "Spring":1, "Summer": 2, "Autumn":3}
df["Season_order"] = df["Season"].map(season_order)
df = df.sort_values(["Beach_Code", "Year", "Season_order"])

In [37]:
#create next_season_trash which will be our target variable
df["next_season_trash"] = (
    df.groupby("Beach_Code")["Trash amount"]
      .shift(-1)
)

In [38]:
df_model = df.dropna(subset=["next_season_trash"])

In [39]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110 entries, 62 to 47
Data columns (total 21 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Beach_Code                                         110 non-null    int64  
 1   Original_Beach_Name                                110 non-null    object 
 2   API_Beach_Name                                     110 non-null    object 
 3   rating                                             110 non-null    float64
 4   user_ratings_total                                 110 non-null    int64  
 5   lat                                                110 non-null    float64
 6   lon                                                110 non-null    float64
 7   Date of collection                                 110 non-null    object 
 8   Period                                             110 non-null    object 
 9   length         

In [42]:
df_model = df_model.copy()
df_model["current_trash"] = df_model["Trash amount"]

In [43]:
next_season_map = {
    "Winter": "Spring",
    "Spring": "Summer",
    "Summer": "Autumn",
    "Autumn": "Winter"
}

df_model["Next_Season"] = df_model["Season"].map(next_season_map)


In [44]:
X = df_model.drop(columns=["Numero di animali abbandonati o morti", "Trovato un animale abbandonato o morto", "next_season_trash"
, "further from the reference point above 100 meters", "The last time that beach has been cleaned", "Period",
                              "lon","lat", "API_Beach_Name", "Original_Beach_Name", "Date of collection", "Trash amount"])

In [45]:
## Once hot encoding
X = pd.get_dummies(
    X,
    columns=["Season", "Next_Season"],
    drop_first=True
)

In [46]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110 entries, 62 to 47
Data columns (total 15 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Beach_Code                                         110 non-null    int64  
 1   rating                                             110 non-null    float64
 2   user_ratings_total                                 110 non-null    int64  
 3   length                                             110 non-null    float64
 4   Beach was cleaned                                  110 non-null    bool   
 5   The last time that was cleaned from another group  110 non-null    bool   
 6   Year                                               110 non-null    int64  
 7   Season_order                                       110 non-null    int64  
 8   current_trash                                      110 non-null    float64
 9   Season_Spring  

In [47]:
y = df_model["next_season_trash"]


###Train the model


In [48]:
train_mask = df_model["Year"] < 2025
test_mask  = df_model["Year"] == 2025

X_train = X[train_mask]
X_test  = X[test_mask]

y_train = y[train_mask]
y_test  = y[test_mask]


In [49]:
from sklearn.ensemble import RandomForestRegressor


model = RandomForestRegressor(
    n_estimators=300,        # enough, more won’t help
    max_depth=6,             # ⬅️ key change
    min_samples_leaf=8,      # ⬅️ prevents memorization
    max_features="sqrt",
    random_state=42
)


model.fit(X_train,y_train)

###Evaluate the model


In [51]:
from sklearn.metrics import mean_absolute_error, r2_score

pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test,pred) )
print("R2: ", r2_score(y_test,pred))

MAE:  545.2084342696187
R2:  0.0253188027014708


In [54]:
# Create a DataFrame to compare predictions with actual values
results = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': pred
})

# Add beach and season info for context
results['Beach_Code'] = df_model.loc[test_mask, 'Beach_Code'].values
results['Year'] = df_model.loc[test_mask, 'Year'].values
results['Season'] = df_model.loc[test_mask, 'Season'].values

# Show first 10-20 predictions
print("\n=== First 20 Predictions ===")
print(results.head(20))


=== First 20 Predictions ===
    Actual   Predicted  Beach_Code  Year  Season
0     91.0  514.022630          24  2025  Spring
1     67.0  538.198108          24  2025  Summer
2   1036.0  772.582274          25  2025  Spring
3   2760.0  796.757752          25  2025  Summer
4    214.0  789.084120          30  2025  Summer
5    367.0  721.621655          31  2025  Winter
6    213.0  664.242568          31  2025  Summer
7    281.0  570.347006          34  2025  Summer
8    230.0  547.875802          36  2025  Spring
9    195.0  705.022201          39  2025  Winter
10  1794.0  703.818441          40  2025  Winter
11   236.0  733.887668          40  2025  Spring
12    42.0  514.898540          45  2025  Winter
13   139.0  577.349514          50  2025  Summer
14    69.0  526.209496          51  2025  Autumn
15   256.0  551.619212          54  2025  Spring
16    23.0  511.674477          55  2025  Spring
17    27.0  528.497641          55  2025  Summer
18    21.0  529.726788          55  202

In [55]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n=== Top 10 Most Important Features ===")
print(feature_importance.head(10))


=== Top 10 Most Important Features ===
               feature  importance
8        current_trash    0.305556
0           Beach_Code    0.171296
2   user_ratings_total    0.157407
3               length    0.134259
7         Season_order    0.078704
1               rating    0.055556
12  Next_Season_Spring    0.041667
11       Season_Winter    0.037037
10       Season_Summer    0.018519
4    Beach was cleaned    0.000000


In [61]:
#implement a naive trendmodel
baseline_pred = df_model.loc[test_mask, "current_trash"]
baseline_mae = mean_absolute_error(y_test, baseline_pred)

print("Baseline MAE:", baseline_mae)


Baseline MAE: 425.9390243902439



**What went good**

The file hold a working Machine Learning Pipeline which predicts for each beach the amount of the trash for the next season.

---

**What went bad**

The dataset has a small amount of features and observation points as result the model has a high Bias. The models generalization is poor.

---

**What could have done better**

From the naive trendmodel the MAE is less than Random Forest as a result models that work better with trends could significally return better results.
