In [25]:
import pandas as pd
import numpy as np

In [2]:
# Load the fisheries panel CSV; the path is relative to the notebook's
# working directory.
DATA_FILE = "FINAL_fisheries_panel_dataset_2013_2022_NO_PRODUCTIVITY.csv"
df = pd.read_csv(DATA_FILE)


Linear regression cannot handle missing (NaN) values, so they have to be removed before fitting. To decide how to do that, I first looked at which columns have the highest share of NaN values.

In [15]:
# Share of missing values per column (mean of the boolean NaN mask),
# ordered from the most incomplete column to the most complete one.
nan_ratio = df.isna().mean()
nan_ratio = nan_ratio.sort_values(ascending=False)
print(nan_ratio)

fleet_size                      0.907843
fishing_patents                 0.860784
country                         0.823529
aquaculture_employment          0.294118
total_aquaculture_tonnes        0.294118
gdp                             0.137255
population                      0.137255
fish_catches                    0.000000
total_patents                   0.000000
iso3                            0.000000
year                            0.000000
fisheries_exports               0.000000
extreme_temperature_exposure    0.000000
fisheries_imports               0.000000
coastal_country                 0.000000
fao_fisheries_member            0.000000
dtype: float64


In [16]:
# Remove the three sparsest columns (each is missing in more than 80% of the
# rows according to the NaN ratios printed by the previous cell).
# errors="ignore" keeps this cell re-runnable: `df` is reassigned from itself,
# so without it a second execution raises KeyError because the columns are
# already gone.
df = df.drop(
    columns=["fleet_size", "fishing_patents", "country"],
    errors="ignore",
)

In [17]:
# Columns that must be present for a row to be usable: the regression target
# plus every candidate feature. Columns not listed here may still contain NaN.
required_columns = [
    "total_aquaculture_tonnes",
    "gdp",
    "population",
    "fish_catches",
    "fisheries_exports",
    "fisheries_imports",
    "extreme_temperature_exposure",
]
df_ml = df.dropna(subset=required_columns)

# Number of complete rows left for modelling.
print(len(df_ml))


360


In [18]:
# Explanatory variables for the baseline model (left on their original scale
# here; standardization happens after the train/test split).
feature_columns = [
    "gdp",
    "population",
    "fish_catches",
    "fisheries_exports",
    "fisheries_imports",
    "extreme_temperature_exposure",
]
X = df_ml[feature_columns]

# Target: aquaculture tonnage transformed with log(1 + x), so every metric
# computed on `y` is expressed on that log scale.
y = np.log1p(df_ml["total_aquaculture_tonnes"])


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the data has `iso3` and `year` columns, so rows look like
# country-year observations. A random row split may then put the same country
# in both the training and the test set — confirm whether a grouped split
# would be more appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Estimate the scaling statistics on the training rows only, then apply the
# same transformation to both partitions so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ordinary least squares on the standardized features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)


In [20]:
from sklearn.metrics import mean_squared_error, r2_score

# Predictions for the held-out rows, on the same log1p scale as `y`.
y_pred = lr.predict(X_test_scaled)

# Root of the mean squared error and the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R²:", r2)


RMSE: 1.6612107417138489
R²: 0.5437542914437394


In [21]:
# Pair every feature with its fitted coefficient. The model was trained on
# standardized inputs, so each value is the change in the target per one
# standard deviation of that feature.
coef_df = pd.DataFrame({"feature": X.columns, "coefficient": lr.coef_})

# Rank by magnitude, ignoring the sign, largest first.
coef_df = coef_df.sort_values(
    by="coefficient",
    key=lambda column: column.abs(),
    ascending=False,
)

print(coef_df)


                        feature  coefficient
1                    population     2.134305
0                           gdp    -1.637260
3             fisheries_exports     1.095056
5  extreme_temperature_exposure     0.435464
2                  fish_catches     0.148201
4             fisheries_imports     0.119681


In [22]:
# Pairwise correlation of every feature with gdp, strongest positive first;
# used to check for collinearity among the predictors.
gdp_corr = X.corr()["gdp"]
print(gdp_corr.sort_values(ascending=False))


gdp                             1.000000
population                      0.921406
fisheries_imports               0.894125
fisheries_exports               0.266093
extreme_temperature_exposure    0.163360
fish_catches                   -0.027546
Name: gdp, dtype: float64


Alternative Linear Model (excluding `population`)

In [23]:
# Same specification as the first model, but without `population`.
X_alt = df_ml[
    [
        "gdp",
        "fish_catches",
        "fisheries_exports",
        "fisheries_imports",
        "extreme_temperature_exposure"
    ]
]

# Every intermediate gets an `_alt` name so this cell does not overwrite the
# first model's split, scaler and scaled matrices. Reusing the original names
# would leave `X_test_scaled` with 5 columns, and re-running the first model's
# evaluation cell would then fail against a model fitted on 6 features.
# test_size and random_state match the first split, so both models are
# evaluated on the same held-out rows.
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    X_alt, y, test_size=0.2, random_state=42
)

# Scaling statistics come from the training rows only.
scaler_alt = StandardScaler()
X_train_alt_scaled = scaler_alt.fit_transform(X_train_alt)
X_test_alt_scaled = scaler_alt.transform(X_test_alt)

lr_alt = LinearRegression()
lr_alt.fit(X_train_alt_scaled, y_train_alt)

y_pred_alt = lr_alt.predict(X_test_alt_scaled)

# Same metrics as for the first model, on the log1p scale of `y`.
rmse_alt = np.sqrt(mean_squared_error(y_test_alt, y_pred_alt))
r2_alt = r2_score(y_test_alt, y_pred_alt)

print("ALT RMSE:", rmse_alt)
print("ALT R²:", r2_alt)


ALT RMSE: 1.9046189824377184
ALT R²: 0.40025650248031364


In [None]:
# Dropping population reduces explanatory power substantially: test R² falls from about 0.54 to 0.40 and RMSE rises from about 1.66 to 1.90.

In [24]:
# Pair every feature of the reduced model with its fitted coefficient
# (inputs were standardized before fitting).
coef_alt = pd.DataFrame({"feature": X_alt.columns, "coefficient": lr_alt.coef_})

# Rank by magnitude, ignoring the sign, largest first.
coef_alt = coef_alt.sort_values(
    by="coefficient",
    key=lambda column: column.abs(),
    ascending=False,
)

print(coef_alt)


                        feature  coefficient
2             fisheries_exports     0.964447
4  extreme_temperature_exposure     0.508064
3             fisheries_imports     0.421143
1                  fish_catches     0.222153
0                           gdp     0.044805
