In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump, load


In [10]:
# Assuming X_train and X_test are your training and testing data
scaler = StandardScaler()
linReg = LinearRegression()
rf_reg = RandomForestRegressor(n_estimators=100)

In [14]:
# Load the dataset (assuming you have a CSV file with 'datetime' and 'temperature' columns)
springData = pd.read_csv("../datasets/springData.csv")
fallData = pd.read_csv("../datasets/fallData.csv")
data = pd.concat([springData, fallData])

# Convert the 'datetime' column to datetime format
data['datetime'] = pd.to_datetime(data['datetime'], format='%d/%m/%y %H:%M')
data['timestamp'] = data['datetime'].astype(int) // 10**9

X = data[['timestamp','t2m', 'sp', 'spDayBefore', 'tp','tcc', 'tHeightAvg']]
y = data['vid']

# Split the data into training and testing sets
# I have data starting from 2008 to 2019
# I want to test 2019 data and rest for training
X_train = X[X['timestamp'] < 1546300800]
X_test = X[X['timestamp'] >= 1546300800]
y_train = y[X['timestamp'] < 1546300800]
y_test = y[X['timestamp'] >= 1546300800]
# Drop the 'datetime' column
X_train = X_train.drop(['timestamp'], axis=1)
print("🚀 ~ X_train:", X_train)
X_test = X_test.drop(['timestamp'], axis=1)

🚀 ~ X_train:                t2m           sp  spDayBefore            tp       tcc   
0       277.330067  103734.5229  103830.4815  4.350000e-05  0.991868  \
1       277.303063  103736.0159  103828.6015  4.330000e-05  0.991719   
2       277.276148  103737.5039  103826.7276  4.310000e-05  0.991569   
3       277.249324  103738.9869  103824.8600  4.290000e-05  0.991421   
4       277.222409  103740.4749  103822.9862  4.270000e-05  0.991271   
...            ...          ...          ...           ...       ...   
288745  280.097883  101186.9364  100709.9898  4.570000e-10  0.253918   
288746  280.097447  101186.8112  100710.2119  5.230000e-10  0.253959   
288747  280.097012  101186.6860  100710.4339  5.880000e-10  0.253999   
288748  280.096577  101186.5609  100710.6560  6.530000e-10  0.254040   
288749  280.096142  101186.4357  100710.8780  7.180000e-10  0.254081   

        tHeightAvg  
0       272.473920  
1       272.437186  
2       272.400574  
3       272.364084  
4       272.32747

In [None]:

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
linReg.fit(X_train_scaled, y_train)

In [13]:
rf_reg.fit(X_train_scaled, y_train)

In [14]:
dump(linReg, '../models/linearReg_ex_2019.joblib')
dump(rf_reg, '../models/rf_reg_ex_2019.joblib')

['../models/rf_reg_ex_2019.joblib']

In [11]:
# load the models
linearModel = load('../models/linearReg_ex_2019.joblib')
rfModel = load('../models/rf_reg_ex_2019.joblib')

In [15]:
# Predict
y_pred_linear = linearModel.predict(X_test_scaled)
y_pred_rf = rfModel.predict(X_test_scaled)

In [18]:
# calculate the accracy of the model
print("Linear Regression")
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_linear))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred_linear))
print("R2 Score: ", r2_score(y_test, y_pred_linear))

print("Random Forest")
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_rf))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred_rf))
print("R2 Score: ", r2_score(y_test, y_pred_rf))



Linear Regression
Mean Absolute Error:  3.219259582372625
Mean Squared Error:  37.41979022517705
R2 Score:  0.0669875482843002
Random Forest
Mean Absolute Error:  0.5221529824639872
Mean Squared Error:  1.508685190193732
R2 Score:  0.962382951382161


In [27]:
# write a function that will take the input and return the prediction
# input will be a dictionary with the following keys
# 't2m', 'sp', 'spDayBefore', 'tp', 'tcc', 'tHeightAvg'
def predict_vid(input):
    # convert the input to a dataframe
    input_df = pd.DataFrame(input, index=[0])
    # scale the input
    input_scaled = scaler.fit_transform(input_df)
    # predict
    prediction = rfModel.predict(input_scaled)
    return prediction

# test the function
input = {'t2m': 285.06, 'sp': 102078.0901, 'spDayBefore': 102435.0368, 'tp': 0.000000846, 'tcc': 0.51399893, 'tHeightAvg': 279.3657916}
print('pre dicted vertical integrated density of birds when')
print ('surface temperature is 285.06K, surface pressure is 102078.0901Pa, surface pressure of the day before is 102435.0368Pa,')
print ('total precipitation is 0.000000846, total cloud cover is 0.51399893 and Average air temperate between 400 and a 1000m is 279.3657916K is:')
print('is: ',predict_vid(input)[0], ' birds/km^2')

preddicted vertical integrated density of birds when
surface temperature is 285.06K, surface pressure is 102078.0901Pa, surface pressure of the day before is 102435.0368Pa,
total precipitation is 0.000000846, total cloud cover is 0.51399893 and Average air temperate between 400 and a 1000m is 279.3657916K is:
is:  6.424712918050001  birds/km^2
