In [10]:
import requests
import pandas as pd 
import matplotlib.pyplot as plt
import pycaret
from pycaret.regression import *



# Download monthly climate data from the NASA POWER API.
# The query asks for T2M, PRECTOTCORR_SUM, RH2M and WS2M at
# longitude=29.02 / latitude=41.10 for the years 2000 through 2025, as JSON.

response = requests.get(
    "https://power.larc.nasa.gov/api/temporal/monthly/point?parameters=T2M,PRECTOTCORR_SUM,RH2M,WS2M&community=SB&longitude=29.02&latitude=41.10&format=JSON&start=2000&end=2025",
    timeout=60,  # seconds; without a timeout a stalled connection blocks the kernel indefinitely
)

# Stop here with a clear HTTP error rather than failing later with a
# confusing KeyError / JSON decoding error on an error payload.
response.raise_for_status()

data_bundle = response.json()

# Every requested variable sits under properties -> parameter in the payload,
# keyed by its parameter code.
parameter_table = data_bundle['properties']['parameter']

# Independent variables (features).
# NOTE: the misspelled variable names are kept as-is because later
# statements in this notebook refer to them.
precipitaiton_data = parameter_table['PRECTOTCORR_SUM']
relative_humidity_data = parameter_table['RH2M']
wind_speed_data = parameter_table['WS2M']

# Dependent variable (target).
tempereture_data = parameter_table['T2M']




# Drop the per-year aggregate entry that follows the twelve monthly values.

def _monthly_only(records):
    """Return a copy of ``records`` without the per-year aggregate entries.

    Parameters
    ----------
    records : dict
        Mapping of 'YYYYMM' string keys to values (the keys are parsed with
        the '%Y%m' format further down in this notebook).

    Returns
    -------
    dict
        The same mapping, in the same order, minus every key whose month
        part is '13'.
    """
    # Select on the key itself rather than on the entry's position: a
    # positional "every 13th item" rule silently removes the wrong rows if
    # the payload order changes or a year has fewer than 13 entries.
    return {key: value for key, value in records.items() if key[4:] != '13'}


prep_filtered_data = _monthly_only(precipitaiton_data)
rel_hum_filtered_data = _monthly_only(relative_humidity_data)
wind_speed_filtered_data = _monthly_only(wind_speed_data)
temp_filtered_data = _monthly_only(tempereture_data)

# Convert each filtered mapping into a chronologically ordered time series.

def _as_sorted_monthly_series(records):
    """Build a Series from ``records`` with a sorted datetime index.

    The keys of ``records`` are parsed as '%Y%m' timestamps and become the
    index; the result is sorted by that index.
    """
    series = pd.Series(records)
    series.index = pd.to_datetime(series.index, format='%Y%m')
    return series.sort_index()


precipitation_series = _as_sorted_monthly_series(prep_filtered_data)
relative_humidity_series = _as_sorted_monthly_series(rel_hum_filtered_data)
wind_speed_series = _as_sorted_monthly_series(wind_speed_filtered_data)
temp_series = _as_sorted_monthly_series(temp_filtered_data)

# Plot the time series
# plt.figure(figsize=(10, 5))
# plt.plot(precipitation_series.index, precipitation_series.values, marker='o', linestyle='-')
# plt.title('Monthly Precipitation Time Series')
# plt.xlabel('Date')
# plt.ylabel('Precipitation (mm/day)')
# plt.grid(True)
# plt.tight_layout()
# plt.show()  


# Assemble the modeling table: one single-column frame per variable, each on
# a fresh 0..n-1 index (the datetime index is intentionally not carried over),
# joined side by side.
prep_df = precipitation_series.to_frame(name='Precipitation').reset_index(drop=True)
rel_hun_df = relative_humidity_series.to_frame(name='Relative Humidity').reset_index(drop=True)
wind_speed_df = wind_speed_series.to_frame(name='Wind Speed').reset_index(drop=True)
temp_df = temp_series.to_frame(name='Temperature').reset_index(drop=True)

dataset = pd.concat([prep_df, rel_hun_df, wind_speed_df, temp_df], axis=1)

# -999 marks a missing value (it shows up in the most recent months);
# discard every row in which any column carries it.
has_missing_marker = dataset.eq(-999).any(axis=1)
dataset = dataset[~has_missing_marker]

dataset.tail()

Unnamed: 0,Precipitation,Relative Humidity,Wind Speed,Temperature
301,74.69,80.17,4.4,5.12
302,21.42,77.12,3.55,10.71
303,30.55,77.62,4.04,11.13
304,49.24,68.96,3.19,17.61
305,1.77,62.05,3.79,23.36


In [11]:
# The dataset is split into two parts: one for modeling and one held out as
# unseen data for predictions (90% for modeling, 10% unseen).

# Keep the sampled rows' original index labels: they are what identifies which
# rows of `dataset` were drawn. Resetting the index *before* dropping would
# remove labels 0..n-1 (simply the first n rows) and leave a hold-out that
# overlaps the modeling sample.
modeling_rows = dataset.sample(frac=0.9, random_state=13)

data = modeling_rows.reset_index(drop=True)
data_unseen = dataset.drop(modeling_rows.index).reset_index(drop=True)
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (275, 4)
Unseen Data For Predictions: (31, 4)


In [4]:
# Preview the first five rows of the modeling sample.
data.head(n=5)

Unnamed: 0,Precipitation,Relative Humidity,Wind Speed,Temperature
0,55.98,71.8,3.8,21.63
1,2.45,69.01,4.08,27.23
2,53.9,84.53,3.79,11.17
3,81.83,82.26,4.24,9.61
4,53.88,81.01,3.8,8.32


In [13]:
# Set up the PyCaret regression experiment.
# PyCaret performs the train/test split itself. Shuffling is switched off and
# the 'timeseries' fold strategy is selected so that the rows are split in
# their existing (chronological) order.
# NOTE(review): the full `dataset` is passed here, not the 90% `data` sample
# created earlier, so the rows in `data_unseen` also take part in this
# experiment -- confirm that this is intended.

exp_ML = setup(
    dataset,
    target='Temperature',
    session_id=123,
    fold=5,
    data_split_shuffle=False,
    fold_strategy='timeseries',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Temperature
2,Target type,Regression
3,Original data shape,"(306, 4)"
4,Transformed data shape,"(306, 4)"
5,Transformed train set shape,"(214, 4)"
6,Transformed test set shape,"(92, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train the candidate models and rank them by MAPE; the top-ranked model is
# kept in `best`.
best = compare_models(sort="MAPE")
# In the recorded run the best model is the Extra Trees Regressor.

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,2.3293,10.097,3.1609,0.801,0.2533,0.2078,0.054
gbr,Gradient Boosting Regressor,2.4104,10.5196,3.2408,0.7922,0.2565,0.2156,0.03
rf,Random Forest Regressor,2.3922,10.1921,3.1797,0.7992,0.2543,0.2186,0.064
huber,Huber Regressor,2.5718,10.6034,3.227,0.7913,0.2485,0.2248,0.016
lar,Least Angle Regression,2.5642,10.5083,3.2141,0.7932,0.2487,0.2254,0.012
lr,Linear Regression,2.5669,10.4965,3.2131,0.7934,0.2486,0.2257,0.018
ridge,Ridge Regression,2.5666,10.4899,3.2125,0.7935,0.2488,0.2258,0.014
br,Bayesian Ridge,2.5699,10.4813,3.2154,0.7936,0.2513,0.2279,0.016
ada,AdaBoost Regressor,2.533,11.287,3.3496,0.7775,0.2656,0.2295,0.032
lasso,Lasso Regression,2.5639,10.3898,3.2052,0.7952,0.2552,0.2313,0.014
