In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Step 1: Load your datasets
weather_description = pd.read_csv('weather_description.csv', index_col='datetime', parse_dates=True)
humidity = pd.read_csv('humidity.csv', index_col='datetime', parse_dates=True).add_suffix('_humidity')
temperature = pd.read_csv('temperature.csv', index_col='datetime', parse_dates=True).add_suffix('_temperature')
pressure = pd.read_csv('pressure.csv', index_col='datetime', parse_dates=True).add_suffix('_pressure')
wind_direction = pd.read_csv('wind_direction.csv', index_col='datetime', parse_dates=True).add_suffix('_wind_direction')
wind_speed = pd.read_csv('wind_speed.csv', index_col='datetime', parse_dates=True).add_suffix('_wind_speed')
city_attributes = pd.read_csv('city_attributes.csv')

In [3]:
# Step 2: Merge datasets into a single DataFrame
weather_data = temperature.join([weather_description.add_suffix('_description'), humidity, pressure, wind_direction, wind_speed])

In [4]:
# Verify no overlapping columns (except for 'datetime')
if weather_data.columns.duplicated().any():
    raise Exception("Data contains overlapping columns, please check dataset column names.")

In [5]:
# Step 3: Data Cleaning
weather_data.dropna(inplace=True)

In [6]:
# In a real-world scenario, you could convert it to numeric using encoding.
weather_data.drop(columns=weather_data.filter(regex='_description$').columns, inplace=True)

In [7]:
# Extract all temperature columns as features except for the one we want to predict
feature_columns = [col for col in weather_data.columns if 'temperature' in col and 'New York_temperature' not in col]
features = weather_data[feature_columns]

In [8]:
# Set the target to the New York temperature
target = weather_data['New York_temperature']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [10]:
#step7:Model selection

In [11]:
model = RandomForestRegressor(n_estimators=10, random_state=42)

In [12]:
# Step 8: Model Training
model.fit(X_train, y_train)

In [13]:
# Step 9: Model Evaluation
y_pred = model.predict(X_test)
print(y_pred)
mae = mean_absolute_error(y_test, y_pred)

[295.60919647 285.65095106 279.7248     ... 295.3249     277.5869
 277.58585   ]


In [14]:
print(f"Model MAE: {mae}")

Model MAE: 0.8655412348044154
