In [204]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
import pickle

In [147]:
df = pd.read_csv("seattle-weather.csv")
df

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain
1457,2015-12-28,1.5,5.0,1.7,1.3,rain
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun


In [148]:
df.tail()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
1456,2015-12-27,8.6,4.4,1.7,2.9,rain
1457,2015-12-28,1.5,5.0,1.7,1.3,rain
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun
1460,2015-12-31,0.0,5.6,-2.1,3.5,sun


In [149]:
df.shape

(1461, 6)

In [150]:
df.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [152]:
df.columns

Index(['date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'weather'], dtype='object')

In [153]:
df.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [154]:
df.corr(numeric_only= True)

Unnamed: 0,precipitation,temp_max,temp_min,wind
precipitation,1.0,-0.228555,-0.072684,0.328045
temp_max,-0.228555,1.0,0.875687,-0.164857
temp_min,-0.072684,0.875687,1.0,-0.074185
wind,0.328045,-0.164857,-0.074185,1.0


In [155]:
# Convert Date to datetime format
df['date'] = pd.to_datetime(df['date'])
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day
df.drop(columns=['date'], inplace=True)

In [156]:
df.columns

Index(['precipitation', 'temp_max', 'temp_min', 'wind', 'weather', 'Year',
       'Month', 'Day'],
      dtype='object')

In [157]:
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,Year,Month,Day
0,0.0,12.8,5.0,4.7,drizzle,2012,1,1
1,10.9,10.6,2.8,4.5,rain,2012,1,2
2,0.8,11.7,7.2,2.3,rain,2012,1,3
3,20.3,12.2,5.6,4.7,rain,2012,1,4
4,1.3,8.9,2.8,6.1,rain,2012,1,5
...,...,...,...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,rain,2015,12,27
1457,1.5,5.0,1.7,1.3,rain,2015,12,28
1458,0.0,7.2,0.6,2.6,fog,2015,12,29
1459,0.0,5.6,-1.0,3.4,sun,2015,12,30


In [158]:
df.tail()

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,Year,Month,Day
1456,8.6,4.4,1.7,2.9,rain,2015,12,27
1457,1.5,5.0,1.7,1.3,rain,2015,12,28
1458,0.0,7.2,0.6,2.6,fog,2015,12,29
1459,0.0,5.6,-1.0,3.4,sun,2015,12,30
1460,0.0,5.6,-2.1,3.5,sun,2015,12,31


In [159]:
unique_values = df['weather'].value_counts()
print(unique_values)

weather
rain       641
sun        640
fog        101
drizzle     53
snow        26
Name: count, dtype: int64


# Encoding categorical variable 'weather'

In [176]:
# Encoding categorical variable 'weather'
le = LabelEncoder()
df['weather'] = le.fit_transform(df['weather'])

In [177]:
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,Year,Month,Day
0,0.0,12.8,5.0,4.7,0,2012,1,1
1,10.9,10.6,2.8,4.5,2,2012,1,2
2,0.8,11.7,7.2,2.3,2,2012,1,3
3,20.3,12.2,5.6,4.7,2,2012,1,4
4,1.3,8.9,2.8,6.1,2,2012,1,5
...,...,...,...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,2,2015,12,27
1457,1.5,5.0,1.7,1.3,2,2015,12,28
1458,0.0,7.2,0.6,2.6,1,2015,12,29
1459,0.0,5.6,-1.0,3.4,4,2015,12,30


# Splitting Data into Training and Testing Sets

In [178]:
X = df[['precipitation', 'temp_min', 'wind', 'Year', 'Month', 'Day']]
y_temp = df['temp_max']  # Target for regression
y_weather = df['weather']  # Target for classification

In [179]:
df[['weather']]

Unnamed: 0,weather
0,0
1,2
2,2
3,2
4,2
...,...
1456,2
1457,2
1458,1
1459,4


In [180]:
df.shape

(1461, 8)

In [181]:
# Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [182]:
# Train-test split
X_train, X_test, y_train_temp, y_test_temp = train_test_split(X_scaled, y_temp, test_size=0.2, random_state=42)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_scaled, y_weather, test_size=0.2, random_state=42)

In [191]:
(X_train)

array([[-0.45364986, -2.27725991,  1.71071279,  0.44800945,  1.58815364,
        -1.56016973],
       [ 2.31667892,  0.35154932,  2.12815262, -0.44617334,  0.71826144,
         1.62161517],
       [ 2.36160317, -0.42514431, -0.72435286,  0.44800945, -1.02152297,
        -0.65108833],
       ...,
       [-0.37877611, -0.20607688, -0.79392617,  0.44800945, -0.44159483,
        -0.65108833],
       [-0.45364986, -1.83912504,  0.11052679,  1.34219223,  1.58815364,
         1.62161517],
       [-0.45364986, -0.98277052, -0.93307278,  1.34219223, -1.6014511 ,
         1.73525035]])

In [193]:
(X_test)

array([[-0.45364986,  0.57061676, -0.37648634,  0.44800945, -0.15163076,
        -0.53745316],
       [ 0.41488565, -0.42514431, -1.90709904,  1.34219223, -1.6014511 ,
        -0.65108833],
       [-0.45364986, -0.76370308,  0.11052679, -0.44617334, -1.31148703,
         0.1443579 ],
       ...,
       [-0.45364986, -0.30565299, -1.00264608, -0.44617334,  1.0082255 ,
        -0.31018281],
       [-0.2290286 , -0.42514431,  1.8498594 , -0.44617334,  1.58815364,
         0.82616895],
       [-0.45364986,  0.13248189,  0.66711323, -1.34035613,  0.71826144,
        -0.53745316]])

In [194]:
(X_train_clf)

array([[-0.45364986, -2.27725991,  1.71071279,  0.44800945,  1.58815364,
        -1.56016973],
       [ 2.31667892,  0.35154932,  2.12815262, -0.44617334,  0.71826144,
         1.62161517],
       [ 2.36160317, -0.42514431, -0.72435286,  0.44800945, -1.02152297,
        -0.65108833],
       ...,
       [-0.37877611, -0.20607688, -0.79392617,  0.44800945, -0.44159483,
        -0.65108833],
       [-0.45364986, -1.83912504,  0.11052679,  1.34219223,  1.58815364,
         1.62161517],
       [-0.45364986, -0.98277052, -0.93307278,  1.34219223, -1.6014511 ,
         1.73525035]])

In [195]:
(X_test_clf)

array([[-0.45364986,  0.57061676, -0.37648634,  0.44800945, -0.15163076,
        -0.53745316],
       [ 0.41488565, -0.42514431, -1.90709904,  1.34219223, -1.6014511 ,
        -0.65108833],
       [-0.45364986, -0.76370308,  0.11052679, -0.44617334, -1.31148703,
         0.1443579 ],
       ...,
       [-0.45364986, -0.30565299, -1.00264608, -0.44617334,  1.0082255 ,
        -0.31018281],
       [-0.2290286 , -0.42514431,  1.8498594 , -0.44617334,  1.58815364,
         0.82616895],
       [-0.45364986,  0.13248189,  0.66711323, -1.34035613,  0.71826144,
        -0.53745316]])

In [196]:
print("Train-Test Split Completed!")
print(f"Training Data Regression: {X_train.shape}, Testing Data Regression: {X_test.shape},")
print(f"Training Data classification: {X_train_clf.shape}, Testing Data Classification: {X_test_clf.shape},")

Train-Test Split Completed!
Training Data Regression: (1168, 6), Testing Data Regression: (293, 6),
Training Data classification: (1168, 6), Testing Data Classification: (293, 6),


# Train Regression Model

In [208]:
rf_reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_model.fit(X_train, y_train_temp)
y_pred_temp_rf = rf_reg_model.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test_temp, y_pred_temp_rf))
print(f"Random Forest Regression RMSE: {rmse_rf}")

Random Forest Regression RMSE: 2.6355621363845754


In [209]:
xgb_reg_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_reg_model.fit(X_train, y_train_temp)
y_pred_temp_xgb = xgb_reg_model.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test_temp, y_pred_temp_xgb))
print(f"XGBoost Regression RMSE: {rmse_xgb}")

XGBoost Regression RMSE: 2.860249693205642


In [210]:
# Determine Best Regression Model
best_reg_model = rf_reg_model if rmse_rf < rmse_xgb else xgb_reg_model
best_reg_model_name = "Random Forest" if rmse_rf < rmse_xgb else "XGBoost"
print(f"Best Regression Model: {best_reg_model_name}")

Best Regression Model: Random Forest


# Train Classification Model

In [211]:
rf_clf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf_model.fit(X_train_clf, y_train_clf)
y_pred_clf_rf = rf_clf_model.predict(X_test_clf)
acc_rf = accuracy_score(y_test_clf, y_pred_clf_rf)
print(f"Random Forest Classification Accuracy: {acc_rf}")

Random Forest Classification Accuracy: 0.8532423208191127


In [212]:
log_reg_model = LogisticRegression(max_iter=200)
log_reg_model.fit(X_train_clf, y_train_clf)
y_pred_clf_log = log_reg_model.predict(X_test_clf)
acc_log = accuracy_score(y_test_clf, y_pred_clf_log)
print(f"Logistic Regression Classification Accuracy: {acc_log}")

Logistic Regression Classification Accuracy: 0.7815699658703071


In [213]:
# Determine Best Classification Model
best_clf_model = rf_clf_model if acc_rf > acc_log else log_reg_model
best_clf_model_name = "Random Forest" if acc_rf > acc_log else "Logistic Regression"
print(f"Best Classification Model: {best_clf_model_name}")

Best Classification Model: Random Forest


#  Save models

In [214]:
with open("rf_regression_model.pkl", "wb") as f:
    pickle.dump(rf_reg_model, f)

with open("xgb_regression_model.pkl", "wb") as f:
    pickle.dump(xgb_reg_model, f)

with open("rf_classification_model.pkl", "wb") as f:
    pickle.dump(rf_clf_model, f)

with open("log_reg_classification_model.pkl", "wb") as f:
    pickle.dump(log_reg_model, f)


# Save The Best Model

In [215]:
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_reg_model if rmse_rf < rmse_xgb else best_clf_model, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)