# Import libraries and load the dataset

In [1]:
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import pandas as pd


In [2]:
# Location of the Seattle weather CSV. NOTE: this is an absolute, machine-specific path.
DATA_PATH = r"C:\Users\Felicity\Downloads\seattle-weather.csv"
df = pd.read_csv(DATA_PATH)
df.head()

# The DataFrame contains both the independent variables (features) and the dependent variable (target)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,drizzle
1,1/2/2012,10.9,10.6,2.8,4.5,rain
2,1/3/2012,0.8,11.7,7.2,2.3,rain
3,1/4/2012,20.3,12.2,5.6,4.7,rain
4,1/5/2012,1.3,8.9,2.8,6.1,rain


# Checking for missing values in the dataset

In [3]:
# Count the missing entries in each column.
df.isna().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

# Converting the dependent variable to numerical values

In [4]:
def LabelEncoding(c):
    """Replace column ``c`` of the global ``df`` with integer class codes.

    The column is overwritten in place on the module-level DataFrame ``df``
    using scikit-learn's ``LabelEncoder``.

    Parameters
    ----------
    c : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The fitted encoder. Keep it: ``classes_`` and ``inverse_transform``
        are the only way to map the integer codes back to the original labels.
    """
    from sklearn import preprocessing

    le = preprocessing.LabelEncoder()
    df[c] = le.fit_transform(df[c])
    return le


# Keep the fitted encoder so predictions can be translated back to weather names.
weather_encoder = LabelEncoding("weather")
df

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,0
1,1/2/2012,10.9,10.6,2.8,4.5,2
2,1/3/2012,0.8,11.7,7.2,2.3,2
3,1/4/2012,20.3,12.2,5.6,4.7,2
4,1/5/2012,1.3,8.9,2.8,6.1,2
...,...,...,...,...,...,...
1456,12/27/2015,8.6,4.4,1.7,2.9,2
1457,12/28/2015,1.5,5.0,1.7,1.3,2
1458,12/29/2015,0.0,7.2,0.6,2.6,1
1459,12/30/2015,0.0,5.6,-1.0,3.4,4


# Normalize the numeric features by dividing each column by its maximum value (so the largest value becomes 1)

In [5]:
# Names of the numeric feature columns that get rescaled below.
cols = ['precipitation' , 'temp_max', 'temp_min', 'wind']

In [6]:
def normalize(df, cols):
    """Scale each listed column of ``df`` by that column's maximum, in place.

    After scaling, the largest value of every processed column is 1. This is
    max-scaling, not min-max scaling: the smallest value is not forced to 0
    and negative inputs stay negative.

    Columns whose maximum is 0 or missing are left unchanged, because dividing
    by such a maximum would fill the column with NaN/inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    cols : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for x in cols:
        peak = df[x].max()
        # A zero or NaN maximum gives no usable scale; skip instead of producing NaN/inf.
        if pd.isna(peak) or peak == 0:
            continue
        df[x] = df[x] / peak
    return df
# Rescale the feature columns of df in place, then display the result.
normalize(df,cols)
df

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.000000,0.359551,0.273224,0.494737,0
1,1/2/2012,0.194991,0.297753,0.153005,0.473684,2
2,1/3/2012,0.014311,0.328652,0.393443,0.242105,2
3,1/4/2012,0.363148,0.342697,0.306011,0.494737,2
4,1/5/2012,0.023256,0.250000,0.153005,0.642105,2
...,...,...,...,...,...,...
1456,12/27/2015,0.153846,0.123596,0.092896,0.305263,2
1457,12/28/2015,0.026834,0.140449,0.092896,0.136842,2
1458,12/29/2015,0.000000,0.202247,0.032787,0.273684,1
1459,12/30/2015,0.000000,0.157303,-0.054645,0.357895,4


In [7]:
# Drop the date column (columns= removes a whole column, the same as axis=1).
# errors='ignore' keeps this cell safe to re-run once the column is already gone;
# without it a second run raises KeyError.
df = df.drop(columns='date', errors='ignore')
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.000000,0.359551,0.273224,0.494737,0
1,0.194991,0.297753,0.153005,0.473684,2
2,0.014311,0.328652,0.393443,0.242105,2
3,0.363148,0.342697,0.306011,0.494737,2
4,0.023256,0.250000,0.153005,0.642105,2
...,...,...,...,...,...
1456,0.153846,0.123596,0.092896,0.305263,2
1457,0.026834,0.140449,0.092896,0.136842,2
1458,0.000000,0.202247,0.032787,0.273684,1
1459,0.000000,0.157303,-0.054645,0.357895,4


# Separate the independent variables from the dependent variable

In [8]:
# Features: every column except 'weather'.
x = df.drop(columns='weather')
# Target: the 'weather' column.
y = df['weather']

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
from xgboost import XGBClassifier

In [11]:
# Fit an XGBClassifier built with default constructor arguments on the training split.
xg = XGBClassifier()
xg.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [12]:
#xg.get_params()

In [13]:
from sklearn.metrics import classification_report, accuracy_score

In [14]:
# Score the default-parameter model on the held-out test split.
y_hat = xg.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_hat)
baseline_report = classification_report(y_test, y_hat)
print(baseline_accuracy)
print(baseline_report)

0.757679180887372
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.15      0.07      0.10        29
           2       0.95      0.91      0.93       123
           3       1.00      0.33      0.50         6
           4       0.70      0.85      0.77       125

    accuracy                           0.76       293
   macro avg       0.56      0.43      0.46       293
weighted avg       0.73      0.76      0.74       293



# Create the hyperparameter grid (a dictionary of candidate values) for the grid search

In [15]:
# Candidate values for each hyperparameter; every combination is one search candidate.
grid = dict(
    learning_rate=[0.1, 1, 0.01, 0.001],
    gamma=[0, 1, 10, 100],
)

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
# 10-fold cross-validated search over `grid`; verbose=2 logs a line for every fit.
model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=grid,
    cv=10,
    verbose=2,
)

In [18]:
# Run the grid search on the training split.
model.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.2s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END ...........................gamma=0, learning_rate=1; total time=   0.2s
[CV] END ...........................gamma=0, l

GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,

In [19]:
# Predict the encoded weather class for every test row and print the raw codes.
grid_predictions = model.predict(X_test)
print(grid_predictions)

[4 1 2 4 4 2 2 2 4 2 4 2 4 4 4 4 4 2 2 2 2 2 2 4 4 4 4 2 4 4 2 4 2 2 4 2 4
 2 4 2 4 2 2 2 4 4 4 4 4 4 4 4 4 2 2 4 4 2 4 2 4 4 4 4 4 2 4 4 4 2 2 2 2 4
 4 4 4 4 2 2 4 4 2 2 2 4 4 2 4 2 4 4 2 4 2 3 4 4 4 4 2 4 4 2 2 4 2 2 4 4 2
 4 2 4 4 4 4 4 2 2 4 2 2 4 2 4 4 4 4 2 4 2 4 2 2 2 4 4 4 4 2 2 4 4 4 2 2 4
 4 4 4 4 4 2 4 2 4 4 2 4 4 4 2 4 2 2 4 2 2 2 4 4 4 4 2 2 4 4 4 4 4 4 2 1 2
 4 4 4 2 4 4 4 4 4 2 2 4 4 4 4 2 4 4 4 2 2 2 4 4 4 2 2 2 2 4 4 4 2 2 2 4 4
 4 2 4 4 4 4 2 4 4 2 2 2 2 4 2 4 2 2 4 2 4 2 2 4 2 4 4 4 4 4 2 3 4 4 4 4 4
 4 2 1 4 4 4 2 4 2 4 1 4 4 2 2 4 4 2 4 2 4 2 2 2 4 4 4 2 4 2 4 4 2 4]


In [20]:
# Evaluate the grid-searched model on the held-out test split.
grid_predictions = model.predict(X_test)
print(accuracy_score(y_test, grid_predictions))
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports 0.0 for them (the same value as the default) without
# emitting an undefined-metric warning for each average.
print(classification_report(y_test, grid_predictions, zero_division=0))

0.8088737201365188
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.25      0.03      0.06        29
           2       0.97      0.90      0.93       123
           3       1.00      0.33      0.50         6
           4       0.72      0.98      0.83       125

    accuracy                           0.81       293
   macro avg       0.59      0.45      0.46       293
weighted avg       0.76      0.81      0.76       293



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Show the best estimator selected by the grid search, including its chosen hyperparameters.
print(model.best_estimator_)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
