In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/My\ Drive

/content/drive/My Drive


In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

In [5]:
wind_speed_df = pd.read_csv('wind_speed.csv')

In [6]:
wind_speed_df = wind_speed_df.dropna()

In [7]:
# Create Month column

wind_speed_df['Month'] = [str(datetime).split('-')[1] for datetime in wind_speed_df['datetime']]

In [8]:
# Create Day column

wind_speed_df['Day'] = [str(datetime).split('-')[2].split()[0] for datetime in wind_speed_df['datetime']]

In [9]:
# Create Year column

wind_speed_df['Year'] = [str(datetime).split('-')[0] for datetime in wind_speed_df['datetime']]

In [10]:
# Create Time column

wind_speed_df['Time'] = [str(datetime).split('-')[2].split()[1].split(':')[0] for datetime in wind_speed_df['datetime']]

In [11]:
# Drop non-US cities

wind_speed_df = wind_speed_df.drop('Jerusalem',axis=1)
wind_speed_df = wind_speed_df.drop('Nahariyya',axis=1)
wind_speed_df = wind_speed_df.drop('Haifa',axis=1)
wind_speed_df = wind_speed_df.drop('Eilat',axis=1)
wind_speed_df = wind_speed_df.drop('Tel Aviv District',axis=1)
wind_speed_df = wind_speed_df.drop('Beersheba',axis=1)
wind_speed_df = wind_speed_df.drop('Montreal',axis=1)
wind_speed_df = wind_speed_df.drop('Toronto',axis=1)

In [12]:
wind_speed_df = wind_speed_df.drop('datetime', axis = 1)

In [13]:
#Store cleaned data in file to restore for later use
import pickle

with open('cleanedData.pkl', 'wb') as f:
  pickle.dump(wind_speed_df, f)

In [None]:
# Make sure to check for type of input being passed in (expected is String for all parameters)
import scipy.spatial.distance as distance

def getModel(x, y):
  def chi_square_distance(X, Y):
    with np.errstate(divide="ignore", invalid="ignore"):
        return (1 / 2) * np.sum(np.nan_to_num((np.square(X - Y) / (X + Y))))

  n_neighbors = [int(x) for x in np.linspace(23, 69, num = 37)]
  metric = [chi_square_distance, distance.euclidean]
  param_distribs = {'n_neighbors':n_neighbors,'metric': metric}
  estimator = KNeighborsRegressor()

  skfold = RepeatedKFold(n_splits=10,n_repeats=10,random_state=1)
  grid_search = GridSearchCV(estimator, param_distribs, n_jobs=5,cv=skfold)

  grid_search.fit(x,y)
  best_model = grid_search.best_estimator_

  # Create a df from the cv_resutls
  df_cv = pd.DataFrame(grid_search.cv_results_)

  return best_model, df_cv

In [None]:
def filterWithUserInput(city, month, day):
  filtered_df = wind_speed_df[[city, 'Month', 'Day', 'Year', 'Time']].copy()
  filtered_df = filtered_df[filtered_df['Month'] == month]
  filtered_df = filtered_df[filtered_df['Day'] == day]

  x = filtered_df.drop(city, axis=1)
  y = filtered_df[city]

  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

  ### Normalization using standard scaler
  standardscaler = StandardScaler()
  standardscaler.fit(x_train)

  x_train_scale = standardscaler.fit_transform(x_train)

  model, df_cv = getModel(x_train_scale, y_train)

  #Retrain on training set
  model.fit(x_train_scale,y_train)

  ### Normalization using standard scaler for test
  standardscaler = StandardScaler()
  standardscaler.fit(x_test)

  x_test_scale = standardscaler.fit_transform(x_test)

  return model, df_cv, x_train_scale, y_train, x_test_scale, y_test

In [None]:
model, df_cv, x_train_scale, y_train, x_test_scale, y_test = filterWithUserInput('Boston', "10", "10")

In [None]:
# Average difference between prediction and actual value (using the test dataset)

sum(abs(model.predict(x_test_scale) - y_test))/115

0.5098814229249011

In [None]:
#Store model in file to restore for later use
import pickle

with open('model.pkl', 'wb') as f:
  pickle.dump(model, f)

In [None]:
# Restore model
import pickle

with open('model.pkl', 'rb') as f: 
    model = pickle.load(f)