In [1]:
import pandas as pd
import boto3

def read_csv_from_s3(bucket_name: str, file_path: str) -> pd.DataFrame:
    """Fetch a single object from S3 and parse it as CSV.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        file_path: Key of the object inside the bucket.

    Returns:
        The parsed CSV content as a DataFrame.
    """
    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_path)
    # pandas reads straight from the file-like 'Body' of the response.
    return pd.read_csv(response['Body'])

# S3 location of the processed station CSV loaded by this notebook.
bucket_name = "danneftw-dscrap-bucket"
file_path = "processed/station_bikes/2023-08-09-2023-11-02/StationaryStations.csv"

# Load the CSV into `df`.
df = read_csv_from_s3(bucket_name=bucket_name, file_path=file_path)

In [42]:
# Impute missing visibility readings with a fixed value.
df['Visibility'] = df['Visibility'].fillna(23180) # checked average visibility for the week before the missing data

# Feature columns plus the target column ("TotalAvailableBikes").
COLUMNS_TO_KEEP = [
    "IsOpen",
    "Long",
    "Lat",
    "Year",
    "Month",
    "Day",
    "Hour",
    'Minute',
    "Temperature",
    "Humidity",
    "Windspeed",
    "Precipitation",
    "Visibility",
    "Snowfall",
    "IsWeekend",
    "TotalAvailableBikes",
]

# Select from `df`, the frame imputed above. The previous version read from
# `dataframe`, a name this cell never assigns, which both depends on leftover
# kernel state and drops the Visibility imputation.
df = df[COLUMNS_TO_KEEP]

# Split into the feature matrix and the target vector.
X = df.drop("TotalAvailableBikes", axis=1)
y = df["TotalAvailableBikes"]


0.9800410309200892

In [3]:
# Keep only the rows whose stationId equals "Hornsgatan".
df_mossen = df.loc[df['stationId'] == "Hornsgatan"]

# Minimum, maximum and mean of the bike count for that station, one per line.
print(
    df_mossen["TotalAvailableBikes"].min(),
    df_mossen["TotalAvailableBikes"].max(),
    df_mossen["TotalAvailableBikes"].mean(),
    sep="\n",
)
# Distinct coordinate values recorded for the station.
print(df_mossen["Long"].unique(), df_mossen["Lat"].unique(), sep="\n")

0
32
14.860050890585242
[12.01216]
[57.728825]


In [3]:
import pandas as pd
import requests
from datetime import datetime


class PredictorHandler:
    """Holds one set of feature values and turns it into a one-row DataFrame.

    Every constructor argument is stored unchanged on the instance under the
    same name. No validation or conversion is performed.
    """

    def __init__(
        self,
        IsOpen,
        Long,
        Lat,
        Year,
        Month,
        Day,
        Hour,
        Minute,
        Temperature,
        Humidity,
        Windspeed,
        Precipitation,
        Visibility,
        Snowfall,
        IsWeekend,
    ):
        # Station state and position.
        self.IsOpen, self.Long, self.Lat = IsOpen, Long, Lat

        # Timestamp components.
        self.Year, self.Month, self.Day = Year, Month, Day
        self.Hour, self.Minute = Hour, Minute

        # Weather readings.
        self.Temperature, self.Humidity, self.Windspeed = Temperature, Humidity, Windspeed
        self.Precipitation, self.Visibility, self.Snowfall = Precipitation, Visibility, Snowfall

        # Calendar flag.
        self.IsWeekend = IsWeekend

    def create_dataframe(self):
        """Return a single-row DataFrame with one column per stored feature.

        Columns appear in the same order as the constructor parameters.
        """
        column_order = (
            "IsOpen",
            "Long",
            "Lat",
            "Year",
            "Month",
            "Day",
            "Hour",
            "Minute",
            "Temperature",
            "Humidity",
            "Windspeed",
            "Precipitation",
            "Visibility",
            "Snowfall",
            "IsWeekend",
        )
        # Each value is wrapped in a one-element list so the frame has one row.
        return pd.DataFrame({name: [getattr(self, name)] for name in column_order})

In [5]:
import http.client
import urllib.parse
import json
from sagemaker.sklearn.model import SKLearnPredictor
import sagemaker

API_ENDPOINT = "api.open-meteo.com"
PATH = "/v1/forecast"


def fetch_weather_data(latitude=52.52, longitude=13.41, timeout=10.0):
    """Request hourly weather from the forecast API and return one reading.

    Args:
        latitude: Latitude sent in the request. Defaults to the value that was
            previously hard-coded here.
        longitude: Longitude sent in the request. Defaults to the value that
            was previously hard-coded here.
        timeout: Timeout in seconds for the HTTPS connection.

    Returns:
        A dict with the keys "Temperature", "Humidity", "Windspeed",
        "Precipitation", "Visibility" and "Snowfall".

    Raises:
        Exception: If the API answers with a status other than 200.

    NOTE(review): the default coordinates differ from the station coordinates
    used as model features elsewhere in this notebook; callers that want the
    weather at a station should pass that station's coordinates.
    """
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": "temperature_2m,relativehumidity_2m,windspeed_10m,precipitation,visibility,snowfall",
    }
    url = f"{PATH}?{urllib.parse.urlencode(params)}"

    conn = http.client.HTTPSConnection(API_ENDPOINT, timeout=timeout)
    try:
        conn.request("GET", url)
        response = conn.getresponse()
        if response.status != 200:
            raise Exception("Failed to retrieve weather data.")
        # The body must be read before the connection is closed below.
        weather_data = json.loads(response.read())
    finally:
        # Always release the socket, on success and on failure.
        conn.close()

    hourly_data = weather_data["hourly"]
    # Index 0 is the first entry of each hourly series in the response.
    # NOTE(review): presumably that is the first hour of the forecast window,
    # not necessarily the hour being predicted - confirm against the API docs.
    return {
        "Temperature": hourly_data["temperature_2m"][0],
        "Humidity": hourly_data["relativehumidity_2m"][0],
        "Windspeed": hourly_data["windspeed_10m"][0],
        "Precipitation": hourly_data["precipitation"][0],
        "Visibility": hourly_data["visibility"][0],
        "Snowfall": hourly_data["snowfall"][0],
    }



# Station coordinates passed to the model as the Lat/Long features.
LAT = 57.681919
LONG = 11.963804

# NOTE(review): fetch_weather_data() is called without arguments, so the
# weather request is not tied to LAT/LONG above.
weather_data = fetch_weather_data()

# Assemble the single-row model input: fixed station and timestamp values
# plus the six weather readings copied from `weather_data`.
input_df = PredictorHandler(
    IsOpen=True,
    Long=LONG,
    Lat=LAT,
    Year=2023,
    Month=11,
    Day=6,
    Hour=13,
    Minute=10,
    IsWeekend=0,
    **{
        name: weather_data[name]
        for name in (
            "Temperature",
            "Humidity",
            "Windspeed",
            "Precipitation",
            "Visibility",
            "Snowfall",
        )
    },
).create_dataframe()


# Send the row to the deployed endpoint and print the rounded first prediction.
endpoint_name = "random-forest-endpoint-1"
predictor = SKLearnPredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker.Session(),
)
result = predictor.predict(input_df)
print(f"Number of bikes at location is {round(result[0])}")

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\b9ivn\AppData\Local\sagemaker\sagemaker\config.yaml
Number of bikes at location is 19


In [45]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import boto3
import os

# Bucket and key prefix under which the train/test CSV files are stored.
bucket_name = 'sagemaker-eu-north-1-796717305864'
folder_path = 'sagemaker/sklearncontainer/train-test-two-weeks/'

# One S3 client shared by every download in this cell.
s3_client = boto3.client('s3')


def download_from_s3(filename):
    """Download `folder_path` + `filename` from the bucket to a local file.

    The local file is written under the name `filename`, relative to the
    current working directory.
    """
    object_key = f'{folder_path}{filename}'
    s3_client.download_file(bucket_name, object_key, filename)

# Files fetched from S3 for this experiment; removed again once loaded.
DATASET_FILES = ('xtest2.csv', 'xtrain2.csv', 'ytest2.csv', 'ytrain2.csv')

try:
    # Downloading the data from S3
    for filename in DATASET_FILES:
        download_from_s3(filename)

    # Load the data
    x_train = pd.read_csv('xtrain2.csv')
    y_train = pd.read_csv('ytrain2.csv')
    x_test = pd.read_csv('xtest2.csv')
    y_test = pd.read_csv('ytest2.csv')
finally:
    # Clean up the files to not clutter the notebook's directory. This runs
    # even when a download or a read fails, so no partial set is left behind.
    for filename in DATASET_FILES:
        if os.path.exists(filename):
            os.remove(filename)

# Initialize the RandomForestRegressor with default parameters
rf = RandomForestRegressor()

# Train the model
rf.fit(x_train, y_train.values.ravel()) # y_train.values.ravel() to convert it to 1D array if y_train is a DataFrame

# Make predictions
predictions = rf.predict(x_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Display predictions next to the first column of y_test
prediction_df = pd.DataFrame(data={'Actual': y_test.iloc[:,0], 'Predicted': predictions})
print(prediction_df.head())


Mean Squared Error: 0.5563026118860795
   Actual  Predicted
0      13      11.53
1      17      17.00
2      10      10.00
3       6       5.98
4      11      11.03


In [8]:
# Keep only the rows whose stationId equals "Liseberg södra".
# `dataframe` must be a frame that still has the stationId column.
df_mossen = dataframe.loc[dataframe['stationId'] == "Liseberg södra"]

# Minimum, maximum and mean of the bike count for that station, one per line.
print(
    df_mossen["TotalAvailableBikes"].min(),
    df_mossen["TotalAvailableBikes"].max(),
    df_mossen["TotalAvailableBikes"].mean(),
    sep="\n",
)
# Distinct coordinate values recorded for the station.
print(df_mossen["Long"].unique(), df_mossen["Lat"].unique(), sep="\n")

NameError: name 'dataframe' is not defined

In [84]:
# Bare expression: shows `dataframe` as this cell's output.
dataframe

Unnamed: 0,AvailableBikes,StationId,Distance,stationId,IsOpen,timestamp,BikeIds,LastUpdate,Long,Lat,...,Hour,Minute,Temperature,Humidity,Windspeed,Precipitation,Visibility,Snowfall,TotalAvailableBikes,IsWeekend
0,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:00:18.009125,"['711963', '711856', '711664', '711204', '7111...",/Date(1697587218103+0200)/,11.951773,57.690247,...,0,0,2.0,96,3.3,0.0,24140.0,0.0,5,0
1,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:10:18.581522,"['711963', '711856', '711664', '711204', '7111...",/Date(1697587818685+0200)/,11.951773,57.690247,...,0,10,2.0,96,3.3,0.0,24140.0,0.0,5,0
2,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:20:18.436062,"['711963', '711856', '711664', '711204', '7111...",/Date(1697588418531+0200)/,11.951773,57.690247,...,0,20,2.0,96,3.3,0.0,24140.0,0.0,5,0
3,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:30:18.143687,"['711963', '711856', '711664', '711204', '7111...",/Date(1697589018254+0200)/,11.951773,57.690247,...,0,30,2.0,96,3.3,0.0,24140.0,0.0,5,0
4,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:40:17.817040,"['711963', '711856', '711664', '711204', '7111...",/Date(1697589617929+0200)/,11.951773,57.690247,...,0,40,2.0,96,3.3,0.0,24140.0,0.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275274,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:10:18.758052,"['711999', '711252', '710881', '710841', '7102...",/Date(1698793818901+0100)/,11.976702,57.698429,...,23,10,6.8,74,10.1,0.0,24140.0,0.0,7,0
275275,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:20:18.796382,"['711999', '711252', '710881', '710841', '7102...",/Date(1698794418904+0100)/,11.976702,57.698429,...,23,20,6.8,74,10.1,0.0,24140.0,0.0,7,0
275276,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:30:18.581008,"['711999', '711252', '710881', '710841', '7102...",/Date(1698795018688+0100)/,11.976702,57.698429,...,23,30,6.8,74,10.1,0.0,24140.0,0.0,7,0
275277,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:40:19.024793,"['711999', '711252', '710881', '710841', '7102...",/Date(1698795619129+0100)/,11.976702,57.698429,...,23,40,6.8,74,10.1,0.0,24140.0,0.0,7,0


In [30]:
import http.client
import urllib.parse
from predictor_handler import PredictorHandler
import json
from datetime import datetime

API_ENDPOINT = "api.open-meteo.com"
PATH = "/v1/forecast"


def fetch_weather_data(latitude=52.52, longitude=13.41, timeout=10.0):
    """Request hourly weather from the forecast API and return one reading.

    Args:
        latitude: Latitude sent in the request. Defaults to the value that was
            previously hard-coded here.
        longitude: Longitude sent in the request. Defaults to the value that
            was previously hard-coded here.
        timeout: Timeout in seconds for the HTTPS connection.

    Returns:
        A dict with the keys "Temperature", "Humidity", "Windspeed",
        "Precipitation", "Visibility" and "Snowfall".

    Raises:
        Exception: If the API answers with a status other than 200.

    NOTE(review): the default coordinates differ from the station coordinates
    used as model features elsewhere in this notebook; callers that want the
    weather at a station should pass that station's coordinates.
    """
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": "temperature_2m,relativehumidity_2m,windspeed_10m,precipitation,visibility,snowfall",
    }
    url = f"{PATH}?{urllib.parse.urlencode(params)}"

    conn = http.client.HTTPSConnection(API_ENDPOINT, timeout=timeout)
    try:
        conn.request("GET", url)
        response = conn.getresponse()
        if response.status != 200:
            raise Exception("Failed to retrieve weather data.")
        # The body must be read before the connection is closed below.
        weather_data = json.loads(response.read())
    finally:
        # Always release the socket, on success and on failure.
        conn.close()

    hourly_data = weather_data["hourly"]
    # Index 0 is the first entry of each hourly series in the response.
    # NOTE(review): presumably that is the first hour of the forecast window,
    # not necessarily the current hour - confirm against the API docs.
    return {
        "Temperature": hourly_data["temperature_2m"][0],
        "Humidity": hourly_data["relativehumidity_2m"][0],
        "Windspeed": hourly_data["windspeed_10m"][0],
        "Precipitation": hourly_data["precipitation"][0],
        "Visibility": hourly_data["visibility"][0],
        "Snowfall": hourly_data["snowfall"][0],
    }


if __name__ == "__main__":
    # Station coordinates passed to the model as the Lat/Long features.
    LAT, LONG = 57.69669, 11.972278

    # NOTE(review): called without arguments, so the weather request is not
    # tied to LAT/LONG above.
    weather_data = fetch_weather_data()

    # Current system time (naive datetime from datetime.now()).
    now = datetime.now()

    # Single-row model input: station values, the current timestamp split into
    # its parts, and the six weather readings copied from `weather_data`.
    input_df = PredictorHandler(
        IsOpen=True,
        Long=LONG,
        Lat=LAT,
        Year=now.year,
        Month=now.month,
        Day=now.day,
        Hour=now.hour,
        Minute=now.minute,
        # weekday() is 5 or 6 on Saturday/Sunday: 1 for weekends, 0 otherwise.
        IsWeekend=1 if now.weekday() >= 5 else 0,
        **{
            name: weather_data[name]
            for name in (
                "Temperature",
                "Humidity",
                "Windspeed",
                "Precipitation",
                "Visibility",
                "Snowfall",
            )
        },
    ).create_dataframe()


# Show the assembled row as the cell output.
input_df

Unnamed: 0,IsOpen,Long,Lat,Year,Month,Day,Hour,Minute,Temperature,Humidity,Windspeed,Precipitation,Visibility,Snowfall,IsWeekend
0,True,11.972278,57.69669,2023,11,8,10,32,7.8,86,9.9,0.0,24140.0,0.0,0


In [2]:
import argparse
import pandas as pd
import boto3
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from io import StringIO
import logging


def read_csv_from_s3(bucket_name, file_path):
    """Download an S3 object, decode it as UTF-8 and parse it as CSV.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        file_path: Key of the object inside the bucket.

    Returns:
        The parsed CSV content as a DataFrame.
    """
    response = boto3.client("s3").get_object(Bucket=bucket_name, Key=file_path)
    # The whole body is read into memory and decoded before parsing.
    csv_text = response["Body"].read().decode("utf-8")
    return pd.read_csv(StringIO(csv_text))


def upload_metrics_to_s3(bucket_name, metrics, file_path):
    """Write `metrics` as a one-row CSV object to S3.

    Args:
        bucket_name: Name of the destination S3 bucket.
        metrics: Mapping of metric name to value; it becomes a single CSV row
            with one column per key.
        file_path: Key of the object to write inside the bucket.
    """
    s3 = boto3.client("s3")
    metrics_str = StringIO()
    pd.DataFrame([metrics]).to_csv(metrics_str, index=False)
    s3.put_object(Bucket=bucket_name, Key=file_path, Body=metrics_str.getvalue())
    # Log the bucket that was actually written to, not a fixed name.
    logging.info(f"Metrics uploaded to S3 bucket {bucket_name}")


def train_and_evaluate(
    df,
    bucket_name,
    metrics_key="metrics/metrics-full-dataset.csv",
    model_path="/opt/ml/model/model.joblib",
):
    """Train a random forest on `df`, upload hold-out metrics and save the model.

    This code used to sit inside the body of upload_metrics_to_s3, where it ran
    after every upload and failed on names that were never defined there. It is
    now a separate function that receives its inputs explicitly.

    Args:
        df: Frame containing every column listed in COLUMNS_TO_KEEP below.
        bucket_name: S3 bucket that receives the metrics CSV.
        metrics_key: Key of the metrics object inside the bucket.
        model_path: Local path the fitted model is written to with joblib.

    Returns:
        A `(model, metrics)` tuple: the fitted regressor and the metrics dict.
    """
    # Feature columns plus the target column ("TotalAvailableBikes").
    COLUMNS_TO_KEEP = [
        "IsOpen",
        "Long",
        "Lat",
        "Year",
        "Month",
        "Day",
        "Hour",
        "Minute",
        "Temperature",
        "Humidity",
        "Windspeed",
        "Precipitation",
        "Visibility",
        "Snowfall",
        "IsWeekend",
        "TotalAvailableBikes",
    ]

    df = df[COLUMNS_TO_KEEP]

    X = df.drop("TotalAvailableBikes", axis=1)
    y = df["TotalAvailableBikes"]

    # 0.5% test data only for metrics
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.005, random_state=42)

    model = RandomForestRegressor()
    model.fit(xtrain, ytrain)

    # metrics to upload to s3
    ypred = model.predict(xtest)
    metrics = {
        "mean_absolute_error": mean_absolute_error(ytest, ypred),
        "mean_squared_error": mean_squared_error(ytest, ypred),
        "r2_score": r2_score(ytest, ypred),
    }

    upload_metrics_to_s3(bucket_name, metrics, metrics_key)

    joblib.dump(model, model_path)
    return model, metrics


TypeError: string indices must be integers

In [9]:
import argparse
import pandas as pd
import boto3
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from io import StringIO
import logging

def upload_metrics_to_s3(bucket_name, metrics, file_path):
    """Write `metrics` as a one-row CSV object to S3.

    Args:
        bucket_name: Name of the destination S3 bucket.
        metrics: Mapping of metric name to value; it becomes a single CSV row
            with one column per key.
        file_path: Key of the object to write inside the bucket.
    """
    s3 = boto3.client("s3")
    metrics_str = StringIO()
    pd.DataFrame([metrics]).to_csv(metrics_str, index=False)
    s3.put_object(Bucket=bucket_name, Key=file_path, Body=metrics_str.getvalue())
    # Log the bucket that was actually written to, not a fixed name.
    logging.info(f"Metrics uploaded to S3 bucket {bucket_name}")


# Impute missing visibility readings with the same fixed value used in the
# earlier feature-selection cell.
df["Visibility"] = df["Visibility"].fillna(23180)

# Feature columns plus the target column ("TotalAvailableBikes").
COLUMNS_TO_KEEP = [
        "IsOpen",
        "Long",
        "Lat",
        "Year",
        "Month",
        "Day",
        "Hour",
        "Minute",
        "Temperature",
        "Humidity",
        "Windspeed",
        "Precipitation",
        "Visibility",
        "Snowfall",
        "IsWeekend",
        "TotalAvailableBikes",
    ]

df = df[COLUMNS_TO_KEEP]

# Train on at most 10 000 rows. The sample is seeded so the seeded split below
# is reproducible, and capped at len(df) so smaller frames do not raise.
df = df.sample(n=min(10000, len(df)), random_state=42)

X = df.drop("TotalAvailableBikes", axis=1)
y = df["TotalAvailableBikes"]

# 1% test data only for metrics
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.01, random_state=42)

# Seeded so repeated runs produce the same model and metrics.
model = RandomForestRegressor(random_state=42)
model.fit(xtrain, ytrain)

# metrics to upload to s3
ypred = model.predict(xtest)
mae = mean_absolute_error(ytest, ypred)
mse = mean_squared_error(ytest, ypred)

metrics = {
    "mean_absolute_error": mae,
    "mean_squared_error": mse,
    "r2_score": r2_score(ytest, ypred),
}

# NOTE(review): the object key says "full-dataset", but these metrics come
# from the sampled frame above - confirm the intended key.
upload_metrics_to_s3("sagemaker-eu-north-1-796717305864", metrics, "metrics/metrics-full-dataset.csv")