In [None]:
import os
import boto3
from datetime import datetime
from io import StringIO
import boto3
import pandas as pd
import json



SOURCE_BUCKET = "danneftw-dscrap-bucket"
WEATHER_KEY = "weather_data.csv"
BIKES_KEY = "output3.csv"

def get_data_from_s3(bucket_name: str, key: str) -> pd.DataFrame:
    s3_client = boto3.client("s3", region_name="eu-north-1")  # Need to declare s3_client here
    obj = s3_client.get_object(Bucket=bucket_name, Key=key)
    df = pd.read_csv(obj["Body"])
    if df.empty:
        raise Exception("No data found in S3 bucket")
    return df

weather_data = get_data_from_s3(SOURCE_BUCKET, WEATHER_KEY)
bikes_data = get_data_from_s3(SOURCE_BUCKET, BIKES_KEY)

weather_data.head()

In [2]:
import pandas as pd
import boto3

def read_csv_from_s3(bucket_name: str, file_path: str) -> pd.DataFrame:
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket_name, Key=file_path)
    df = pd.read_csv(obj['Body'])
    return df

bucket_name = "danneftw-dscrap-bucket"
file_path = "processed/station_bikes/2023-08-09-2023-11-02/StationaryStations.csv"
dataframe = read_csv_from_s3(bucket_name, file_path)


In [3]:
missing_visibility_df = dataframe[dataframe['Visibility'].isnull()]

# Get the timestamps where 'Visibility' is missing
missing_visibility_timestamps = missing_visibility_df['timestamp']

# Display the results
print("Dates where 'Visibility' is missing:")
print(missing_visibility_timestamps)

Dates where 'Visibility' is missing:
3119       2023-08-29 13:00:09.169000+00:00
3120       2023-08-29 13:10:09.226000+00:00
3121       2023-08-29 13:20:09.201000+00:00
3122       2023-08-29 13:30:09.165000+00:00
3123       2023-08-29 13:40:08.915000+00:00
                         ...               
1717684    2023-08-31 17:10:09.038000+00:00
1717685    2023-08-31 17:20:09.674000+00:00
1717686    2023-08-31 17:30:09.396000+00:00
1717687    2023-08-31 17:40:09.374000+00:00
1717688    2023-08-31 17:50:09.270000+00:00
Name: timestamp, Length: 46908, dtype: object


In [4]:
# set visibility to 0 if nan

dataframe['Visibility'] = dataframe['Visibility'].fillna(0)

In [41]:
dataframe.columns

Index(['AvailableBikes', 'StationId', 'Distance', 'stationId', 'IsOpen',
       'timestamp', 'BikeIds', 'LastUpdate', 'Long', 'Lat', 'Name', 'Year',
       'Month', 'Day', 'Hour', 'Minute', 'Temperature', 'Humidity',
       'Windspeed', 'Precipitation', 'Visibility', 'Snowfall',
       'TotalAvailableBikes', 'IsWeekend'],
      dtype='object')

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

COLUMNS_TO_KEEP = [
    "IsOpen",
    "Long",
    "Lat",
    "Year",
    "Month",
    "Day",
    "Hour",
    'Minute',
    "Temperature",
    "Humidity",
    "Windspeed",
    "Precipitation",
    "Visibility",
    "Snowfall",
    "IsWeekend",
    "TotalAvailableBikes",
]

df = dataframe[COLUMNS_TO_KEEP]

X = df.drop("TotalAvailableBikes", axis=1)
y = df["TotalAvailableBikes"]

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()

model.fit(xtrain, ytrain)

model.score(xtest, ytest)


0.9800410309200892

In [47]:
df_mossen = dataframe[dataframe['stationId']== "Sahlgrenska södra"]

print(df_mossen["TotalAvailableBikes"].min())
print(df_mossen["TotalAvailableBikes"].max())
print(df_mossen["TotalAvailableBikes"].mean())
print(df_mossen["Long"].unique())
print(df_mossen["Lat"].unique())

0
25
7.683026920551543
[11.963804]
[57.681919]


In [28]:
import pandas as pd
import requests
from datetime import datetime


class PredictorHandler:
    def __init__(
        self,
        IsOpen,
        Long,
        Lat,
        Year,
        Month,
        Day,
        Hour,
        Minute,
        Temperature,
        Humidity,
        Windspeed,
        Precipitation,
        Visibility,
        Snowfall,
        IsWeekend,
    ):
        self.IsOpen = IsOpen
        self.Long = Long
        self.Lat = Lat
        self.Year = Year
        self.Month = Month
        self.Day = Day
        self.Hour = Hour
        self.Minute = Minute

        self.Temperature = Temperature
        self.Humidity = Humidity
        self.Windspeed = Windspeed
        self.Precipitation = Precipitation
        self.Visibility = Visibility
        self.Snowfall = Snowfall
        self.IsWeekend = IsWeekend

    def create_dataframe(self):
        df = pd.DataFrame(
            {
                "IsOpen": [self.IsOpen],
                "Long": [self.Long],
                "Lat": [self.Lat],
                "Year": [self.Year],
                "Month": [self.Month],
                "Day": [self.Day],
                "Hour": [self.Hour],
                "Minute": [self.Minute],
                "Temperature": [self.Temperature],
                "Humidity": [self.Humidity],
                "Windspeed": [self.Windspeed],
                "Precipitation": [self.Precipitation],
                "Visibility": [self.Visibility],
                "Snowfall": [self.Snowfall],
                "IsWeekend": [self.IsWeekend],
            }
        )

        return df

In [48]:
import http.client
import urllib.parse
import json

API_ENDPOINT = "api.open-meteo.com"
PATH = "/v1/forecast"


def fetch_weather_data():
    params = {
        "latitude": 52.52,
        "longitude": 13.41,
        "hourly": "temperature_2m,relativehumidity_2m,windspeed_10m,precipitation,visibility,snowfall",
    }
    params_str = urllib.parse.urlencode(params)
    url = f"{PATH}?{params_str}"

    conn = http.client.HTTPSConnection(API_ENDPOINT)
    conn.request("GET", url)
    response = conn.getresponse()

    if response.status == 200:
        response_data = response.read()
        weather_data = json.loads(response_data)
        hourly_data = weather_data["hourly"]
        return {
            "Temperature": hourly_data["temperature_2m"][0],
            "Humidity": hourly_data["relativehumidity_2m"][0],
            "Windspeed": hourly_data["windspeed_10m"][0],
            "Precipitation": hourly_data["precipitation"][0],
            "Visibility": hourly_data["visibility"][0],
            "Snowfall": hourly_data["snowfall"][0],
        }
    else:
        raise Exception("Failed to retrieve weather data.")


if __name__ == "__main__":
    LAT = 57.681919
    LONG = 11.963804

    weather_data = fetch_weather_data()

    input_df = PredictorHandler(
        IsOpen=True,
        Long=LONG,
        Lat=LAT,
        Year=2023,
        Month=11,
        Day=6,
        Hour=13,
        Minute=10,
        Temperature=weather_data["Temperature"],
        Humidity=weather_data["Humidity"],
        Windspeed=weather_data["Windspeed"],  # Corrected to match the class definition
        Precipitation=weather_data["Precipitation"],
        Visibility=weather_data["Visibility"],
        Snowfall=weather_data["Snowfall"],
        IsWeekend=0,
    ).create_dataframe()



    result = model.predict(input_df)
    print(f"Number of bikes at location is {round(result[0])}")


Number of bikes at location is 19


: 

In [45]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import boto3
import os

# Define the S3 bucket and folder path
bucket_name = 'sagemaker-eu-north-1-796717305864'
folder_path = 'sagemaker/sklearncontainer/train-test-two-weeks/'

# Create an S3 client
s3_client = boto3.client('s3')

# Function to download the dataset from S3
def download_from_s3(filename):
    s3_client.download_file(bucket_name, f'{folder_path}{filename}', filename)

# Downloading the data from S3
download_from_s3('xtest2.csv')
download_from_s3('xtrain2.csv')
download_from_s3('ytest2.csv')
download_from_s3('ytrain2.csv')

# Load the data
x_train = pd.read_csv('xtrain2.csv')
y_train = pd.read_csv('ytrain2.csv')
x_test = pd.read_csv('xtest2.csv')
y_test = pd.read_csv('ytest2.csv')

# Initialize the RandomForestRegressor with default parameters
rf = RandomForestRegressor()

# Train the model
rf.fit(x_train, y_train.values.ravel()) # y_train.values.ravel() to convert it to 1D array if y_train is a DataFrame

# Make predictions
predictions = rf.predict(x_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Display predictions
prediction_df = pd.DataFrame(data={'Actual': y_test.iloc[:,0], 'Predicted': predictions})
print(prediction_df.head())

# Clean up the files to not clutter your notebook's directory
os.remove('xtrain2.csv')
os.remove('xtest2.csv')
os.remove('ytrain2.csv')
os.remove('ytest2.csv')


Mean Squared Error: 0.5563026118860795
   Actual  Predicted
0      13      11.53
1      17      17.00
2      10      10.00
3       6       5.98
4      11      11.03


In [83]:
df_mossen = dataframe[dataframe['stationId']== "Polstjärnegatan"]

print(df_mossen["TotalAvailableBikes"].min())
print(df_mossen["TotalAvailableBikes"].max())
print(df_mossen["TotalAvailableBikes"].mean())
print(df_mossen["Long"].unique())
print(df_mossen["Lat"].unique())

2
16
5.649949341438703
[11.933084]
[57.710429]


In [84]:
dataframe

Unnamed: 0,AvailableBikes,StationId,Distance,stationId,IsOpen,timestamp,BikeIds,LastUpdate,Long,Lat,...,Hour,Minute,Temperature,Humidity,Windspeed,Precipitation,Visibility,Snowfall,TotalAvailableBikes,IsWeekend
0,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:00:18.009125,"['711963', '711856', '711664', '711204', '7111...",/Date(1697587218103+0200)/,11.951773,57.690247,...,0,0,2.0,96,3.3,0.0,24140.0,0.0,5,0
1,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:10:18.581522,"['711963', '711856', '711664', '711204', '7111...",/Date(1697587818685+0200)/,11.951773,57.690247,...,0,10,2.0,96,3.3,0.0,24140.0,0.0,5,0
2,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:20:18.436062,"['711963', '711856', '711664', '711204', '7111...",/Date(1697588418531+0200)/,11.951773,57.690247,...,0,20,2.0,96,3.3,0.0,24140.0,0.0,5,0
3,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:30:18.143687,"['711963', '711856', '711664', '711204', '7111...",/Date(1697589018254+0200)/,11.951773,57.690247,...,0,30,2.0,96,3.3,0.0,24140.0,0.0,5,0
4,5,28045585,2484,Linnéplatsen (A),True,2023-10-18 00:40:17.817040,"['711963', '711856', '711664', '711204', '7111...",/Date(1697589617929+0200)/,11.951773,57.690247,...,0,40,2.0,96,3.3,0.0,24140.0,0.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275274,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:10:18.758052,"['711999', '711252', '710881', '710841', '7102...",/Date(1698793818901+0100)/,11.976702,57.698429,...,23,10,6.8,74,10.1,0.0,24140.0,0.0,7,0
275275,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:20:18.796382,"['711999', '711252', '710881', '710841', '7102...",/Date(1698794418904+0100)/,11.976702,57.698429,...,23,20,6.8,74,10.1,0.0,24140.0,0.0,7,0
275276,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:30:18.581008,"['711999', '711252', '710881', '710841', '7102...",/Date(1698795018688+0100)/,11.976702,57.698429,...,23,30,6.8,74,10.1,0.0,24140.0,0.0,7,0
275277,7,30921953,1173,Engelbrektsgatan,True,2023-10-31 23:40:19.024793,"['711999', '711252', '710881', '710841', '7102...",/Date(1698795619129+0100)/,11.976702,57.698429,...,23,40,6.8,74,10.1,0.0,24140.0,0.0,7,0
