In [3]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [4]:
weather_set = pd.read_csv("data/weather_burbank_airport.csv")

In [3]:
weather_set

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0


In [5]:
# check for duplicates
duplicate_counter = weather_set.duplicated().sum()
# duplicates = weather_set[weather_set.duplicated()]
print("sum of duplicates:", duplicate_counter)

sum of duplicates: 0


In [6]:
# check for none-values
nan_counts = weather_set.isna().sum()
nan_counts_sorted = nan_counts.sort_values(ascending=False)

for column, count in nan_counts_sorted.items():
    print(f"Feature: {column}, Count of NaN-Values: {count}")

Feature: windspeed, Count of NaN-Values: 86
Feature: felt_temperature, Count of NaN-Values: 26
Feature: temperature, Count of NaN-Values: 25
Feature: cloud_cover_description, Count of NaN-Values: 20
Feature: cloud_cover, Count of NaN-Values: 20
Feature: pressure, Count of NaN-Values: 8
Feature: city, Count of NaN-Values: 0
Feature: timestamp, Count of NaN-Values: 0
Feature: precipitation, Count of NaN-Values: 0


-> Idee: NaN Werte mit dem Durchschnitt oder "Randoms" die den Durchschnitt ergeben auffüllen -> verlieren keine Rows
-> Ich würde alle Na-rows rausschmeißen, es handelt sich hier bei um 139 rows von insgesamt knapp 30.000, außerdem war das Hauptargument bei dem charging_set, dass wir die Na-Werte auffüllen bei den userInputs damit wir den rest der spalten nicht verlieren, da wir die gesondert in einer Analyse verwenden müssen. Das haben wir hier auch nicht zwingend. Wir haben mehr als genug datenpunkte um ein modell zu trainieren auch ohne die Na-rows, daher würde Ich die einfach rausschmeißen um das ergebnis des modells nicht zu beeinflussen mit "falschen" Werten

In [7]:
# remove missing data
weather_set.dropna()

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0


In [7]:
weather_set.dtypes

city                        object
timestamp                   object
temperature                float64
cloud_cover                float64
cloud_cover_description     object
pressure                   float64
windspeed                  float64
precipitation              float64
felt_temperature           float64
dtype: object

# Outlier detection

In [8]:
# Check ranges of numeric columns
numeric_columns = weather_set.select_dtypes(include=[np.number])
for col in numeric_columns.columns:
    valid_values = numeric_columns.loc[numeric_columns[col] != -1, col]  # exclude -1, since we used it as a placeholder for NaN-values
    print(f"Column '{col}': min={valid_values.min()}, max={valid_values.max()}")

# Check unique values of categorical columns
categorical_columns = weather_set.select_dtypes(exclude=[np.number])
for col in categorical_columns.columns:
    print(f"Column '{col}': unique values={categorical_columns[col].unique()}")

Column 'temperature': min=2.0, max=46.0
Column 'cloud_cover': min=4.0, max=47.0
Column 'pressure': min=971.0, max=999.65
Column 'windspeed': min=0.0, max=57.0
Column 'precipitation': min=0.0, max=18.54
Column 'felt_temperature': min=0.0, max=42.0
Column 'city': unique values=['Burbank']
Column 'timestamp': unique values=['2018-01-01 08:53:00' '2018-01-01 09:53:00' '2018-01-01 10:53:00' ...
 '2021-01-01 05:53:00' '2021-01-01 06:53:00' '2021-01-01 07:53:00']
Column 'cloud_cover_description': unique values=['Fair' 'Haze' 'Partly Cloudy' 'Mostly Cloudy' 'Cloudy' 'Fog' 'Light Rain'
 'Rain' 'Heavy Rain' 'Heavy Rain / Windy' 'Light Rain / Windy' 'T-Storm'
 'Fair / Windy' 'Cloudy / Windy' 'Mostly Cloudy / Windy'
 'Partly Cloudy / Windy' 'Thunder in the Vicinity' 'Thunder' nan 'Smoke'
 'Light Rain with Thunder' 'Heavy T-Storm' 'Rain / Windy' 'Blowing Dust']


# Feature Engineering

In [8]:
weather_set['temperature_difference'] = abs(weather_set['felt_temperature'] - weather_set['temperature'])

In [9]:
for column in weather_set.columns:
    unique_values = weather_set[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'city': ['Burbank']
Unique values in column 'timestamp': ['2018-01-01 08:53:00' '2018-01-01 09:53:00' '2018-01-01 10:53:00' ...
 '2021-01-01 05:53:00' '2021-01-01 06:53:00' '2021-01-01 07:53:00']
Unique values in column 'temperature': [ 9.  8.  7. 12. 16. 19. 21. 22. 23. 17. 15. 14. 13. 18. 24. 27. 25. 20.
 11. 10. 26. 28.  6.  4. 29. 30.  5.  3.  2. 31. 33. 34. 32. nan 36. 37.
 42. 43. 44. 45. 41. 39. 38. 40. 35. 46.]
Unique values in column 'cloud_cover': [33. 21. 29. 30. 34. 28. 26. 27. 20. 11. 12. 40.  4. 38. nan 47. 22. 19.]
Unique values in column 'cloud_cover_description': ['Fair' 'Haze' 'Partly Cloudy' 'Mostly Cloudy' 'Cloudy' 'Fog' 'Light Rain'
 'Rain' 'Heavy Rain' 'Heavy Rain / Windy' 'Light Rain / Windy' 'T-Storm'
 'Fair / Windy' 'Cloudy / Windy' 'Mostly Cloudy / Windy'
 'Partly Cloudy / Windy' 'Thunder in the Vicinity' 'Thunder' nan 'Smoke'
 'Light Rain with Thunder' 'Heavy T-Storm' 'Rain / Windy' 'Blowing Dust']
Unique values in column 'pressure': [

In [10]:
frequency = weather_set['cloud_cover_description'].value_counts(dropna=False)

print(frequency)

cloud_cover_description
Fair                       17122
Cloudy                      4936
Partly Cloudy               2668
Mostly Cloudy               1830
Light Rain                   896
Haze                         579
Smoke                        329
Fog                          325
Rain                         247
Heavy Rain                   120
Fair / Windy                  74
NaN                           20
T-Storm                       18
Thunder in the Vicinity       17
Partly Cloudy / Windy         14
Mostly Cloudy / Windy         10
Light Rain / Windy            10
Cloudy / Windy                 9
Heavy Rain / Windy             7
Blowing Dust                   5
Heavy T-Storm                  4
Rain / Windy                   2
Thunder                        1
Light Rain with Thunder        1
Name: count, dtype: int64


Feature Engineering:
- Konzept überlegen, mit dem man Überschneidungen zuteilt
- Idee: "Partly" "Light" Heavy" als eigenes Feature implementieren und die Einträge überarbeiten

WICHTIG: Spalten mit unnötigen Wörtern händisch umbennen

In [11]:
weather_set['cloud_cover_description'] = weather_set['cloud_cover_description'].fillna('')

# Definieren der Sonderfälle und deren neue Bezeichnungen
special_cases = {
    'Thunder in the Vicinity': 'thunder_in_vicinity',
    'Light Rain with Thunder': 'light_rain_thunder',
    'Blowing Dust': 'blowing_dust'
}

# Liste von Wörtern, die ignoriert werden sollen
stop_words = {'in', 'the'}

# Set für alle einzigartigen Features
unique_features = set()

for entry in weather_set['cloud_cover_description']:
    # Sonderfälle zuerst ersetzen
    for case, replacement in special_cases.items():
        if case in entry:
            entry = entry.replace(case, replacement)
    
    # Trennen bei Leerzeichen und Slashes
    words = entry.replace('/', ' ').split()
    
    # Wörter zu den Features hinzufügen, außer Stoppwörter
    unique_features.update(word for word in words if word.lower() not in stop_words)

# Für jedes Feature eine neue Spalte erstellen
for feature in unique_features:
    weather_set[feature.lower()] = weather_set['cloud_cover_description'].apply(
        lambda x: 1 if feature in x else 0
    )

In [12]:
weather_set

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature,temperature_difference,...,smoke,light,blowing_dust,fog,windy,thunder_in_vicinity,light_rain_thunder,heavy,cloudy,haze
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
weather_set.columns

Index(['city', 'timestamp', 'temperature', 'cloud_cover',
       'cloud_cover_description', 'pressure', 'windspeed', 'precipitation',
       'felt_temperature', 'temperature_difference', 'mostly', 'partly',
       'rain', 'fair', 't-storm', 'thunder', 'smoke', 'light', 'blowing_dust',
       'fog', 'windy', 'thunder_in_vicinity', 'light_rain_thunder', 'heavy',
       'cloudy', 'haze'],
      dtype='object')

In [14]:
random_entry = weather_set.sample(n=1).iloc[0]

# Zellenwerte einzeln ausgeben
for column_name, value in random_entry.items():
    print(f"{column_name}: {value}")

city: Burbank
timestamp: 2019-01-09 10:53:00
temperature: 8.0
cloud_cover: 33.0
cloud_cover_description: Fair
pressure: 992.41
windspeed: 0.0
precipitation: 0.0
felt_temperature: 8.0
temperature_difference: 0.0
mostly: 0
partly: 0
rain: 0
fair: 1
t-storm: 0
thunder: 0
smoke: 0
light: 0
blowing_dust: 0
fog: 0
windy: 0
thunder_in_vicinity: 0
light_rain_thunder: 0
heavy: 0
cloudy: 0
haze: 0


In [15]:
# weather_set['timestamp'] = pd.to_datetime(weather_set['timestamp_date'])

# weather_set['timestamp_year'] = charging_set['connectionTime_date'].dt.year
# charging_set['timestamp_month'] = charging_set['connectionTime_date'].dt.month

In [16]:
# # split time-related data into year, month, day, time
# ### split connectionTime in separate date and time columns
# # Convert the column to datetime format
weather_set['timestamp'] = pd.to_datetime(weather_set['timestamp'])

weather_set['timestamp_year'] = weather_set['timestamp'].dt.year
weather_set['timestamp_month'] = weather_set['timestamp'].dt.month
weather_set['timestamp_day'] = weather_set['timestamp'].dt.day
weather_set['timestamp_time'] = weather_set['timestamp'].dt.time

In [17]:
# suggestion: season, isWeekday, time instead
weather_set['timestamp'] = pd.to_datetime(weather_set['timestamp'])

# 0: january, february, march = spring
# 1: april, may, june = summer
# 2: july, august, september = fall
# 3: oktober, november, december = winter 
weather_set['season'] = (weather_set['timestamp'].dt.month - 1) // 3
weather_set['isWeekday'] = weather_set['timestamp'].dt.dayofweek.apply(lambda x: 1 if x > 5 else 0)
weather_set['timestamp_time'] = weather_set['timestamp'].dt.time

In [18]:
weather_set

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature,temperature_difference,...,light_rain_thunder,heavy,cloudy,haze,timestamp_year,timestamp_month,timestamp_day,timestamp_time,season,isWeekday
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0,1.0,...,0,0,0,0,2018,1,1,08:53:00,0,0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,2018,1,1,09:53:00,0,0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0,0.0,...,0,0,0,1,2018,1,1,10:53:00,0,0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0,0.0,...,0,0,1,0,2018,1,1,11:53:00,0,0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0,0.0,...,0,0,0,0,2018,1,1,12:53:00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0,0.0,...,0,0,0,0,2021,1,1,03:53:00,0,0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0,0.0,...,0,0,0,0,2021,1,1,04:53:00,0,0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0,0.0,...,0,0,0,0,2021,1,1,05:53:00,0,0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0,0.0,...,0,0,0,0,2021,1,1,06:53:00,0,0


In [19]:
# weather_set.to_csv('data/cleaned_weather_set.csv', mode='w', index=False)