In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import warnings
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
weather_set = pd.read_csv("data/weather_burbank_airport.csv")

In [3]:
weather_set

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0


In [4]:
weather_set.duplicated().sum()

np.int64(0)

In [5]:
weather_set.dtypes

city                        object
timestamp                   object
temperature                float64
cloud_cover                float64
cloud_cover_description     object
pressure                   float64
windspeed                  float64
precipitation              float64
felt_temperature           float64
dtype: object

# Feature Engineering

In [6]:
weather_set['temperature_difference'] = abs(weather_set['felt_temperature'] - weather_set['temperature'])

In [7]:
weather_set

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature,temperature_difference
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0,1.0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0,0.0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0,0.0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0,0.0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0,0.0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0,0.0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0,0.0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0,0.0


In [8]:
nan_counts = weather_set.isna().sum()
nan_counts_sorted = nan_counts.sort_values(ascending=False)

for column, count in nan_counts_sorted.items():
    print(f"Feature: {column}, Count of NaN-Values: {count}")

Feature: windspeed, Count of NaN-Values: 86
Feature: felt_temperature, Count of NaN-Values: 26
Feature: temperature_difference, Count of NaN-Values: 26
Feature: temperature, Count of NaN-Values: 25
Feature: cloud_cover, Count of NaN-Values: 20
Feature: cloud_cover_description, Count of NaN-Values: 20
Feature: pressure, Count of NaN-Values: 8
Feature: city, Count of NaN-Values: 0
Feature: timestamp, Count of NaN-Values: 0
Feature: precipitation, Count of NaN-Values: 0


-> Idee: NaN Werte mit dem Durchschnitt oder "Randoms" die den Durchschnitt ergeben auffüllen -> verlieren keine Rows

In [9]:
for column in weather_set.columns:
    unique_values = weather_set[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'city': ['Burbank']
Unique values in column 'timestamp': ['2018-01-01 08:53:00' '2018-01-01 09:53:00' '2018-01-01 10:53:00' ...
 '2021-01-01 05:53:00' '2021-01-01 06:53:00' '2021-01-01 07:53:00']
Unique values in column 'temperature': [ 9.  8.  7. 12. 16. 19. 21. 22. 23. 17. 15. 14. 13. 18. 24. 27. 25. 20.
 11. 10. 26. 28.  6.  4. 29. 30.  5.  3.  2. 31. 33. 34. 32. nan 36. 37.
 42. 43. 44. 45. 41. 39. 38. 40. 35. 46.]
Unique values in column 'cloud_cover': [33. 21. 29. 30. 34. 28. 26. 27. 20. 11. 12. 40.  4. 38. nan 47. 22. 19.]
Unique values in column 'cloud_cover_description': ['Fair' 'Haze' 'Partly Cloudy' 'Mostly Cloudy' 'Cloudy' 'Fog' 'Light Rain'
 'Rain' 'Heavy Rain' 'Heavy Rain / Windy' 'Light Rain / Windy' 'T-Storm'
 'Fair / Windy' 'Cloudy / Windy' 'Mostly Cloudy / Windy'
 'Partly Cloudy / Windy' 'Thunder in the Vicinity' 'Thunder' nan 'Smoke'
 'Light Rain with Thunder' 'Heavy T-Storm' 'Rain / Windy' 'Blowing Dust']
Unique values in column 'pressure': [

In [10]:
frequency = weather_set['cloud_cover_description'].value_counts(dropna=False)

print(frequency)

cloud_cover_description
Fair                       17122
Cloudy                      4936
Partly Cloudy               2668
Mostly Cloudy               1830
Light Rain                   896
Haze                         579
Smoke                        329
Fog                          325
Rain                         247
Heavy Rain                   120
Fair / Windy                  74
NaN                           20
T-Storm                       18
Thunder in the Vicinity       17
Partly Cloudy / Windy         14
Mostly Cloudy / Windy         10
Light Rain / Windy            10
Cloudy / Windy                 9
Heavy Rain / Windy             7
Blowing Dust                   5
Heavy T-Storm                  4
Rain / Windy                   2
Thunder                        1
Light Rain with Thunder        1
Name: count, dtype: int64


Feature Engineering:
- Konzept überlegen, mit dem man Überschneidungen zuteilt
- Idee: "Partly" "Light" Heavy" als eigenes Feature implementieren und die Einträge überarbeiten

In [11]:
weather_set['timestamp'] = pd.to_datetime(weather_set['timestamp'])

In [None]:
# weather_set['date'] = weather_set['timestamp'].str.split(' ').str[0]
# weather_set['time'] = weather_set['timestamp'].str.split(' ').str[1]

AttributeError: Can only use .str accessor with string values!

In [15]:
weather_set['date_year'] = weather_set['timestamp'].dt.year
weather_set['date_month'] = weather_set['timestamp'].dt.month
weather_set['date_day'] = weather_set['timestamp'].dt.day

In [16]:
weather_set['time'] = weather_set['timestamp'].dt.time

WICHTIG: Spalten mit unnötigen Wörtern händisch umbennen

In [None]:
weather_set['cloud_cover_description'] = weather_set['cloud_cover_description'].fillna('')

unique_features = set()
for entry in weather_set['cloud_cover_description']:
    # Trennen bei Leerzeichen und Slashes
    words = entry.replace('/', ' ').split()
    unique_features.update(words)
for feature in unique_features:
    weather_set[feature] = weather_set['cloud_cover_description'].apply(lambda x: 1 if feature in x else 0)

In [19]:
weather_set

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature,temperature_difference,...,Mostly,Rain,Windy,Heavy,the,Dust,Thunder,Fog,in,Cloudy
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


'Thunder in the Vicinity' = thunder_in_vicinity
Light Rain with Thunder = Light / Rain Thunder
Blowing Dust = blowing_dust