In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Read the weather observations and parse the 'date_time' column as timestamps
df = pd.read_csv('weather_data.csv')
df['date_time'] = pd.to_datetime(df['date_time'])

# Beutenberg rows: keep only the timestamp and the temperature, renamed so the
# location stays visible after the merge
df_beutenberg = df.loc[
    df['location'] == 'Beutenberg', ['date_time', 'temperature_celsius']
].rename(columns={'temperature_celsius': 'temperature_celsius_Beutenberg'})

# Saaleaue rows: keep every column and append "_Saaleaue" to all names except
# the join key 'date_time'
df_saaleaue = df.loc[df['location'] == 'Saaleaue'].rename(
    columns=lambda name: name if name == 'date_time' else name + "_Saaleaue"
)

# Inner join on 'date_time': only timestamps present for both locations survive
df_merged = df_beutenberg.merge(df_saaleaue, on='date_time', how='inner')

# df_merged now holds 'date_time', 'temperature_celsius_Beutenberg' and every
# Saaleaue column carrying the '_Saaleaue' suffix

In [3]:
# List the merged column names to check the '_Beutenberg' / '_Saaleaue' suffixes
df_merged.columns

Index(['date_time', 'temperature_celsius_Beutenberg', 'pressure_mbar_Saaleaue',
       'temperature_celsius_Saaleaue', 'dew_point_celsius_Saaleaue',
       'relative_humidity_percent_Saaleaue',
       'vapor_pressure_actual_mbar_Saaleaue',
       'specific_humidity_g_per_kg_Saaleaue',
       'air_density_g_per_m_cubed_Saaleaue', 'wind_velocity_m_per_s_Saaleaue',
       'wind_direction_degrees_Saaleaue', 'rainfall_mm_Saaleaue',
       'location_Saaleaue', 'season_Saaleaue'],
      dtype='object')

In [4]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell
import statsmodels.api as sm

# Saaleaue measurements used to explain the Beutenberg temperature
predictors = [
    f'{measurement}_Saaleaue'
    for measurement in (
        'pressure_mbar',
        'temperature_celsius',
        'relative_humidity_percent',
        'wind_velocity_m_per_s',
    )
]

# Response is the Beutenberg temperature; the design matrix is the selected
# predictors plus a 'const' column so the model estimates an intercept
Y = df_merged['temperature_celsius_Beutenberg']
X = sm.add_constant(df_merged.loc[:, predictors])

In [5]:
# Display the design matrix: 'const' column plus the selected Saaleaue predictors
X

Unnamed: 0,const,pressure_mbar_Saaleaue,temperature_celsius_Saaleaue,relative_humidity_percent_Saaleaue,wind_velocity_m_per_s_Saaleaue
0,1.0,1007.56,13.06,44.91,2.46
1,1.0,1007.18,13.60,43.63,3.14
2,1.0,1006.96,14.79,41.85,1.32
3,1.0,1006.77,14.81,41.89,2.03
4,1.0,1006.31,16.04,37.87,1.79
...,...,...,...,...,...
31951,1.0,988.02,8.39,73.01,3.22
31952,1.0,988.71,8.39,71.35,2.91
31953,1.0,988.88,8.41,69.21,3.44
31954,1.0,988.89,8.07,71.16,2.65


In [6]:
# Display the response: Beutenberg temperature from the merged frame
Y

0        11.54
1        12.33
2        12.88
3        12.91
4        13.99
         ...  
31951     7.44
31952     7.45
31953     7.11
31954     7.37
31955     7.57
Name: temperature_celsius_Beutenberg, Length: 31956, dtype: float64

In [7]:
# Fit ordinary least squares of Y on X, then print the regression summary table
model = sm.OLS(endog=Y, exog=X).fit()
print(model.summary())

                                  OLS Regression Results                                  
Dep. Variable:     temperature_celsius_Beutenberg   R-squared:                       0.964
Model:                                        OLS   Adj. R-squared:                  0.964
Method:                             Least Squares   F-statistic:                 2.161e+05
Date:                            Tue, 19 Nov 2024   Prob (F-statistic):               0.00
Time:                                    22:43:48   Log-Likelihood:                -58694.
No. Observations:                           31956   AIC:                         1.174e+05
Df Residuals:                               31951   BIC:                         1.174e+05
Df Model:                                       4                                         
Covariance Type:                        nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025

# Categorical predictor: season

In [20]:
# Keep only the Beutenberg rows, with all of their columns
df_beutenberg_full = df.loc[df['location'] == 'Beutenberg']

# One-hot encode 'season' as integer 0/1 columns.
# drop_first=True omits the first category so the dummies are not perfectly
# collinear with the intercept added in the next cell (the dummy variable trap).
# get_dummies orders string categories alphabetically, so the omitted reference
# season is 'autumn' and the remaining columns are 'spring', 'summer', 'winter'.
season_dummies = pd.get_dummies(
    df_beutenberg_full['season'], drop_first=True, dtype=int
)

In [21]:
# NOTE(review): duplicate of the earlier statsmodels import; consider keeping a
# single import in the imports cell
import statsmodels.api as sm

# Response is the Beutenberg temperature; the design matrix is the season
# dummies with a 'const' column prepended so the model estimates an intercept
Y = df_beutenberg_full['temperature_celsius']
X = sm.add_constant(season_dummies, prepend=True)

In [22]:
# Display the design matrix: 'const' column plus the season dummy columns
X

Unnamed: 0,const,spring,summer,winter
0,1.0,0,0,1
1,1.0,0,0,1
2,1.0,0,0,1
3,1.0,0,0,1
4,1.0,0,0,1
...,...,...,...,...
35071,1.0,0,0,1
35072,1.0,0,0,1
35073,1.0,0,0,1
35074,1.0,0,0,1


In [23]:
# Display the response: 'temperature_celsius' for the Beutenberg rows
Y

0        0.34
1       -0.21
2       -0.05
3       -1.02
4       -1.84
         ... 
35071    7.44
35072    7.45
35073    7.11
35074    7.37
35075    7.57
Name: temperature_celsius, Length: 35076, dtype: float64

In [24]:
# Fit ordinary least squares of temperature on the season dummies,
# then print the regression summary table
model = sm.OLS(endog=Y, exog=X).fit()
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     temperature_celsius   R-squared:                       0.521
Model:                             OLS   Adj. R-squared:                  0.521
Method:                  Least Squares   F-statistic:                 1.270e+04
Date:                 Tue, 19 Nov 2024   Prob (F-statistic):               0.00
Time:                         22:48:28   Log-Likelihood:            -1.0957e+05
No. Observations:                35076   AIC:                         2.191e+05
Df Residuals:                    35072   BIC:                         2.192e+05
Df Model:                            3                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.9861      0.059    186.94

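In the model summary, you'll see an intercept (`const`) and coefficients for three of the seasons (`spring`, `summer` and `winter`). The fourth season, `autumn`, was dropped to avoid perfect multicollinearity with the intercept, so it serves as the reference category. The intercept is the mean temperature in the reference season (about 10.99 °C here), and each season coefficient is the expected difference in temperature between that season and autumn. Because season is the only predictor in this model, these coefficients are simply differences between seasonal mean temperatures.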

For example, if the coefficient for winter is -5, it means that, on average, the temperature in Beutenberg is expected to be 5 degrees Celsius lower in winter compared to autumn (the reference season). Similarly, positive coefficients for other seasons indicate higher temperatures compared to the reference season.

This approach allows you to quantify the impact of each season on Beutenberg's temperature while accounting for the categorical nature of the season variable.