# Notebook for transforming Weather data

### 1. Read data to dataframe and drop NaN values

In [1]:
import pandas as pd

# Load the weather export and keep only the rows in which every column has a
# value. The intermediate frame is not bound to a name, so only the cleaned
# frame stays in the kernel namespace.
df = (
    pd.read_csv('Weather.csv')
    .dropna()
)
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3936 entries, 0 to 3935
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   time                     3936 non-null   object 
 1   temperature_2m (°C)      3929 non-null   float64
 2   relativehumidity_2m (%)  3929 non-null   float64
 3   precipitation (mm)       3929 non-null   float64
 4   cloudcover (%)           3929 non-null   float64
 5   windspeed_10m (m/s)      3929 non-null   float64
dtypes: float64(5), object(1)
memory usage: 184.6+ KB


Unnamed: 0,time,temperature_2m (°C),relativehumidity_2m (%),precipitation (mm),cloudcover (%),windspeed_10m (m/s)
0,2022-09-25T00:00,16.1,94.0,0.0,0.0,2.24
1,2022-09-25T01:00,15.5,97.0,0.0,0.0,2.20
2,2022-09-25T02:00,15.0,98.0,0.0,14.0,2.05
3,2022-09-25T03:00,14.5,99.0,0.0,90.0,2.05
4,2022-09-25T04:00,14.1,99.0,0.0,90.0,2.62
...,...,...,...,...,...,...
3924,2023-03-07T12:00,10.2,62.0,0.0,80.0,2.58
3925,2023-03-07T13:00,10.4,62.0,0.0,59.0,3.33
3926,2023-03-07T14:00,10.4,63.0,0.0,59.0,4.03
3927,2023-03-07T15:00,10.4,64.0,0.0,64.0,3.58


### 2. Handling time formats. Creating DateKey and Weather_time columns. Dropping unnecessary time-related columns

In [9]:
# Parse the 'time' strings (e.g. '2022-09-25T00:00') once and derive both new
# columns from the parsed values, instead of splitting the text into temporary
# Date/Time/Year/Month/Day/Hour/Minute columns and joining them row by row.
weather_time = pd.to_datetime(df['time'])

df = (
    df
    .assign(
        # Integer key in YYYYMMDDHH form, e.g. 2022092500.
        DateKey=weather_time.dt.strftime('%Y%m%d%H').astype(int),
        # Full timestamp as a datetime64 column.
        Weather_time=weather_time,
    )
    # The raw string column is redundant once it has been parsed.
    .drop(columns=['time'])
)
df.head()

Unnamed: 0,temperature_2m (°C),relativehumidity_2m (%),precipitation (mm),cloudcover (%),windspeed_10m (m/s),DateKey,Weather_time
0,16.1,94.0,0.0,0.0,2.24,2022092500,2022-09-25 00:00:00
1,15.5,97.0,0.0,0.0,2.20,2022092501,2022-09-25 01:00:00
2,15.0,98.0,0.0,14.0,2.05,2022092502,2022-09-25 02:00:00
3,14.5,99.0,0.0,90.0,2.05,2022092503,2022-09-25 03:00:00
4,14.1,99.0,0.0,90.0,2.62,2022092504,2022-09-25 04:00:00
...,...,...,...,...,...,...,...
3924,10.2,62.0,0.0,80.0,2.58,2023030712,2023-03-07 12:00:00
3925,10.4,62.0,0.0,59.0,3.33,2023030713,2023-03-07 13:00:00
3926,10.4,63.0,0.0,59.0,4.03,2023030714,2023-03-07 14:00:00
3927,10.4,64.0,0.0,64.0,3.58,2023030715,2023-03-07 15:00:00


### 3. Renaming columns, setting DateKey as index and saving to file

In [10]:
# Short names for the measurement columns (the raw headers carry the unit).
short_names = {
    'temperature_2m (°C)': 'Temperature',
    'relativehumidity_2m (%)': 'Humidity',
    'precipitation (mm)': 'Rainfall',
    'cloudcover (%)': 'Clouds',
    'windspeed_10m (m/s)': 'Wind',
}

# Rename the columns, then make DateKey the row index.
df = df.rename(columns=short_names).set_index('DateKey')

# Save the transformed frame as a comma-separated file.
df.to_csv('Weather_final.csv', sep=',')
df.head()

Unnamed: 0_level_0,Temperature,Humidity,Rainfall,Clouds,Wind,Weather_time
DateKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022092500,16.1,94.0,0.0,0.0,2.24,2022-09-25 00:00:00
2022092501,15.5,97.0,0.0,0.0,2.20,2022-09-25 01:00:00
2022092502,15.0,98.0,0.0,14.0,2.05,2022-09-25 02:00:00
2022092503,14.5,99.0,0.0,90.0,2.05,2022-09-25 03:00:00
2022092504,14.1,99.0,0.0,90.0,2.62,2022-09-25 04:00:00
...,...,...,...,...,...,...
2023030712,10.2,62.0,0.0,80.0,2.58,2023-03-07 12:00:00
2023030713,10.4,62.0,0.0,59.0,3.33,2023-03-07 13:00:00
2023030714,10.4,63.0,0.0,59.0,4.03,2023-03-07 14:00:00
2023030715,10.4,64.0,0.0,64.0,3.58,2023-03-07 15:00:00
