In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt 
import seaborn as sns

# Lift pandas' column limit so wide frames are rendered with every column.
pd.options.display.max_columns = None


In [2]:
# Read the comma-separated weather export. The first 31 lines of the file are
# skipped before parsing (presumably a descriptive preamble — confirm against
# the file); the first remaining line supplies the column headers.
df = pd.read_csv(
    '../data/Weather/WeatherData.txt',
    sep=',',
    skiprows=31,
)
df

Unnamed: 0,# STN,YYYYMMDD,HH,DD,FH,FF,FX,T,T10N,TD,SQ,Q,DR,RH,P,VV,N,U,WW,IX,M,R,S,O,Y
0,260,20210101,1,230,10,10,20,-25,,-26,0,0,0,0,10059,1,0,98,35,7,1,0,0,0,1
1,260,20210101,2,230,10,10,20,-32,,-34,0,0,0,0,10061,0,1,98,35,7,1,0,0,0,1
2,260,20210101,3,230,20,20,30,-27,,-29,0,0,0,0,10064,4,7,98,35,7,1,0,0,0,1
3,260,20210101,4,220,20,20,30,-11,,-14,0,0,0,0,10064,15,8,98,20,7,1,0,0,0,1
4,260,20210101,5,230,20,20,40,11,,5,0,0,0,0,10064,18,8,95,10,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,260,20231231,20,200,80,70,140,86,,50,0,0,0,-1,9951,75,8,77,81,7,0,1,0,0,0
26276,260,20231231,21,200,70,70,150,82,,56,0,0,0,-1,9954,70,8,84,81,7,0,1,0,0,0
26277,260,20231231,22,200,70,70,140,81,,56,0,0,3,2,9955,70,8,84,81,7,0,1,0,0,0
26278,260,20231231,23,210,80,80,150,86,,50,0,0,0,-1,9959,75,8,78,23,7,0,1,0,0,0


In [3]:
# Readable replacements for the terse raw headers, written in the same
# left-to-right order as the columns of the loaded file. The trailing comment
# on each row names the raw header(s) that row replaces.
new_column_names = [
    "Station", "Date", "Hour",                                   # STN, YYYYMMDD, HH
    "WindDirection",                                             # DD
    "WindSpeedAvg60min", "WindSpeedAvg10min", "WindGust",        # FH, FF, FX
    "Temperature", "MinTemperature6hour", "DewPoint",            # T, T10N, TD
    "Sunshineperhour", "GlobalRadiation",                        # SQ, Q
    "PrecipitationDuration", "HourlyPrecipitationAmount",        # DR, RH
    "Pressure", "HorizontalVisibility", "CloudCover",            # P, VV, N
    "RelativeAtmosphericHumidity",                               # U
    "WeatherCode", "IndicatorWeatherCode",                       # WW, IX
    "Fog", "Rain", "Snow", "Thunder", "IceFormation",            # M, R, S, O, Y
]

# The assignment is purely positional: the i-th name replaces the i-th header,
# and pandas raises if the number of names differs from the number of columns.
df.columns = new_column_names

In [4]:
# Display the frame again to confirm the readable column names took effect.
df

Unnamed: 0,Station,Date,Hour,WindDirection,WindSpeedAvg60min,WindSpeedAvg10min,WindGust,Temperature,MinTemperature6hour,DewPoint,Sunshineperhour,GlobalRadiation,PrecipitationDuration,HourlyPrecipitationAmount,Pressure,HorizontalVisibility,CloudCover,RelativeAtmosphericHumidity,WeatherCode,IndicatorWeatherCode,Fog,Rain,Snow,Thunder,IceFormation
0,260,20210101,1,230,10,10,20,-25,,-26,0,0,0,0,10059,1,0,98,35,7,1,0,0,0,1
1,260,20210101,2,230,10,10,20,-32,,-34,0,0,0,0,10061,0,1,98,35,7,1,0,0,0,1
2,260,20210101,3,230,20,20,30,-27,,-29,0,0,0,0,10064,4,7,98,35,7,1,0,0,0,1
3,260,20210101,4,220,20,20,30,-11,,-14,0,0,0,0,10064,15,8,98,20,7,1,0,0,0,1
4,260,20210101,5,230,20,20,40,11,,5,0,0,0,0,10064,18,8,95,10,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,260,20231231,20,200,80,70,140,86,,50,0,0,0,-1,9951,75,8,77,81,7,0,1,0,0,0
26276,260,20231231,21,200,70,70,150,82,,56,0,0,0,-1,9954,70,8,84,81,7,0,1,0,0,0
26277,260,20231231,22,200,70,70,140,81,,56,0,0,3,2,9955,70,8,84,81,7,0,1,0,0,0
26278,260,20231231,23,210,80,80,150,86,,50,0,0,0,-1,9959,75,8,78,23,7,0,1,0,0,0


In [5]:
# Combine Date and Hour into a single datetime column.
# Date is parsed with an explicit YYYYMMDD format instead of relying on format
# inference. Hour is shifted down by one, so Hour 1 maps to 00:00 of that day.
timestamps = (
    pd.to_datetime(df['Date'].astype(str), format='%Y%m%d')
    + pd.to_timedelta(df['Hour'] - 1, unit='h')
)

# Date and Hour are intentionally kept alongside the new Datetime column.

# Move the Datetime column to the front. The result is built without mutating
# the previously displayed frame, and .copy() makes it an independent frame so
# that later column assignments cannot raise SettingWithCopyWarning.
ordered_columns = ['Datetime'] + [col for col in df.columns if col != 'Datetime']
df = df.assign(Datetime=timestamps)[ordered_columns].copy()
df

Unnamed: 0,Datetime,Station,Date,Hour,WindDirection,WindSpeedAvg60min,WindSpeedAvg10min,WindGust,Temperature,MinTemperature6hour,DewPoint,Sunshineperhour,GlobalRadiation,PrecipitationDuration,HourlyPrecipitationAmount,Pressure,HorizontalVisibility,CloudCover,RelativeAtmosphericHumidity,WeatherCode,IndicatorWeatherCode,Fog,Rain,Snow,Thunder,IceFormation
0,2021-01-01 00:00:00,260,20210101,1,230,10,10,20,-25,,-26,0,0,0,0,10059,1,0,98,35,7,1,0,0,0,1
1,2021-01-01 01:00:00,260,20210101,2,230,10,10,20,-32,,-34,0,0,0,0,10061,0,1,98,35,7,1,0,0,0,1
2,2021-01-01 02:00:00,260,20210101,3,230,20,20,30,-27,,-29,0,0,0,0,10064,4,7,98,35,7,1,0,0,0,1
3,2021-01-01 03:00:00,260,20210101,4,220,20,20,30,-11,,-14,0,0,0,0,10064,15,8,98,20,7,1,0,0,0,1
4,2021-01-01 04:00:00,260,20210101,5,230,20,20,40,11,,5,0,0,0,0,10064,18,8,95,10,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,2023-12-31 19:00:00,260,20231231,20,200,80,70,140,86,,50,0,0,0,-1,9951,75,8,77,81,7,0,1,0,0,0
26276,2023-12-31 20:00:00,260,20231231,21,200,70,70,150,82,,56,0,0,0,-1,9954,70,8,84,81,7,0,1,0,0,0
26277,2023-12-31 21:00:00,260,20231231,22,200,70,70,140,81,,56,0,0,3,2,9955,70,8,84,81,7,0,1,0,0,0
26278,2023-12-31 22:00:00,260,20231231,23,210,80,80,150,86,,50,0,0,0,-1,9959,75,8,78,23,7,0,1,0,0,0


In [6]:
# NOTE: this cell rescales columns of `df` under the same name, so it is not
# idempotent — running it twice divides the temperatures by 10 twice. Re-run
# the notebook from the top instead of re-running this cell on its own.

# Missing measurements appear as blank strings in the text columns. Turn every
# empty or whitespace-only entry into NaN (not just the exact five-space
# string) and do it before any numeric conversion so blanks cannot break
# astype(float).
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

# Both temperature columns hold integer tenths (presumably 0.1 °C per the
# source file's header — confirm). Dividing by 10 instead of multiplying by
# 0.1 avoids float artefacts such as 86 * 0.1 == 8.600000000000001.
df['Temperature'] = df['Temperature'].astype(float) / 10
df['MinTemperature6hour'] = df['MinTemperature6hour'].astype(float) / 10

df

Unnamed: 0,Datetime,Station,Date,Hour,WindDirection,WindSpeedAvg60min,WindSpeedAvg10min,WindGust,Temperature,MinTemperature6hour,DewPoint,Sunshineperhour,GlobalRadiation,PrecipitationDuration,HourlyPrecipitationAmount,Pressure,HorizontalVisibility,CloudCover,RelativeAtmosphericHumidity,WeatherCode,IndicatorWeatherCode,Fog,Rain,Snow,Thunder,IceFormation
0,2021-01-01 00:00:00,260,20210101,1,230,10,10,20,-2.5,,-26,0,0,0,0,10059,1,0,98,35,7,1,0,0,0,1
1,2021-01-01 01:00:00,260,20210101,2,230,10,10,20,-3.2,,-34,0,0,0,0,10061,0,1,98,35,7,1,0,0,0,1
2,2021-01-01 02:00:00,260,20210101,3,230,20,20,30,-2.7,,-29,0,0,0,0,10064,4,7,98,35,7,1,0,0,0,1
3,2021-01-01 03:00:00,260,20210101,4,220,20,20,30,-1.1,,-14,0,0,0,0,10064,15,8,98,20,7,1,0,0,0,1
4,2021-01-01 04:00:00,260,20210101,5,230,20,20,40,1.1,,5,0,0,0,0,10064,18,8,95,10,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,2023-12-31 19:00:00,260,20231231,20,200,80,70,140,8.6,,50,0,0,0,-1,9951,75,8,77,81,7,0,1,0,0,0
26276,2023-12-31 20:00:00,260,20231231,21,200,70,70,150,8.2,,56,0,0,0,-1,9954,70,8,84,81,7,0,1,0,0,0
26277,2023-12-31 21:00:00,260,20231231,22,200,70,70,140,8.1,,56,0,0,3,2,9955,70,8,84,81,7,0,1,0,0,0
26278,2023-12-31 22:00:00,260,20231231,23,210,80,80,150,8.6,,50,0,0,0,-1,9959,75,8,78,23,7,0,1,0,0,0


In [8]:
# Per-column count of null entries.
missing_values = df.isna().sum()

# The same counts expressed as a percentage of the total number of rows.
missing_percentage = missing_values.div(len(df)).mul(100)

# Put both measures side by side, one row per column of df.
missing_data = pd.concat(
    [
        missing_values.rename('Missing Values'),
        missing_percentage.rename('Percentage'),
    ],
    axis=1,
)

# Show the missing-value summary.
missing_data

Unnamed: 0,Missing Values,Percentage
Datetime,0,0.0
Station,0,0.0
Date,0,0.0
Hour,0,0.0
WindDirection,0,0.0
WindSpeedAvg60min,0,0.0
WindSpeedAvg10min,0,0.0
WindGust,0,0.0
Temperature,0,0.0
MinTemperature6hour,21900,83.333333


In [None]:
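# Optional export of the cleaned frame; left commented out so that running the notebook does not write a file.
# NOTE(review): this path uses '../Data' while the input is read from '../data' — confirm the directory case before enabling.
# df.to_csv('../Data/Weather/WeatherDatacleaned.csv', index=False)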