In [1]:
# Author: Geethu Thottungal Harilal
# data from : https://power.larc.nasa.gov/data-access-viewer/

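# This notebook checks for missing data, builds the date column, handles columns with outliers,
# and renames columns to more meaningful names for the England-area dataset.
# NOTE(review): no outlier handling appears in the cells shown here - confirm or update this summary.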


### Imports

In [2]:
# import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


## Reading dataset

In [3]:
# Load the daily England weather export (see the data-source link at the top of the notebook)
raw_csv_path = "England_daily_1981-2023 July.csv"
data = pd.read_csv(raw_csv_path)

In [4]:
# Dimensions (rows, columns) of the frame right after loading
data.shape

(15551, 24)

In [5]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,YEAR,MO,DY,PS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WD10M,QV2M,...,T2MWET,TS,T2M_RANGE,T2M_MAX,T2M_MIN,WD50M,WS50M_RANGE,WS50M_MIN,WS50M_MAX,WS50M
0,1981,1,2,100.18,10.08,11.63,7.12,4.52,259.75,5.74,...,6.13,6.18,4.82,8.14,3.32,261.56,5.87,9.84,15.71,13.6
1,1981,1,3,98.95,12.04,13.94,7.08,6.86,278.56,5.19,...,5.27,6.01,5.88,9.28,3.4,280.12,8.47,10.25,18.73,16.36
2,1981,1,4,99.81,8.08,13.4,5.24,8.16,298.94,3.72,...,0.55,0.98,3.79,3.27,-0.51,300.25,9.35,8.27,17.62,11.34
3,1981,1,5,100.97,5.11,11.06,2.84,8.23,285.31,3.54,...,-0.4,-0.42,6.94,3.81,-3.13,286.44,11.88,3.43,15.31,7.98
4,1981,1,6,100.92,6.09,11.78,1.88,9.89,262.75,4.39,...,2.42,2.33,4.71,4.83,0.12,263.69,13.02,3.18,16.2,8.91


In [6]:
# List the raw column labels as delivered in the CSV
data.columns

Index(['YEAR', 'MO', 'DY', 'PS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
       'WS10M_RANGE', 'WD10M', 'QV2M', 'RH2M', 'PRECTOTCORR', 'T2M', 'T2MDEW',
       'T2MWET', 'TS', 'T2M_RANGE', 'T2M_MAX', 'T2M_MIN', 'WD50M',
       'WS50M_RANGE', 'WS50M_MIN', 'WS50M_MAX', 'WS50M'],
      dtype='object')

# General Cleaning Techniques

### 1. Combining the YEAR, MO and DY columns into a date column

In [7]:
# Assemble a single datetime column from the YEAR / MO / DY parts.
# The parts are relabelled to year / month / day before being handed to
# pd.to_datetime, which builds one timestamp per row from them.
date_parts = data[["YEAR", "MO", "DY"]].rename(
    columns={"YEAR": "year", "MO": "month", "DY": "day"}
)
data['date'] = pd.to_datetime(date_parts)

In [8]:
# Drop the YEAR, MO and DY part columns now that 'date' carries the same information
data = data.drop(columns=['YEAR', 'MO', 'DY'])

In [9]:
# Put 'date' in the leading position; all other columns keep their current order
remaining = [name for name in data.columns if name != 'date']
data = data[['date', *remaining]]

In [10]:
# Preview after adding 'date' and removing the YEAR / MO / DY columns
data.head()

Unnamed: 0,date,PS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WD10M,QV2M,RH2M,PRECTOTCORR,...,T2MWET,TS,T2M_RANGE,T2M_MAX,T2M_MIN,WD50M,WS50M_RANGE,WS50M_MIN,WS50M_MAX,WS50M
0,1981-01-02,100.18,10.08,11.63,7.12,4.52,259.75,5.74,94.0,2.78,...,6.13,6.18,4.82,8.14,3.32,261.56,5.87,9.84,15.71,13.6
1,1981-01-03,98.95,12.04,13.94,7.08,6.86,278.56,5.19,83.5,0.77,...,5.27,6.01,5.88,9.28,3.4,280.12,8.47,10.25,18.73,16.36
2,1981-01-04,99.81,8.08,13.4,5.24,8.16,298.94,3.72,87.81,0.25,...,0.55,0.98,3.79,3.27,-0.51,300.25,9.35,8.27,17.62,11.34
3,1981-01-05,100.97,5.11,11.06,2.84,8.23,285.31,3.54,93.62,2.0,...,-0.4,-0.42,6.94,3.81,-3.13,286.44,11.88,3.43,15.31,7.98
4,1981-01-06,100.92,6.09,11.78,1.88,9.89,262.75,4.39,94.81,2.18,...,2.42,2.33,4.71,4.83,0.12,263.69,13.02,3.18,16.2,8.91


In [11]:
# Lookup table from the original parameter codes to more descriptive column labels.
# NOTE(review): the meanings in the trailing comments are presumed from the
# parameter codes - confirm against the data provider's parameter dictionary.
column_mapping = dict(
    PS="SP",                 # presumably surface pressure
    QV2M="SH2M",             # presumably specific humidity at 2 m
    PRECTOTCORR="Rainfall",  # presumably corrected total precipitation
    TS="TSkin",              # presumably skin temperature
)

In [12]:
# Apply the mapping to the column labels; labels not listed in it are left as they are
data = data.rename(column_mapping, axis="columns")

In [13]:
# Confirm the column labels after renaming
data.columns

Index(['date', 'SP', 'WS10M', 'WS10M_MAX', 'WS10M_MIN', 'WS10M_RANGE', 'WD10M',
       'SH2M', 'RH2M', 'Rainfall', 'T2M', 'T2MDEW', 'T2MWET', 'TSkin',
       'T2M_RANGE', 'T2M_MAX', 'T2M_MIN', 'WD50M', 'WS50M_RANGE', 'WS50M_MIN',
       'WS50M_MAX', 'WS50M'],
      dtype='object')

In [14]:
# Preview the first rows with the renamed columns
data.head()

Unnamed: 0,date,SP,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WD10M,SH2M,RH2M,Rainfall,...,T2MWET,TSkin,T2M_RANGE,T2M_MAX,T2M_MIN,WD50M,WS50M_RANGE,WS50M_MIN,WS50M_MAX,WS50M
0,1981-01-02,100.18,10.08,11.63,7.12,4.52,259.75,5.74,94.0,2.78,...,6.13,6.18,4.82,8.14,3.32,261.56,5.87,9.84,15.71,13.6
1,1981-01-03,98.95,12.04,13.94,7.08,6.86,278.56,5.19,83.5,0.77,...,5.27,6.01,5.88,9.28,3.4,280.12,8.47,10.25,18.73,16.36
2,1981-01-04,99.81,8.08,13.4,5.24,8.16,298.94,3.72,87.81,0.25,...,0.55,0.98,3.79,3.27,-0.51,300.25,9.35,8.27,17.62,11.34
3,1981-01-05,100.97,5.11,11.06,2.84,8.23,285.31,3.54,93.62,2.0,...,-0.4,-0.42,6.94,3.81,-3.13,286.44,11.88,3.43,15.31,7.98
4,1981-01-06,100.92,6.09,11.78,1.88,9.89,262.75,4.39,94.81,2.18,...,2.42,2.33,4.71,4.83,0.12,263.69,13.02,3.18,16.2,8.91


### 2. checking for duplicates

In [15]:
# Shape before dropping duplicates, for comparison with the shape afterwards
data.shape

(15551, 22)

In [16]:
# Remove fully duplicated rows (the first occurrence is kept).
# Rebinding the result instead of using inplace=True matches how the other
# cleaning cells update `data` and avoids mutating the frame in place.
data = data.drop_duplicates()

In [17]:
# Shape after dropping duplicates; an unchanged row count means no row was removed
data.shape

(15551, 22)

#### No duplicates found

In [18]:
# Inspect the dtype of every column
data.dtypes

date           datetime64[ns]
SP                    float64
WS10M                 float64
WS10M_MAX             float64
WS10M_MIN             float64
WS10M_RANGE           float64
WD10M                 float64
SH2M                  float64
RH2M                  float64
Rainfall              float64
T2M                   float64
T2MDEW                float64
T2MWET                float64
TSkin                 float64
T2M_RANGE             float64
T2M_MAX               float64
T2M_MIN               float64
WD50M                 float64
WS50M_RANGE           float64
WS50M_MIN             float64
WS50M_MAX             float64
WS50M                 float64
dtype: object

#### Every column is float64 except `date`, which is datetime64

### 3. Missing Values

In [19]:
# Per-column count of missing (NaN / NaT) entries
data.isna().sum()

date           0
SP             0
WS10M          0
WS10M_MAX      0
WS10M_MIN      0
WS10M_RANGE    0
WD10M          0
SH2M           0
RH2M           0
Rainfall       0
T2M            0
T2MDEW         0
T2MWET         0
TSkin          0
T2M_RANGE      0
T2M_MAX        0
T2M_MIN        0
WD50M          0
WS50M_RANGE    0
WS50M_MIN      0
WS50M_MAX      0
WS50M          0
dtype: int64

In [20]:
# Same per-column missing count via isnull(), which pandas documents as an alias of isna()
data.isnull().sum()

date           0
SP             0
WS10M          0
WS10M_MAX      0
WS10M_MIN      0
WS10M_RANGE    0
WD10M          0
SH2M           0
RH2M           0
Rainfall       0
T2M            0
T2MDEW         0
T2MWET         0
TSkin          0
T2M_RANGE      0
T2M_MAX        0
T2M_MIN        0
WD50M          0
WS50M_RANGE    0
WS50M_MIN      0
WS50M_MAX      0
WS50M          0
dtype: int64

#### No missing or null values found!

In [21]:
# Count how often a sentinel value occurs in each column.
# NOTE(review): -999 is presumably the fill value the data provider uses for
# missing observations - confirm against the header of the downloaded file.
target_value = -999.0

# Element-wise comparison followed by a column-wise sum is vectorised, so no
# per-column frequency table is built just to look up a single value.
# Columns that cannot equal the sentinel (e.g. 'date') simply count 0.
counts = data.eq(target_value).sum()

print(counts)

date           0
SP             0
WS10M          0
WS10M_MAX      0
WS10M_MIN      0
WS10M_RANGE    0
WD10M          0
SH2M           0
RH2M           0
Rainfall       0
T2M            0
T2MDEW         0
T2MWET         0
TSkin          0
T2M_RANGE      0
T2M_MAX        0
T2M_MIN        0
WD50M          0
WS50M_RANGE    0
WS50M_MIN      0
WS50M_MAX      0
WS50M          0
dtype: int64


In [22]:
# Final shape check before exporting
data.shape

(15551, 22)

In [23]:
# Write the cleaned data to CSV.
# index=False (an explicit boolean rather than the integer 0) leaves the
# row index out of the file.
data.to_csv("England_dataset_cleaned.csv", index=False)

#### ################################################################################################