In [1]:
# Author: Geethu Thottungal Harilal
# data from : https://power.larc.nasa.gov/data-access-viewer/

# This code checks for missing data, formats the date column, handles the columns with outliers,
# and changes the column names into more meaningful ones for the Scotland area


### Imports

In [2]:
# import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


## Reading dataset

In [3]:
# Load the daily Scotland export (1981 - July 2023, per the file name) into a DataFrame
data = pd.read_csv(filepath_or_buffer="Scotland_daily_1981-2023 July.csv")

In [4]:
# Dimensions of the freshly loaded frame as (rows, columns)
data.shape

(15551, 24)

In [5]:
# Preview the first five rows of the raw data
data.head()

Unnamed: 0,YEAR,MO,DY,T2M,T2MDEW,T2MWET,TS,T2M_RANGE,T2M_MAX,T2M_MIN,...,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WD10M,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M
0,1981,1,2,5.87,5.48,5.68,5.15,4.95,7.37,2.43,...,10.31,12.17,8.17,4.0,263.25,13.89,15.72,11.02,4.7,264.75
1,1981,1,3,1.56,0.73,1.15,1.15,5.92,3.97,-1.96,...,9.61,12.02,5.22,6.8,291.5,12.61,15.5,8.0,7.5,292.94
2,1981,1,4,-1.95,-3.17,-2.56,-2.2,2.2,-0.88,-3.08,...,7.2,10.3,3.85,6.46,315.44,9.5,12.83,5.58,7.25,316.56
3,1981,1,5,-1.37,-2.36,-1.87,-1.66,5.09,1.53,-3.56,...,6.41,13.37,2.51,10.86,284.12,8.54,17.3,3.48,13.84,285.31
4,1981,1,6,-1.27,-1.92,-1.6,-2.57,4.68,0.72,-3.97,...,3.09,7.3,0.21,7.09,208.62,4.77,9.96,0.27,9.69,209.25


In [6]:
# List the raw column labels before any renaming
data.columns

Index(['YEAR', 'MO', 'DY', 'T2M', 'T2MDEW', 'T2MWET', 'TS', 'T2M_RANGE',
       'T2M_MAX', 'T2M_MIN', 'QV2M', 'RH2M', 'PRECTOTCORR', 'PS', 'WS10M',
       'WS10M_MAX', 'WS10M_MIN', 'WS10M_RANGE', 'WD10M', 'WS50M', 'WS50M_MAX',
       'WS50M_MIN', 'WS50M_RANGE', 'WD50M'],
      dtype='object')

# General Cleaning Techniques

### 1. Combining the YEAR, MO and DY columns into a Date column

In [7]:
# pd.to_datetime can assemble a datetime from a frame whose columns are
# named year / month / day, so relabel the three date-part columns first.
date_parts = data[["YEAR", "MO", "DY"]].rename(
    columns={"YEAR": "year", "MO": "month", "DY": "day"}
)
data["date"] = pd.to_datetime(date_parts)

In [8]:
# The YEAR, MO and DY columns are now redundant with 'date'; drop them
data = data.drop(columns=["YEAR", "MO", "DY"])

In [9]:
# Reorder so that 'date' leads and every other column keeps its relative position
trailing_cols = [name for name in data.columns if name != "date"]
data = data[["date", *trailing_cols]]

In [10]:
# Preview the frame now that 'date' replaces YEAR/MO/DY and sits in the first position
data.head()

Unnamed: 0,date,T2M,T2MDEW,T2MWET,TS,T2M_RANGE,T2M_MAX,T2M_MIN,QV2M,RH2M,...,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WD10M,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M
0,1981-01-02,5.87,5.48,5.68,5.15,4.95,7.37,2.43,5.92,97.38,...,10.31,12.17,8.17,4.0,263.25,13.89,15.72,11.02,4.7,264.75
1,1981-01-03,1.56,0.73,1.15,1.15,5.92,3.97,-1.96,4.27,94.25,...,9.61,12.02,5.22,6.8,291.5,12.61,15.5,8.0,7.5,292.94
2,1981-01-04,-1.95,-3.17,-2.56,-2.2,2.2,-0.88,-3.08,3.11,92.69,...,7.2,10.3,3.85,6.46,315.44,9.5,12.83,5.58,7.25,316.56
3,1981-01-05,-1.37,-2.36,-1.87,-1.66,5.09,1.53,-3.56,3.3,94.06,...,6.41,13.37,2.51,10.86,284.12,8.54,17.3,3.48,13.84,285.31
4,1981-01-06,-1.27,-1.92,-1.6,-2.57,4.68,0.72,-3.97,3.42,96.25,...,3.09,7.3,0.21,7.09,208.62,4.77,9.96,0.27,9.69,209.25


In [11]:
# Original column label -> more readable label.
# NOTE(review): the meanings in the trailing comments are inferred from the
# new names - confirm against the data source's parameter dictionary.
column_mapping = {
    "PS": "SP",                 # presumably surface pressure
    "QV2M": "SH2M",             # presumably specific humidity at 2 m
    "PRECTOTCORR": "Rainfall",  # precipitation
    "TS": "TSkin",              # presumably skin temperature
}

In [12]:
# Apply column_mapping to the column labels; labels absent from the mapping stay as they are
data = data.rename(column_mapping, axis="columns")

In [13]:
# Confirm the renamed labels (SP, SH2M, Rainfall, TSkin) are in place
data.columns

Index(['date', 'T2M', 'T2MDEW', 'T2MWET', 'TSkin', 'T2M_RANGE', 'T2M_MAX',
       'T2M_MIN', 'SH2M', 'RH2M', 'Rainfall', 'SP', 'WS10M', 'WS10M_MAX',
       'WS10M_MIN', 'WS10M_RANGE', 'WD10M', 'WS50M', 'WS50M_MAX', 'WS50M_MIN',
       'WS50M_RANGE', 'WD50M'],
      dtype='object')

In [14]:
# Preview the first rows again, now with the renamed columns
data.head()

Unnamed: 0,date,T2M,T2MDEW,T2MWET,TSkin,T2M_RANGE,T2M_MAX,T2M_MIN,SH2M,RH2M,...,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WD10M,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M
0,1981-01-02,5.87,5.48,5.68,5.15,4.95,7.37,2.43,5.92,97.38,...,10.31,12.17,8.17,4.0,263.25,13.89,15.72,11.02,4.7,264.75
1,1981-01-03,1.56,0.73,1.15,1.15,5.92,3.97,-1.96,4.27,94.25,...,9.61,12.02,5.22,6.8,291.5,12.61,15.5,8.0,7.5,292.94
2,1981-01-04,-1.95,-3.17,-2.56,-2.2,2.2,-0.88,-3.08,3.11,92.69,...,7.2,10.3,3.85,6.46,315.44,9.5,12.83,5.58,7.25,316.56
3,1981-01-05,-1.37,-2.36,-1.87,-1.66,5.09,1.53,-3.56,3.3,94.06,...,6.41,13.37,2.51,10.86,284.12,8.54,17.3,3.48,13.84,285.31
4,1981-01-06,-1.27,-1.92,-1.6,-2.57,4.68,0.72,-3.97,3.42,96.25,...,3.09,7.3,0.21,7.09,208.62,4.77,9.96,0.27,9.69,209.25


### 2. checking for duplicates

In [15]:
# Row/column count before de-duplication, for comparison with the count afterwards
data.shape

(15551, 22)

In [16]:
# Drop fully duplicated rows (the first occurrence of each is kept).
# The result is rebound instead of using inplace=True, matching the
# assign-the-result style of the other transformation cells.
data = data.drop_duplicates()

In [17]:
# Row/column count after de-duplication; an unchanged row count means no duplicate rows were dropped
data.shape

(15551, 22)

#### No duplicates found

In [18]:
# Inspect the dtype of each column
data.dtypes

date           datetime64[ns]
T2M                   float64
T2MDEW                float64
T2MWET                float64
TSkin                 float64
T2M_RANGE             float64
T2M_MAX               float64
T2M_MIN               float64
SH2M                  float64
RH2M                  float64
Rainfall              float64
SP                    float64
WS10M                 float64
WS10M_MAX             float64
WS10M_MIN             float64
WS10M_RANGE           float64
WD10M                 float64
WS50M                 float64
WS50M_MAX             float64
WS50M_MIN             float64
WS50M_RANGE           float64
WD50M                 float64
dtype: object

#### Every column except `date` (datetime64) contains float64 values

### 3. Missing Values

In [19]:
# Number of missing (NaN/NaT) entries in each column
data.isna().sum()

date           0
T2M            0
T2MDEW         0
T2MWET         0
TSkin          0
T2M_RANGE      0
T2M_MAX        0
T2M_MIN        0
SH2M           0
RH2M           0
Rainfall       0
SP             0
WS10M          0
WS10M_MAX      0
WS10M_MIN      0
WS10M_RANGE    0
WD10M          0
WS50M          0
WS50M_MAX      0
WS50M_MIN      0
WS50M_RANGE    0
WD50M          0
dtype: int64

In [20]:
# isnull() is an alias of isna(), so this repeats the per-column missing-value count
data.isnull().sum()

date           0
T2M            0
T2MDEW         0
T2MWET         0
TSkin          0
T2M_RANGE      0
T2M_MAX        0
T2M_MIN        0
SH2M           0
RH2M           0
Rainfall       0
SP             0
WS10M          0
WS10M_MAX      0
WS10M_MIN      0
WS10M_RANGE    0
WD10M          0
WS50M          0
WS50M_MAX      0
WS50M_MIN      0
WS50M_RANGE    0
WD50M          0
dtype: int64

#### No missing or null values found!

#### Check for outliers: count occurrences of the -999 sentinel value in each column

In [21]:
# Count how many times the sentinel value appears in each column.
# A vectorised equality test replaces the per-column value_counts() lookup,
# which built a full frequency table for every column just to read one entry.
# Columns that cannot hold the value (e.g. the datetime 'date' column)
# compare unequal everywhere and therefore report 0.
target_value = -999.0
counts = data.eq(target_value).sum()

print(counts)

date           0
T2M            0
T2MDEW         0
T2MWET         0
TSkin          0
T2M_RANGE      0
T2M_MAX        0
T2M_MIN        0
SH2M           0
RH2M           0
Rainfall       0
SP             0
WS10M          0
WS10M_MAX      0
WS10M_MIN      0
WS10M_RANGE    0
WD10M          0
WS50M          0
WS50M_MAX      0
WS50M_MIN      0
WS50M_RANGE    0
WD50M          0
dtype: int64


In [22]:
# Final (rows, columns) check of the cleaned frame
data.shape

(15551, 22)

In [23]:
# Persist the cleaned frame as CSV, leaving out the row index
data.to_csv(path_or_buf="Scotland_dataset_cleaned.csv", index=False)

#### ################################################################################################