# PFDA Project

### by E. Qejvani
***

## Part 1 - Preparing the dataset.
***

* Importing the libraries.
* Keeping the information we are interested in.
* Preparing the dataset for analysis.

In [311]:
# Importing the libraries.
import pandas as pd 
from project_functions import *

In [312]:
# Loading the raw CSV into a data frame. The first 23 rows are skipped because the dataset starts at row 24.
# low_memory=False makes pandas parse the file in one pass instead of in chunks, so each column gets a
# single consistently inferred dtype (no mixed-type columns in a big file), at the cost of more memory.
df = pd.read_csv('./data/data_valentia.csv', skiprows=23, low_memory=False)
# Building the project subset from an explicit list of the columns I need.
# .copy() makes df_valentia independent of df, so later changes do not touch the raw frame.
columns_to_keep = ['date', 'rain', 'temp', 'wetb', 'dewpt', 'rhum', 'vappr', 'msl', 'wdsp', 'wddir']
df_valentia = df.loc[:, columns_to_keep].copy()
# Another way of keeping only the columns of interest is to drop the others with df.drop(columns=[...]),
# listing 'ww', 'sun', 'w', 'vis', 'clht', 'clamt', 'ind', 'ind.1', 'ind.2', 'ind.3', 'ind.4'.
df_valentia


Unnamed: 0,date,rain,temp,wetb,dewpt,rhum,vappr,msl,wdsp,wddir
0,01-jan-1944 00:00,0.0,9.3,8.3,7.2,86,10.1,1034.3,12,300
1,01-jan-1944 01:00,0.0,8.9,8.2,7.2,90,10.3,1033.9,9,290
2,01-jan-1944 02:00,0.0,9.4,8.1,6.6,83,9.7,1033.4,11,280
3,01-jan-1944 03:00,0.0,9.3,8.3,7.2,86,10.2,1032.8,11,280
4,01-jan-1944 04:00,0.5,8.6,7.8,7.2,89,9.9,1032.4,11,300
...,...,...,...,...,...,...,...,...,...,...
709315,30-nov-2024 20:00,0.0,13.3,12.4,11.7,90,13.8,1006.8,11,190
709316,30-nov-2024 21:00,0.0,13.2,12.4,11.7,90,13.8,1007.0,11,190
709317,30-nov-2024 22:00,0.0,13.0,12.4,11.8,92,13.9,1007.0,11,190
709318,30-nov-2024 23:00,0.0,13.0,12.4,11.8,92,13.8,1006.6,11,190


In [313]:
# Checking for missing data: number of NaN/None values in each column.
# Blank strings such as ' ' are not treated as missing by isnull(), so they are not counted here.
missing_per_column = df_valentia.isnull().sum()
missing_per_column

date     0
rain     0
temp     0
wetb     0
dewpt    0
rhum     0
vappr    0
msl      0
wdsp     0
wddir    0
dtype: int64

In [314]:
# isna() is the same check as isnull() (isnull is an alias of isna), so this repeats the previous
# cell and gives identical per-column counts.
df_valentia.isna().sum()

date     0
rain     0
temp     0
wetb     0
dewpt    0
rhum     0
vappr    0
msl      0
wdsp     0
wddir    0
dtype: int64

In [315]:
# Counting, for each column, the cells whose value is exactly the string ' ' (a single space).
df_valentia.eq(' ').sum()

date       0
rain      42
temp      60
wetb      60
dewpt     60
rhum      60
vappr     60
msl       60
wdsp      45
wddir    109
dtype: int64

In [316]:
# Summarising how many rows contain 0, 1, 2, ... empty or blank cells.
# NOTE(review): this helper is not defined in the notebook; presumably it comes from
# `from project_functions import *`. Judging by the output, it prints the counts and also returns them.
count_rows_with_empty_or_space_cells_detail(df_valentia)

Rows with 0 empty cells: 709144
Rows with 1 empty cells: 75
Rows with 2 empty cells: 41
Rows with 6 empty cells: 22
Rows with 7 empty cells: 37
Rows with 8 empty cells: 1


0    709144
1        75
2        41
6        22
7        37
8         1
Name: count, dtype: int64

In [None]:
# Removing the rows that contain missing data (helper presumably from project_functions).
# NOTE(review): the return value is not assigned, so this only changes df_valentia if the helper
# modifies the frame in place - confirm; if it returns a new frame, the result must be assigned.
# NOTE(review): this cell has no execution count; re-run the notebook top to bottom to confirm it ran.
remove_rows_with_missing_data(df_valentia)

In [318]:
# Checking for data types used in my dataset.
# NOTE(review): every column is reported as object (not numeric or datetime) - presumably because the
# blank ' ' cells counted earlier made pandas read the columns as text; confirm before doing any maths.
df_valentia.dtypes

date     object
rain     object
temp     object
wetb     object
dewpt    object
rhum     object
vappr    object
msl      object
wdsp     object
wddir    object
dtype: object