In [None]:
# Base path (without extension) of the country shapefile; the cells below append
# '.dbf', '.prj', '.shx' and '.shp' to it
path = 'Country/Country'

In [None]:
import os
import pandas as pd
from dbfread import DBF
from pyproj import CRS
import xarray as xr
import geopandas as gpd
import rioxarray  # For handling geospatial data in xarray
import matplotlib.pyplot as plt
import seaborn as sns

### DBF

**Description** : 
A `.dbf` (DataBase File) is a **binary** file format for storing structured tabular data with metadata on field types and lengths. 
Commonly used with GIS (Geographic Information Systems) and database applications, it supports multiple data types like integers, strings, and dates.


In [None]:
# Load the .dbf file
# The table is iterated once and its records are collected into a DataFrame
table = DBF(f'{path}.dbf', encoding='utf-8') 
df = pd.DataFrame(iter(table))
# Preview the first rows of the attribute table
df.head()

In [None]:
# Print the DataFrame summary of the attribute table loaded from the .dbf file
df.info()

### PRJ file

**Description** : A `.prj` (Projection) file stores coordinate system and map projection information for geospatial data, typically in the WKT (Well-Known Text) format. 
It is used in GIS (Geographic Information Systems) to ensure accurate spatial alignment of geographic features.


In [None]:
# Read the projection definition (WKT text) stored next to the shapefile
with open(f'{path}.prj', 'r') as prj_file:
    prj_text = prj_file.read()

# Build a pyproj CRS object from the WKT string
crs = CRS.from_wkt(prj_text)

# Leaving the object as the last expression displays its details
crs


### SHX file

**Description** : An `.shx` (Shape Index) file is an index file for a shapefile (`.shp`) that stores offsets to the geometry data. 
It enables faster access and spatial queries by linking geometric shapes to their attribute data in a shapefile dataset.


In [None]:
# Load the shapefile into a GeoDataFrame
# NOTE(review): the path points at the .shx index file rather than the .shp;
# presumably the underlying driver resolves the sibling files — confirm
gdf = gpd.read_file(f'{path}.shx', encoding='utf-8')
# Preview the first 3 rows
gdf.head(3)

In [None]:
# Print the summary of the GeoDataFrame loaded above
gdf.info()

### SHP file

**Description** : A shapefile (`.shp`) is a geospatial vector data format used in GIS to represent geographic features, including points, lines, and polygons. 
It is typically accompanied by related files (`.shx`, `.dbf`, `.prj`) that store indexing, attribute data, and projection information.


In [None]:
# Read every country feature from the .shp file
gdf_shp = gpd.read_file(f'{path}.shp', encoding='utf-8')

# Keep only the record(s) whose CNTRY_NAME is Algeria
is_algeria = gdf_shp['CNTRY_NAME'] == 'Algeria'
gdf_algeria = gdf_shp[is_algeria]

# Display the geometry of the first matching record
gdf_algeria['geometry'].iloc[0]

* The following command will create a set of shapefile-related files (.shp, .shx, .dbf, .prj) in the same directory, storing geometry, indexing, attribute data, and projection information.

In [None]:
# gdf_algeria.to_file("Algeria/algeria.shp", driver="ESRI Shapefile")

In [None]:
# Load the Algeria-only shapefile.
# NOTE: 'Algeria/algeria.shp' only exists once the commented-out export cell
# (`gdf_algeria.to_file("Algeria/algeria.shp", ...)`) has been run at least once.
gdf_shp = gpd.read_file('Algeria/algeria.shp', encoding='utf-8')

# Plot Algeria
fig, ax = plt.subplots(figsize=(5, 5))
gdf_shp.plot(ax=ax, color="lightgreen", edgecolor="black")

# Title and axis labels are set on the Axes itself rather than through pyplot's
# "current axes" state, so they are guaranteed to land on this map
ax.set_title("Map of Algeria")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

plt.show()


### NC file (Climate)

**Description** : An `.nc` (NetCDF) file is a format for storing multidimensional scientific data, such as climate, atmospheric, and oceanographic data. It organizes data into variables with dimensions and metadata, supporting efficient storage, access, and sharing in research applications.

* A Coordinate Reference System (CRS) is a system that maps locations on Earth to specific coordinates, allowing spatial data from different sources to align accurately.

* WGS84 (World Geodetic System 1984) is a common CRS with the EPSG code 4326, which represents Earth using latitude and longitude.

PSurf = Surface Pressure

In [None]:
def data_filtering(nc_path, shp_path, country):
    """Clip a NetCDF dataset to the outline of one country.

    Parameters
    ----------
    nc_path : str
        Path of the NetCDF file to open.
    shp_path : str
        Path of the country shapefile *without* the ``.shp`` extension.
    country : str
        Value to look up in the shapefile's ``CNTRY_NAME`` column.

    Returns
    -------
    xarray.Dataset
        The dataset clipped to the country's geometry (clipped with ``drop=True``).

    Raises
    ------
    ValueError
        If no shapefile record has ``CNTRY_NAME == country``.
    """
    ds = xr.open_dataset(nc_path)

    # Load the country shapefile and keep only the requested country
    gdf = gpd.read_file(f'{shp_path}.shp', encoding="utf-8")
    gdf_country = gdf[gdf['CNTRY_NAME'] == country]
    if gdf_country.empty:
        raise ValueError(
            f"No feature with CNTRY_NAME == {country!r} found in {shp_path}.shp"
        )
    country_geometry = gdf_country.geometry

    # Only tag the dataset as WGS84 when it carries no CRS of its own, so an
    # existing CRS is never silently relabelled
    if ds.rio.crs is None:
        ds = ds.rio.write_crs("EPSG:4326")

    # The `crs` argument of `rio.clip` describes the *geometries*, so pass the
    # shapefile's CRS; fall back to the dataset CRS when the shapefile has none
    geometry_crs = gdf_country.crs if gdf_country.crs is not None else ds.rio.crs

    # Clip the NetCDF data to the country's geometry
    ds_country = ds.rio.clip(country_geometry, geometry_crs, drop=True)
    return ds_country

In [None]:
def save_nc_file(output_path, ds_country):
    """Save a (clipped) dataset to a NetCDF file.

    Parameters
    ----------
    output_path : str or os.PathLike
        Destination file. Its parent folder is created when it does not exist.
    ds_country : object exposing ``to_netcdf(path)``
        The dataset to write (e.g. the result of ``data_filtering``).
    """
    # `to_netcdf` is handed the path as-is, so make sure the target folder exists
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the clipped dataset to a NetCDF file
    ds_country.to_netcdf(output_path)

In [None]:
def plot_climat_data(nc_path, attribute, cbar_label, title, gdf_algerie, time_index=1):
    """Plot one time step of a NetCDF variable with a boundary drawn on top.

    Parameters
    ----------
    nc_path : str
        Path of the NetCDF file to open.
    attribute : str
        Name of the variable inside the dataset to plot.
    cbar_label : str
        Label shown next to the colour bar.
    title : str
        Title of the figure.
    gdf_algerie : geopandas.GeoDataFrame
        Geometry whose boundary is overlaid on the data.
    time_index : int, optional
        Positional index along the ``time`` dimension. Defaults to 1
        (the second time step, since indices are 0-based).
    """
    # The context manager closes the NetCDF file once the figure has been drawn
    with xr.open_dataset(nc_path) as ds:
        data = ds[attribute].isel(time=time_index)

        fig, ax = plt.subplots(figsize=(5, 5))

        # Plot the selected variable
        data.plot(ax=ax, cmap="coolwarm", cbar_kwargs={'label': cbar_label})

        # Overlay the boundary
        gdf_algerie.boundary.plot(ax=ax, edgecolor="black", linewidth=1)

        # Set title and labels on the Axes explicitly (after plotting, so they
        # replace whatever the plotting call set)
        ax.set_title(title)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        plt.show()


In [None]:
# Uncomment to filter the original data and save it:

# for doc_name in os.listdir('Climate-DATA'):
#     doc_path = os.path.join('Climate-DATA', doc_name)
#     ds_algeria = data_filtering(doc_path, path, 'Algeria')
#     save_nc_file(f'filtered_climat_data/{doc_name}', ds_algeria)


In [None]:
# Plot the 'Rainf' variable of one filtered file with the Algeria outline on top
plot_climat_data(
    'filtered_climat_data/Rainf_WFDE5_CRU_201901_v2.1.nc',
    'Rainf',
    'Rainf',
    "Rainf over Algeria",
    gdf_algeria,
)

#### NC to CSV

* Open filtered nc file
* turn it into a dataframe
* Drop the nan values
* save it as csv

In [None]:
# Convert every filtered NetCDF file into a CSV table
csv_dir = 'csv_filtered_climat_data'
# `to_csv` is handed the path as-is, so make sure the output folder exists
os.makedirs(csv_dir, exist_ok=True)

# Sorted so the processing order is the same on every run
for doc_name in sorted(os.listdir('filtered_climat_data')):
    # Skip anything that is not a NetCDF file (e.g. hidden or system files)
    if not doc_name.endswith('.nc'):
        continue
    doc_path = os.path.join('filtered_climat_data', doc_name)

    # The variable name is the file-name prefix, e.g. "Rainf_WFDE5_..." -> "Rainf"
    attribute = doc_name.split("_")[0]

    # Close the file handle as soon as the data has been copied into a DataFrame
    with xr.open_dataset(doc_path) as ds:
        df = ds.to_dataframe().reset_index()

    # Drop the rows where the variable is NaN and renumber the remaining rows
    df = df[df[attribute].notna()].reset_index(drop=True)

    # NOTE: everything after the first "." is dropped, so "..._v2.1.nc" becomes
    # "..._v2.csv"; the cell that reads the CSV back relies on this naming
    csv_doc_name = doc_name.split('.')[0]
    df.to_csv(os.path.join(csv_dir, f'{csv_doc_name}.csv'), index=False)


* Plot the data 

In [None]:
# Name of one CSV written by the conversion loop (the original NetCDF file name
# truncated at its first ".", with ".csv" appended)
doc_name = 'Rainf_WFDE5_CRU_201902_v2.csv'
df = pd.read_csv(f'csv_filtered_climat_data/{doc_name}')
# Display the loaded table
df

In [None]:
import numpy as np

# The variable name is the file-name prefix, e.g. "Rainf_WFDE5_..." -> "Rainf"
attribute = doc_name.split("_")[0]

df_data = df[['time', 'lon', 'lat', attribute]]
# Average all rows sharing the same (lat, lon) so each grid cell holds one value
pivoted_data = df_data.pivot_table(index="lat", columns="lon", values=attribute, aggfunc="mean")

# Define the geographic extent of the heatmap (min and max longitude and latitude)
extent = [df_data['lon'].min(), df_data['lon'].max(), df_data['lat'].min(), df_data['lat'].max()]

# Plot the heatmap using imshow with the defined extent
fig, ax = plt.subplots(figsize=(7, 7))
im = ax.imshow(pivoted_data, cmap="coolwarm", extent=extent, origin="lower", aspect="auto")
fig.colorbar(im, ax=ax, label=attribute)

# Plot Algeria boundaries on top of the heatmap.
# `gdf_algeria` is the Algeria GeoDataFrame built in the SHP section; it is
# re-projected to WGS84 so it shares the heatmap's lon/lat coordinates.
gdf_algerie = gdf_algeria.to_crs("EPSG:4326")
gdf_algerie.plot(ax=ax, edgecolor="black", facecolor="none", linewidth=1)

# Add titles and labels on the Axes explicitly
ax.set_title(f"{attribute} Heatmap with Algeria Boundaries")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

plt.show()


### CSV file (Soil)

In [None]:
import pandas as pd

# Load the soil CSV file into a DataFrame
df = pd.read_csv('soil_dz_allprops.csv')

# Display the first 2 rows
df.head(2)

In [None]:
from shapely import wkt

# Reload the CSV so this cell can be re-run safely: the 'geometry' column is
# converted in place below, and converting it a second time would fail
df = pd.read_csv('soil_dz_allprops.csv')

# Convert the WKT strings into geometry objects
df['geometry'] = df['geometry'].apply(wkt.loads)

# Wrap the table in a GeoDataFrame, declaring the coordinates as EPSG:4326
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Draw the map with the "viridis" colormap to match the colours of the
# reference image (alternative: cmap='YlOrBr')
fig, ax = plt.subplots(figsize=(5, 5))
gdf.plot(column='sand % topsoil', cmap='viridis', legend=True,
         legend_kwds={'label': "Sand % Topsoil", 'orientation': "horizontal"},
         ax=ax)

# Customise the figure; labels are set on the Axes explicitly
ax.set_title("Sand Content in Topsoil across Algeria")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()
