In [1]:
import pandas as pd
import json
import geopandas as gpd
from shapely.geometry import shape, MultiPolygon

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
csv_file = 'info_pozos.csv'
df = pd.read_csv(csv_file)

json_file = 'provincia.json'
with open(json_file) as f:
    data_provincias = json.load(f)

provinces_gdf = gpd.GeoDataFrame.from_features(data_provincias['features'])
provinces_gdf = provinces_gdf.set_crs("EPSG:4326")

def geojson_to_geometry(geojson_str):
    try:
        return shape(json.loads(geojson_str))
    except (json.JSONDecodeError, TypeError, ValueError):
        return None

df['geometry'] = df['geojson'].apply(geojson_to_geometry)
geo_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Verificar interseccion de geojson con dataset
result_gdf = gpd.sjoin(geo_gdf, provinces_gdf, how='left', predicate='intersects')

df['provincia_geojson'] = result_gdf['nam']

output_csv_file = 'info_pozos_provincias.csv'
df.to_csv(output_csv_file, index=False)


In [2]:
csv_file = 'info_pozos_provincias.csv'
df = pd.read_csv(csv_file)

def check_provincia(row):
    return row['provincia_geojson'] != row['provincia']

# Aplicar matcheo de ovincias (true)
df['match_check'] = df.apply(check_provincia, axis=1)
mismatch_rows = df[df['match_check'] == True]

output_csv_file = 'provincias_erroneas.csv'
mismatch_rows.to_csv(output_csv_file, index=False)

             provincia provincia_geojson
76               Salta             Jujuy
2792         Rio Negro           Neuquén
2793         Rio Negro           Neuquén
2799         Rio Negro           Neuquén
3181           Mendoza          La Pampa
...                ...               ...
82680  Estado Nacional               NaN
84281  Estado Nacional               NaN
84282              NaN               NaN
84293        Rio Negro         Río Negro
84396  Estado Nacional               NaN

[625 rows x 2 columns]
Mismatched rows saved to provincias_erroneas.csv


In [3]:
# Analisis de resultados
csv_file = 'provincias_erroneas.csv'
df = pd.read_csv(csv_file)

# Reemplazar vacios con ''
df['provincia_geojson'] = df['provincia_geojson'].fillna('').str.strip()

grouped_df = df.groupby(['provincia', 'provincia_geojson']).size().reset_index(name='count')

# Display the resulting DataFrame
print(grouped_df)

           provincia provincia_geojson  count
0             Chubut                      323
1    Estado Nacional                       74
2           La Pampa                        1
3            Mendoza            Chubut      1
4            Mendoza          La Pampa      1
5            Mendoza           Neuquén      1
6            Mendoza          San Luis      1
7            Neuquén                        1
8            Neuquén           Mendoza      2
9            Neuquén         Rio Negro     13
10         Rio Negro                        1
11         Rio Negro          La Pampa     11
12         Rio Negro           Neuquén      3
13         Rio Negro         Río Negro      1
14             Salta             Jujuy      1
15        Santa Cruz                       66
16  Tierra del Fuego                      123
