# OSS4GEO Scripts for semantic search and topic clustering

# https://github.com/do-me/oss4geo-semantic

## Mine GitHub Repo information

In [None]:
!pip install pandarallel pandas polars tqdm requests FlagEmbedding semantic_text_splitter

In [1]:
# Get your Github token and add here 

github_token = "YOUR_TOKEN"

## Download script

Note that you cannot simply use the main branch always as some legacy projects still use master or other so we need to query the API and get that infomrmation first. After, we can query the readme from the right branch.

In [2]:
import requests
import pandas as pd

def download_github_readme(repo, github_token=None):
    """
    Downloads the README file from a GitHub repository.

    Parameters:
        repo (str): GitHub repository in the format 'owner/repo_name'.
        github_token (str, optional): GitHub Personal Access Token for authentication (optional but recommended for higher rate limits).

    Returns:
        str: README content or an error message.
    """
    headers = {}
    if github_token:
        headers = {'Authorization': f'token {github_token}'}
    
    try:
        owner, repo_name = repo.split('/')
        url = f'https://api.github.com/repos/{owner}/{repo_name}/readme'
        
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            try:
                readme_info = response.json()
                download_url = readme_info['download_url']
                
                readme_content = requests.get(download_url).text
                return readme_content
            except Exception as e:
                print(f"Error processing README for {repo}: {e}")
                return ""
        else:
            print(f"Error: {response.status_code} - {response.json().get('message', 'Unknown error')} for {repo}")
            return ""
    
    except Exception as e:
        print(f"Error parsing repository {repo}: {e}")
        return ""

# Example usage with pandas, tests
repos = pd.Series([
    "torvalds/linux",
    "microsoft/vscode",
])

# Apply the function to the pandas Series
readmes = repos.apply(download_github_readme, github_token=github_token)
readmes


1    # Visual Studio Code - Open Source ("Code - OS...
dtype: object

# Manually adding the repos here that are logged to the console when opening https://www.oss4geo.org/

Could also scrape it from the webpage of course.

In [3]:
import pandas as pd 
from tqdm import tqdm 
tqdm.pandas()
from pandarallel import pandarallel

# mining the repos in parallel with all available cores (1 worker per cpu, works with GitHub token, does not without auth)
pandarallel.initialize(progress_bar=True)

# Example usage
repos = [
    "3dcitydb/3dcitydb",
    "3liz/lizmap-web-client",
    "52North/helgoland",
    "52North/IlwisCore",
    "52North/SOS",
    "actinia-org/actinia-core",
    "ahhz/raster",
    "airbusgeo/cogger",
    "Anagraph/yogrt",
    "apache/calcite",
    "apache/doris",
    "apache/echarts",
    "apache/incubator-baremaps",
    "apache/sedona",
    "apache/sis",
    "apache/superset",
    "appelmar/gdalcubes",
    "arangodb/arangodb",
    "aseldawy/spatialhadoop2",
    "axismaps/colorbrewer",
    "azavea/raster-vision",
    "birgander2/PyRAT",
    "bjornd/jvectormap",
    "bjornharrtell/jsts",
    "boostorg/geometry",
    "cambecc/earth",
    "cambecc/grib2json",
    "CartoDB/bigmetadata",
    "CartoDB/carto.js",
    "CartoDB/cartodb",
    "CartoDB/cartodb-postgresql",
    "CartoDB/cartoframes",
    "cartographer-project/cartographer",
    "cartographer-project/cartographer_ros",
    "CCI-Tools/cate-desktop",
    "CesiumGS/cesium",
    "cga-harvard/geonode",
    "CGAL/cgal",
    "cgre-aachen/gemgis",
    "cgre-aachen/gempy",
    "chelm/mbtiles-server",
    "chrismattmann/lucene-geo-gazetteer",
    "citusdata/citus",
    "ckan/ckan",
    "commaai/laika",
    "connormanning/entwine",
    "CosmiQ/solaris",
    "CosmiQ/yolt",
    "cran/rgdal",
    "d3/d3",
    "DanBloomberg/leptonica",
    "davidbyttow/govips",
    "davidfrantz / force",
    "dcherian/ncview",
    "deegree/deegree3",
    "developmentseed/geolambda",
    "developmentseed/label-maker",
    "developmentseed/landsat-util",
    "developmentseed/lonboard",
    "developmentseed/morecantile",
    "developmentseed/sentinel-util",
    "developmentseed/skynet-train",
    "developmentseed/tifeatures",
    "developmentseed/timvt",
    "developmentseed/tipg",
    "dmsl/anyplace",
    "dnchayes/MB-System",
    "DOI-USGS/ghsc-esi-shakemap",
    "domlysz/BlenderGIS",
    "drolbr/Overpass-API",
    "dtarb/TauDEM",
    "dwins/geoscript.scala",
    "dwins/mapnik2geotools",
    "e-sensing/sits",
    "ecmwf/eccodes",
    "eco-hydro/TIMESAT.R",
    "elastic/elasticsearch",
    "emeeks/d3-carto-map",
    "eoxc/eoxc",
    "EOxServer/eoxserver",
    "Esri/cedar",
    "Esri/geoportal-server-catalog",
    "Esri/wind-js",
    "eurostat/eurostat-map.js",
    "eurostat/GridMaker",
    "eurostat/JGiscoTools",
    "Factual/geo",
    "fatiando/verde",
    "fegyi001/mangol",
    "FreeCAD/FreeCAD",
    "Gaia3D/mago3d",
    "GenericMappingTools/gmt",
    "GeoDaCenter/geoda",
    "GeoDaCenter/GeoDaSpace",
    "Geodan/mapbox-3dtiles",
    "Geodan/pg2b3dm",
    "geoext/geoext",
    "geographika/mappyfile",
    "geoharmonizer_inea/eumap",
    "GeoJSON-Net/GeoJSON.Net",
    "geoman-io/leaflet-geoman",
    "geomapfish/getting_started",
    "Geomatys/examind-community",
    "geonetwork/core-geonetwork",
    "GeoNode/geonode",
    "geopandas/geopandas",
    "geopaparazzi/geopaparazzi",
    "geopaparazzi/smash",
    "geopython/GeoHealthCheck",
    "geopython/mapslicer",
    "geopython/OWSLib",
    "geopython/pycsw",
    "geopython/pygeometa",
    "geopython/pywps",
    "georchestra/georchestra",
    "geoscript/geoscript-js",
    "geoserver/geofence",
    "geoserver/geoserver",
    "geoserver/geoserver-cloud",
    "geosolutions-it/geostore",
    "geosolutions-it/imageio-ext",
    "geosolutions-it/MapStore2",
    "GeoStat-Framework/GSTools",
    "GeoStat-Framework/PyKrige",
    "GeostatsGuy/GeostatsPy",
    "GeoTIFF/geoblaze",
    "GeoTIFF/geotiff.io",
    "geotiffjs/geotiff.js",
    "geotools/geotools",
    "GeoWebCache/geowebcache",
    "GeoWebCache/gwc-release",
    "Geoyi/pixel-decoder",
    "gina-alaska/dans-gdal-scripts",
    "girder/girder",
    "GIScience/openrouteservice",
    "gisgraphy/gisgraphy",
    "gisquick/gisquick",
    "gltn/stdm",
    "gnss-sdr/gnss-sdr",
    "google/earthenterprise",
    "graphhopper/graphhopper",
    "grimzy/laravel-mysql-spatial",
    "gwaldron/osgearth",
    "gzuidhof/zarr.js",
    "halestudio/hale",
    "harsha2010/magellan",
    "HDFGroup/hdf5",
    "heremaps/harp.gl",
    "hms-dbmi/viv",
    "hslayers/hslayers-ng",
    "huidian200803/vterrain-VTP-vterrain.org",
    "Image-Py/imagepy",
    "ImageMagick/ImageMagick",
    "InsightSoftwareConsortium/ITK",
    "iTowns/itowns",
    "JamesLMilner/terra-draw",
    "jblindsay/whitebox-tools",
    "jjrom/itag",
    "jjrom/resto",
    "JuliaGeo/GeoInterface.jl",
    "kadas-albireo/kadas-albireo",
    "Kitware/ParaView",
    "Kitware/vtk-js",
    "KNMI/adaguc-server",
    "KNMI/adaguc-viewer",
    "komoot/photon",
    "koopjs/FeatureServer",
    "koopjs/koop",
    "LAStools/LAStools",
    "Leaflet/Leaflet",
    "libgeos/geos",
    "libLAS/libLAS",
    "LibreCAD/LibreCAD",
    "libspatialindex/libspatialindex",
    "libvips/libvips",
    "locationtech/geogig",
    "locationtech/geomesa",
    "locationtech/geotrellis",
    "locationtech/geowave",
    "locationtech/jts",
    "locationtech/udig-platform",
    "lucadelu/pyModis",
    "mapbox/geojson.io",
    "mapbox/Hecate",
    "mapbox/mapbox-gl-js",
    "mapbox/mapbox-gl-native",
    "mapbox/mapboxgl-jupyter",
    "mapbox/node-mbtiles",
    "mapbox/osmcha-frontend",
    "mapbox/robosat",
    "mapbox/tile-reduce",
    "mapbox/tilelive",
    "mapcentia/geocloud2",
    "mapfish/mapfish-print",
    "mapillary/mapillary-js",
    "mapillary/OpenSfM",
    "maplibre/maplibre-gl-js",
    "maplibre/maplibre-native",
    "maplibre/martin",
    "mapmint/mapmint",
    "mapnik/mapnik",
    "mapproxy/mapproxy",
    "maproulette/maproulette3",
    "MapServer/mapcache",
    "MapServer/MapServer",
    "MapServer/tinyows",
    "maptalks/maptalks.js",
    "maputnik/editor",
    "MapWindow/MapWindow5",
    "MatanYadaev/laravel-eloquent-spatial",
    "matplotlib/matplotlib",
    "mbloch/mapshaper",
    "melowntech/vts-browser-js",
    "MerginMaps/mergin",
    "microsoft/torchgeo",
    "MITK/MITK",
    "mkeller3/FastGeospatial",
    "mmaelicke/scikit-gstat",
    "MobilityDB/MobilityDB",
    "MODFLOW-USGS/modflow6",
    "mojodna/tessera",
    "mongodb/mongo",
    "movingpandas/movingpandas",
    "mrdoob/three.js",
    "mundialis / sadasadam",
    "nasa-gibs/onearth",
    "nasa-gibs/worldview",
    "nasa-jpl-memex/GeoParser",
    "nasa/cumulus-dashboard",
    "neocarto/bertin",
    "NetTopologySuite/NetTopologySuite",
    "neveldo/jQuery-Mapael",
    "nextgis/android_gisapp",
    "nextgis/android_maplib",
    "ngageoint/hootenanny",
    "ngageoint/mrgeo",
    "nsidc/earthaccess",
    "opencv/opencv",
    "OpenDroneMap/ODM",
    "OpenDroneMap/WebODM",
    "OpendTect/OpendTect",
    "OpenGeoscience/geojs",
    "OpenGeoscience/geonotebook",
    "OpenGeoVis/PVGeo",
    "opengisch/QField",
    "openglobus/openglobus",
    "openjump-gis/openjump",
    "openlayers/openlayers",
    "OpenOrienteering/mapper",
    "openscenegraph/OpenSceneGraph",
    "openstreetmap/iD",
    "openstreetmap/osm2pgsql",
    "openvenues/libpostal",
    "orbisgis/h2gis",
    "orfeotoolbox/OTB",
    "orientechnologies/orientdb",
    "OSGeo/gdal",
    "OSGeo/grass",
    "OSGeo/libgeotiff",
    "OSGeo/PROJ",
    "Oslandia/lopocs",
    "Oslandia/SFCGAL",
    "osm-search/Nominatim",
    "OSMBuildings/OSMBuildings",
    "osmcode/pyosmium",
    "ossimlabs/ossim",
    "PDAL/PDAL",
    "pelias/pelias",
    "pemn/vtk_triangulate_points",
    "pghydro/pghydro",
    "pgpointcloud/pointcloud",
    "pgRouting/osm2pgrouting",
    "pgRouting/pgrouting",
    "phayes/geoPHP",
    "pka/qgpkg",
    "placemark/placemark",
    "postgis/postgis",
    "potree/potree",
    "proj4js/proj4js",
    "Project-OSRM/osrm-backend",
    "py3dtiles/py3dtiles",
    "pyproj4/pyproj",
    "pysal/mapclassify",
    "pysal/pysal",
    "pysal/segregation",
    "pysal/spopt",
    "pysal/tobler",
    "qcad/qcad",
    "qgis/QGIS",
    "qgis/qwc2",
    "questdb/questdb",
    "radiantearth/stac-api-spec",
    "radiantearth/stac-browser",
    "rafaqz/Rasters.jl",
    "ranghetti/sen2r",
    "rasterio/rasterio",
    "Reading-eScience-Centre/edal-java",
    "Reading-eScience-Centre/ncwms",
    "rhansson/geotuple",
    "riatelab/magrit",
    "ROGUE-JCTD/Arbiter-Android",
    "ROGUE-JCTD/MapLoom",
    "ROGUE-JCTD/rogue_geonode",
    "ropensci/MODIStsp",
    "rspatial/raster",
    "rstudio/rstudio",
    "rstudio/shiny",
    "saga-gis",
    "sahana/eden",
    "sat-utils/sat-api",
    "schollz/find",
    "scikit-image/scikit-image",
    "SciTools/cartopy",
    "SciTools/iris",
    "sduclos/S52",
    "semiautomaticgit/SemiAutomaticClassificationPlugin",
    "senbox-org/s1tbx",
    "sentinel-hub/eo-learn",
    "sentinel-hub/sentinelhub-py",
    "sentinelsat/sentinelsat",
    "shapely/shapely",
    "SharpMap/SharpMap",
    "simplegeo/polymaps",
    "sldeditor/sldeditor",
    "Slicer/Slicer",
    "sogelink-research/ctod",
    "stac-utils/pgstac",
    "stac-utils/stac-fastapi",
    "stamen/modestmaps-js",
    "STEMLab/InFactory",
    "SuperElastix/elastix",
    "t-rex-tileserver/t-rex",
    "tangrams/tangram",
    "terrestris/shogun-core",
    "TerriaJS/terriajs",
    "tesseract-ocr/tesseract",
    "tidwall/tile38",
    "TileDB-Inc/TileDB",
    "tilemill-project/tilemill",
    "TileStache/TileStache",
    "timescale/timescaledb",
    "TNOCS/csWeb-tile",
    "Toblerity/Fiona",
    "Toblerity/rtree",
    "tomojitakasu/RTKLIB",
    "topojson/topojson",
    "Turfjs/turf",
    "tyrasd/overpass-turbo",
    "uclouvain/openjpeg",
    "UDST/vizicities",
    "umap-project/umap",
    "Unidata/IDV",
    "Unidata/netcdf-c",
    "Unidata/netcdf-java",
    "Unidata/thredds",
    "valhalla/valhalla",
    "vega/vega",
    "verma/plasio",
    "visgl/deck.gl",
    "VROOM-Project/vroom",
    "webglearth/webglearth2",
    "whamlyn/auralib",
    "yaph/d3-geomap",
    "yeesian/ArchGDAL.jl",
    "zombodb/zombodb",
    "ZOO-Project/ZOO-Project"
]

df = pd.DataFrame(repos,columns=["Repo"])
df["Readme"] = df["Repo"].parallel_apply(download_github_readme, github_token=github_token)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23), Label(value='0 / 23'))), HBox…

Error: 404 - Not Found for cran/rgdal
Error: 404 - Not Found for py3dtiles/py3dtiles
Error: 404 - Not Found for davidfrantz / force
Error: 404 - Not Found for geoharmonizer_inea/eumap
Error parsing repository saga-gis: not enough values to unpack (expected 2, got 1)
Error: 404 - Not Found for mundialis / sadasadam


In [5]:
df["Readme_len"] = df["Readme"].apply(lambda x: len(x))
df.to_parquet("readmes.parquet")

## Get the erroneous repos, is it errors or are they on GitLab maybe?

In [6]:
df[df["Readme_len"] == 0]

Unnamed: 0,Repo,Readme,Readme_len
48,cran/rgdal,,0
52,davidfrantz / force,,0
98,geoharmonizer_inea/eumap,,0
224,mundialis / sadasadam,,0
280,py3dtiles/py3dtiles,,0
307,saga-gis,,0


In [7]:
# remove erroneous ones
df = df[df["Readme_len"] != 0]

## Some repos have huge descriptions others have no description

In [9]:
df = df.sort_values("Readme_len")
df

Unnamed: 0,Repo,Readme,Readme_len
90,Gaia3D/mago3d,# mago3d,8
69,drolbr/Overpass-API,Please see\nhttp://wiki.osm.org/wiki/Overpass_...,81
160,kadas-albireo/kadas-albireo,**This application is obsolete! Please use [KA...,121
294,ranghetti/sen2r,# sen2r End Of Life\n\nsen2r will be retired f...,247
129,GeoWebCache/geowebcache,# geowebcache\n\n[![Gitter](https://badges.git...,321
...,...,...,...
252,openvenues/libpostal,# libpostal: international street address NLP\...,46123
183,mapbox/Hecate,<h1 align='center'>Hecate</h1>\n\n<p align=cen...,47795
290,questdb/questdb,"<div align=""center"">\n <a href=""https://quest...",52918
229,neocarto/bertin,[![logo](img/logo_small.png)](https://observab...,77411


## The best text splitter around for good chunk quality

In [10]:
from semantic_text_splitter import TextSplitter
from pandarallel import pandarallel

#pandarallel.initialize(progress_bar=True)

splitter = TextSplitter((4000,8000)) # 4k-8k chars -> 1-4ktokens, 8k max tokens for bge-m3

def wrap_func(text):
    return splitter.chunks(text)

df["chunks"] = df["Readme"].parallel_apply(wrap_func) 

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23), Label(value='0 / 23'))), HBox…

In [13]:
df["chunks_len"] = df["chunks"].apply(lambda x: len(x))
df

Unnamed: 0,Repo,Readme,Readme_len,chunks,chunks_len
90,Gaia3D/mago3d,# mago3d,8,[# mago3d],1
69,drolbr/Overpass-API,Please see\nhttp://wiki.osm.org/wiki/Overpass_...,81,[Please see\nhttp://wiki.osm.org/wiki/Overpass...,1
160,kadas-albireo/kadas-albireo,**This application is obsolete! Please use [KA...,121,[**This application is obsolete! Please use [K...,1
294,ranghetti/sen2r,# sen2r End Of Life\n\nsen2r will be retired f...,247,[# sen2r End Of Life\n\nsen2r will be retired ...,1
129,GeoWebCache/geowebcache,# geowebcache\n\n[![Gitter](https://badges.git...,321,[# geowebcache\n\n[![Gitter](https://badges.gi...,1
...,...,...,...,...,...
252,openvenues/libpostal,# libpostal: international street address NLP\...,46123,[# libpostal: international street address NLP...,11
183,mapbox/Hecate,<h1 align='center'>Hecate</h1>\n\n<p align=cen...,47795,[<h1 align='center'>Hecate</h1>\n\n<p align=ce...,9
290,questdb/questdb,"<div align=""center"">\n <a href=""https://quest...",52918,"[<div align=""center"">\n <a href=""https://ques...",13
229,neocarto/bertin,[![logo](img/logo_small.png)](https://observab...,77411,[[![logo](img/logo_small.png)](https://observa...,19


In [12]:
# Test export
df[["Repo","Readme"]].to_excel("Readmes.xlsx")

# Create embeddings 

bge-m3 is stil amongst the best multilingual embedding models out there (even though there are better, see here https://huggingface.co/spaces/mteb/leaderboard). I choose it because you could load the exact same model in the frontent and create static web apps like these hosted on GitHub pages. 

- https://do-me.github.io/SDG-Analyzer/
- https://do-me.github.io/copernicus-services-semantic-search/
- https://do-me.github.io/SemanticFinder/

In [14]:
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [15]:
df["embs"] = df["chunks"].progress_apply(lambda x: model.encode(x, batch_size=64)['dense_vecs']) # takes 6 mins on my M3 Max but should also work fairly fast on CPU-only devices
df

100%|██████████| 360/360 [06:32<00:00,  1.09s/it]


Unnamed: 0,Repo,Readme,Readme_len,chunks,chunks_len,embs
90,Gaia3D/mago3d,# mago3d,8,[# mago3d],1,"[[-0.006393, -0.010506, -0.01356, -0.06015, -0..."
69,drolbr/Overpass-API,Please see\nhttp://wiki.osm.org/wiki/Overpass_...,81,[Please see\nhttp://wiki.osm.org/wiki/Overpass...,1,"[[-0.004932, -0.00886, -0.02988, -0.00218, -0...."
160,kadas-albireo/kadas-albireo,**This application is obsolete! Please use [KA...,121,[**This application is obsolete! Please use [K...,1,"[[-0.000566, 0.01485, -0.065, -0.03055, -0.039..."
294,ranghetti/sen2r,# sen2r End Of Life\n\nsen2r will be retired f...,247,[# sen2r End Of Life\n\nsen2r will be retired ...,1,"[[-0.04718, 0.003744, 0.004616, -0.02188, -0.0..."
129,GeoWebCache/geowebcache,# geowebcache\n\n[![Gitter](https://badges.git...,321,[# geowebcache\n\n[![Gitter](https://badges.gi...,1,"[[-0.0518, 0.00799, -0.0359, 0.003271, -0.0064..."
...,...,...,...,...,...,...
252,openvenues/libpostal,# libpostal: international street address NLP\...,46123,[# libpostal: international street address NLP...,11,"[[-0.0818, -0.004604, -0.01075, 0.02449, 0.017..."
183,mapbox/Hecate,<h1 align='center'>Hecate</h1>\n\n<p align=cen...,47795,[<h1 align='center'>Hecate</h1>\n\n<p align=ce...,9,"[[-0.04398, 0.02171, -0.05487, 0.04868, 0.0091..."
290,questdb/questdb,"<div align=""center"">\n <a href=""https://quest...",52918,"[<div align=""center"">\n <a href=""https://ques...",13,"[[-0.04514, -0.03952, -0.0709, -0.007202, -0.0..."
229,neocarto/bertin,[![logo](img/logo_small.png)](https://observab...,77411,[[![logo](img/logo_small.png)](https://observa...,19,"[[-0.06647, -0.007717, -0.02762, 0.007065, 0.0..."


In [16]:
df.to_parquet("readmes.parquet")

ArrowInvalid: ('Can only convert 1-dimensional array values', 'Conversion failed for column embs with type object')

## My custom functions for pickling arrays of arrays in a column, not needed if you don't want to persist all chunk embeddings

In [4]:
import pandas as pd
import pickle
import polars as pl

def write_pd_pickle(df, filename, pickle_cols=None):
    """
    Writes a pandas DataFrame to a Parquet file, pickling specified columns.
    The function takes a DataFrame and pickles the specified columns before saving
    the DataFrame to a Parquet file. This is useful for saving columns that contain
    data types that Parquet might not natively support, such as lists or dictionaries.
    Parameters:
    - df: pandas DataFrame to be written.
    - filename: the name of the output Parquet file.
    - pickle_cols: a list of column names in the DataFrame to be pickled.
    Returns:
    - A list of pickled column names.
    Raises:
    - ValueError: if pickle_cols is not provided or a column listed in pickle_cols is not in the DataFrame.
    Example:
    >>> df_auto = pd.DataFrame({'test_list': [[1, 2], [3, 4]], 'list_of_lists': [[[1], [2]], [[3], [4]]]})
    >>> write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])
    ['test_list', 'list_of_lists']
    """
    if pickle_cols is None:
        raise ValueError("pickle_cols must be a list of column names.")

    df_to_write = df.copy()

    # Pickle specified columns
    for col in pickle_cols:
        if col not in df_to_write.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")
        df_to_write[col] = df_to_write[col].apply(pickle.dumps)

    # Write DataFrame to Parquet file
    df_to_write.to_parquet(filename, index=False)
    
    return pickle_cols

  # write_pd_pickle(df_auto, "test.parquet", ["test_list","list_of_lists"]

 
  
def is_pickled(column_sample):
    try:
        pickle.loads(column_sample)
        return True
    except Exception:
        return False

def read_pd_unpickle(filename, unpickle_cols=None):
    """
    Reads a Parquet file into a pandas DataFrame, unpickling specified columns.
    The function reads a Parquet file into a DataFrame and unpickles specified columns.
    If unpickle_cols is not provided, it will attempt to auto-detect pickled columns.
    Parameters:
    - filename: the name of the Parquet file to read.
    - unpickle_cols: a list of column names to be unpickled. If None, auto-detection is used.
    Returns:
    - A pandas DataFrame with specified columns unpickled.
    Raises:
    - ValueError: if unpickle_cols is not a list or a column listed in unpickle_cols is not in the DataFrame.
    Examples:
    Auto-detection mode:
    >>> df_auto = read_pd_unpickle("test.parquet")
    >>> df_auto
      test_list list_of_lists
    0    [1, 2]       [[1], [2]]
    1    [3, 4]       [[3], [4]]
    
    Manual mode, specifying columns to unpickle:
    >>> df_manual = read_pd_unpickle("test.parquet", unpickle_cols=["test_list", "list_of_lists"])
    >>> df_manual
      test_list list_of_lists
    0    [1, 2]       [[1], [2]]
    1    [3, 4]       [[3], [4]]
    
    Quick check if the two DataFrames are the same:
    >>> str(df_auto) == str(df_manual)
    True
    """
    df = pl.read_parquet(filename).to_pandas() # using polars as pandas has some bugs with nested structures
    
    if unpickle_cols is None:
        # Auto-detection mode
        unpickle_cols = []
        for col in df.columns:
            non_null_items = df[col].dropna()
            if len(non_null_items) > 0:
                sample = non_null_items.iloc[0]
                if is_pickled(sample):
                    unpickle_cols.append(col)
    elif not isinstance(unpickle_cols, list):
        raise ValueError("unpickle_cols must be a list of column names.")
    
    # Unpickle specified columns
    for col in unpickle_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: pickle.loads(x) if pd.notnull(x) else x)
        else:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")
    
    return df


In [5]:
#write_pd_pickle(df, "Readme_embs.parquet", pickle_cols=["embs"])

In [6]:
import polars as pl
df = read_pd_unpickle("Readme_embs.parquet")#, pickle_cols=["embs"])
df

Unnamed: 0,Repo,Readme,Readme_len,chunks,chunks_len,embs
0,Gaia3D/mago3d,# mago3d,8,[# mago3d],1,"[[-0.006393, -0.010506, -0.01356, -0.06015, -0..."
1,drolbr/Overpass-API,Please see\nhttp://wiki.osm.org/wiki/Overpass_...,81,[Please see\nhttp://wiki.osm.org/wiki/Overpass...,1,"[[-0.004932, -0.00886, -0.02988, -0.00218, -0...."
2,kadas-albireo/kadas-albireo,**This application is obsolete! Please use [KA...,121,[**This application is obsolete! Please use [K...,1,"[[-0.000566, 0.01485, -0.065, -0.03055, -0.039..."
3,ranghetti/sen2r,# sen2r End Of Life\n\nsen2r will be retired f...,247,[# sen2r End Of Life\n\nsen2r will be retired ...,1,"[[-0.04718, 0.003744, 0.004616, -0.02188, -0.0..."
4,GeoWebCache/geowebcache,# geowebcache\n\n[![Gitter](https://badges.git...,321,[# geowebcache\n\n[![Gitter](https://badges.gi...,1,"[[-0.0518, 0.00799, -0.0359, 0.003271, -0.0064..."
...,...,...,...,...,...,...
355,openvenues/libpostal,# libpostal: international street address NLP\...,46123,[# libpostal: international street address NLP...,11,"[[-0.0818, -0.004604, -0.01075, 0.02449, 0.017..."
356,mapbox/Hecate,<h1 align='center'>Hecate</h1>\n\n<p align=cen...,47795,[<h1 align='center'>Hecate</h1>\n\n<p align=ce...,9,"[[-0.04398, 0.02171, -0.05487, 0.04868, 0.0091..."
357,questdb/questdb,"<div align=""center"">\n <a href=""https://quest...",52918,"[<div align=""center"">\n <a href=""https://ques...",13,"[[-0.04514, -0.03952, -0.0709, -0.007202, -0.0..."
358,neocarto/bertin,[![logo](img/logo_small.png)](https://observab...,77411,[[![logo](img/logo_small.png)](https://observa...,19,"[[-0.06647, -0.007717, -0.02762, 0.007065, 0.0..."


## Averaging all chunk embeddings to obtain one averaged ebedding per repo

In [7]:
import numpy as np
def average_embs(embs):

    # Convert the list of embeddings to a NumPy array and change its dtype to float32
    embeddings_array = np.array(embs).astype('float32')
    
    # Calculate the mean vector
    mean_vector = np.mean(embeddings_array, axis=0)
    
    return mean_vector

df["mean_embs"] = df["embs"].apply(lambda x: average_embs(x))
df

Unnamed: 0,Repo,Readme,Readme_len,chunks,chunks_len,embs,mean_embs
0,Gaia3D/mago3d,# mago3d,8,[# mago3d],1,"[[-0.006393, -0.010506, -0.01356, -0.06015, -0...","[-0.0063934326, -0.010505676, -0.013557434, -0..."
1,drolbr/Overpass-API,Please see\nhttp://wiki.osm.org/wiki/Overpass_...,81,[Please see\nhttp://wiki.osm.org/wiki/Overpass...,1,"[[-0.004932, -0.00886, -0.02988, -0.00218, -0....","[-0.0049324036, -0.008857727, -0.029876709, -0..."
2,kadas-albireo/kadas-albireo,**This application is obsolete! Please use [KA...,121,[**This application is obsolete! Please use [K...,1,"[[-0.000566, 0.01485, -0.065, -0.03055, -0.039...","[-0.0005660057, 0.014846802, -0.06500244, -0.0..."
3,ranghetti/sen2r,# sen2r End Of Life\n\nsen2r will be retired f...,247,[# sen2r End Of Life\n\nsen2r will be retired ...,1,"[[-0.04718, 0.003744, 0.004616, -0.02188, -0.0...","[-0.047180176, 0.0037441254, 0.0046157837, -0...."
4,GeoWebCache/geowebcache,# geowebcache\n\n[![Gitter](https://badges.git...,321,[# geowebcache\n\n[![Gitter](https://badges.gi...,1,"[[-0.0518, 0.00799, -0.0359, 0.003271, -0.0064...","[-0.05178833, 0.007987976, -0.035888672, 0.003..."
...,...,...,...,...,...,...,...
355,openvenues/libpostal,# libpostal: international street address NLP\...,46123,[# libpostal: international street address NLP...,11,"[[-0.0818, -0.004604, -0.01075, 0.02449, 0.017...","[-0.069654986, -0.0029153607, -0.023781776, 0...."
356,mapbox/Hecate,<h1 align='center'>Hecate</h1>\n\n<p align=cen...,47795,[<h1 align='center'>Hecate</h1>\n\n<p align=ce...,9,"[[-0.04398, 0.02171, -0.05487, 0.04868, 0.0091...","[-0.038349576, 0.0061443117, -0.03994836, 0.02..."
357,questdb/questdb,"<div align=""center"">\n <a href=""https://quest...",52918,"[<div align=""center"">\n <a href=""https://ques...",13,"[[-0.04514, -0.03952, -0.0709, -0.007202, -0.0...","[-0.040005025, -0.012131765, -0.03804838, -0.0..."
358,neocarto/bertin,[![logo](img/logo_small.png)](https://observab...,77411,[[![logo](img/logo_small.png)](https://observa...,19,"[[-0.06647, -0.007717, -0.02762, 0.007065, 0.0...","[-0.06993826, -0.014848056, -0.02876603, 0.014..."


In [8]:
from numpy import dot
from numpy.linalg import norm

def cosine_distance(list1, list2):
    # Ensure the input lists have the same length
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length")
    
    # Compute cosine similarity
    cos_similarity = dot(list1, list2) / (norm(list1) * norm(list2))
    
    # Cosine distance is 1 - cosine similarity
    cos_distance = 1 - cos_similarity
    
    return cos_distance


# Run some test semantic search 

Might not work very well as the data is quite dirty (special chars, lots of code etc. see the GitHub readme for improvement strategies)

In [10]:
USER_QUERY = "A performant raster processing framework" #a performant desktop gis software for processing large geospatial datasets"
query_embs = model.encode(USER_QUERY, batch_size=64)['dense_vecs'].tolist()

NameError: name 'model' is not defined

In [39]:
df["similarity"] = df["mean_embs"].apply(lambda x: cosine_distance(x,query_embs))
df.sort_values("similarity", ascending=False)

Unnamed: 0,Repo,Readme,Readme_len,chunks,chunks_len,embs,mean_embs,similarity
294,ranghetti/sen2r,# sen2r End Of Life\n\nsen2r will be retired f...,247,[# sen2r End Of Life\n\nsen2r will be retired ...,1,"[[-0.04718, 0.003744, 0.004616, -0.02188, -0.0...","[-0.047180176, 0.0037441254, 0.0046157837, -0....",0.729533
78,eoxc/eoxc,# EOxC client framework\n\nThe full documentat...,697,[# EOxC client framework\n\nThe full documenta...,1,"[[-0.05774, -0.013535, -0.0333, 0.002598, -0.0...","[-0.057739258, -0.013534546, -0.033294678, 0.0...",0.691178
248,OpenOrienteering/mapper,# OpenOrienteering Mapper\n\n![Mapper Screensh...,1842,[# OpenOrienteering Mapper\n\n![Mapper Screens...,1,"[[-0.04678, -0.02916, -0.05716, -0.01528, 0.00...","[-0.046783447, -0.029159546, -0.057159424, -0....",0.690257
308,sahana/eden,# Sahana Eden\r\n\r\nSahana Eden is an Emergen...,818,[# Sahana Eden\r\n\r\nSahana Eden is an Emerge...,1,"[[-0.01949, -0.00813, -0.03235, -0.0466, -0.04...","[-0.019485474, -0.008132935, -0.032348633, -0....",0.682721
40,chelm/mbtiles-server,mbtiles-server\n==============\n\nWow. It's re...,366,[mbtiles-server\n==============\n\nWow. It's r...,1,"[[-0.03894, 0.002981, 0.002565, 0.0276, 0.0004...","[-0.03894043, 0.002981186, 0.002565384, 0.0276...",0.682104
...,...,...,...,...,...,...,...,...
244,opengisch/QField,[![Read the Docs](https://img.shields.io/badge...,3731,[[![Read the Docs](https://img.shields.io/badg...,1,"[[-0.0708, 0.00784, -0.0674, -0.0647, -0.03397...","[-0.07080078, 0.007843018, -0.06738281, -0.064...",0.478474
51,davidbyttow/govips,"# <img src=""https://raw.githubusercontent.com/...",4284,"[# <img src=""https://raw.githubusercontent.com...",2,"[[-0.0712, -0.00569, -0.04672, 0.01567, -0.016...","[-0.06359863, -0.009773254, -0.037017822, 0.00...",0.478294
1,3liz/lizmap-web-client,"# [![logo](icon.png ""3Liz"")][3liz]Lizmap Web A...",4756,"[# [![logo](icon.png ""3Liz"")][3liz]Lizmap Web ...",2,"[[-0.0739, 0.01813, -0.03186, -0.00068, -0.002...","[-0.07107544, 0.02129364, -0.04598999, -0.0222...",0.455781
274,pka/qgpkg,qgpkg\n========\n\nIntroduction\n------------\...,4377,[qgpkg\n========\n\nIntroduction\n------------...,2,"[[-0.07043, 0.006348, -0.05615, -0.00705, 0.01...","[-0.0826416, -0.0054244995, -0.035762787, -0.0...",0.404193


## Reducing the n-dimensional embedding to 2D for simple scatterplots

In [11]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

# Assuming your DataFrame is named 'df' and the embeddings are in the 'mean_embs' column
# Convert the 'mean_embs' column to a 2D NumPy array
embeddings = np.array(df['mean_embs'].tolist())  # Convert list of lists to a 2D array

# Initialize t-SNE with 2 dimensions
tsne = TSNE(n_components=2, random_state=42)

# Reduce embeddings to 2 dimensions
embeddings_2d = tsne.fit_transform(embeddings)

# Write the 2D t-SNE results back to the DataFrame
df['x_tsne'] = embeddings_2d[:, 0]  # First dimension (x)
df['y_tsne'] = embeddings_2d[:, 1]  # Second dimension (y)

# Now, use Plotly to create an interactive scatter plot with tooltips
# We can display additional columns (e.g., 'similarity') on hover
fig = px.scatter(
    df, 
    x='x_tsne', 
    y='y_tsne', 
    hover_data=['Repo'],  # Display 'similarity' column in the tooltip
    title='t-SNE 2D Visualization with Tooltips'#,
    #labels={'x_tsne': 'Dimension 1 (x)', 'y_tsne': 'Dimension 2 (y)'}
    #height=1000,  # Set custom height (in pixels)
    #width=900 
)

# Show the figure with interactive tooltips
fig.show()


In [12]:
import plotly.io as pio
# Export the figure as a standalone HTML file
pio.write_html(fig, file='index.html', auto_open=True)


## Alternative for plotly

In [51]:
import jscatter
jscatter.plot(data=df, x='x_tsne', y='y_tsne')

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…