In [1]:
import pandas as pd
import shapefile as shp
import cartopy.io.shapereader as shpreader

In [2]:
# Watermark is not required for this code, but is included for information. 
import watermark
%load_ext watermark
%watermark -a "ELEANOR LUTZ" -d -v -iv -m

shapefile 2.0.1
watermark 1.8.1
cartopy   0.17.0
pandas    0.23.4
ELEANOR LUTZ 2019-08-24 

CPython 3.7.1
IPython 7.2.0

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 63 Stepping 2, GenuineIntel
CPU cores  : 12
interpreter: 64bit


In [3]:
# Labels used by the USGS to demark each of the different datasets
datasets = ['I-0703', 'I-0948', 'I-1034', 'I-1047', 'I-1062', 'I-1162']

In [4]:
# Create a master dataframe containing all geologic unit descriptions and symbols
# This dataframe is used to assign colors for each geologic unit. 
totaldf = pd.DataFrame()

for s in datasets:
    s2 = s.replace('-','_')
    fname = "A:/gitrepos/geology_atlas_of_space/data/Lunar_Geologic_GIS_Renovation_March2013/"+\
        s+"/Shapefiles/"+s2+"_Geology.shp"
    shp = shpreader.Reader(fname)
    unitsymbols, unitnames, majorgroups, unitdescs = [], [], [], []
    for record, state in zip(shp.records(), shp.geometries()):
        unitsymbols.append(record.attributes['UnitSymbol'])
        try:
            unitnames.append(record.attributes['UnitName'])
        except:
            unitnames.append(record.attributes['UnitName_1'])
        majorgroups.append(record.attributes['MajorGroup'])
        unitdescs.append(record.attributes['UnitDescri'])
        
    tempdf = pd.DataFrame.from_dict({'UnitSymbol':unitsymbols, 'UnitName':unitnames,
                                     'MajorGroup':majorgroups, 'UnitDescri':unitdescs})
    tempdf.drop_duplicates(subset='UnitSymbol', inplace=True, keep='first')
    tempdf['Data_source'] = s
    totaldf = pd.concat([totaldf, tempdf])
    
totaldf['Duplicated'] = totaldf.duplicated(subset='UnitSymbol')
totaldf.sort_values(by=['MajorGroup', 'UnitSymbol'], inplace=True)
totaldf.dropna(subset=['UnitSymbol'], inplace=True)
totaldf = totaldf[totaldf['UnitSymbol'].str.len() > 0]
totaldf.to_csv('./data/unit_descriptions_from_files.csv', index=False)

display(totaldf.head())
print(len(totaldf[totaldf['Duplicated'] == True]), 'duplicated symbols')
print(len(totaldf[totaldf['Duplicated'] == False]), 'unit symbols that are only found in one dataset')

Unnamed: 0,UnitSymbol,UnitName,MajorGroup,UnitDescri,Data_source,Duplicated
86,cf,,,,I-1062,False
133,INbl,Undivided Lineated Basin Material,Basin Materials,"Undivided Lineated Basin Material, Imbrian and...",I-1062,False
23,Ia,Alpes Formation,Basin Materials,"Alpes Formation, Imbrian System",I-1062,False
70,If,Fra Mauro Formation,Basin Materials,"Fra Mauro Formation, Imbrian System",I-1062,True
822,Iic,Material of Imbrium-Basin Secondary-Impact Cra...,Basin Materials,Material of Imbrium-Basin Secondary-Impact Cra...,I-1162,False


87 duplicated symbols
107 unit symbols that are only found in one dataset
