# MSHA Mines Data Munging

The data sets are on the [MSHA data sets page](https://arlweb.msha.gov/opengovernmentdata/ogimsha.asp).

The purpose of this code is to munge the data in order to reduce the number of columns and rows to what is required for a different application.

In [1]:
from zipfile import ZipFile

import pandas as pd
from IPython.display import display

Read the mines definition file (the data dictionary that describes each column of the mines data set).

In [2]:
# Fetch the pipe-delimited column definitions for the Mines table
# directly from the MSHA website.
mines_definition = pd.read_csv(
    'https://arlweb.msha.gov/opengovernmentdata/DataSets/Mines_Definition_File.txt',
    delimiter='|',
)

In [17]:
# Show the first five definitions without truncating the long description text.
with pd.option_context('display.max_colwidth', 9999):
    display(mines_definition.head(n=5))

Unnamed: 0,TABLE_NAME,COLUMN_NAME,DATA_TYPE,DATA_LENGTH,FIELD_DESCRIPTION
0,MINES,MINE_ID,VARCHAR2,7,"Identification number assigned to the mine by MSHA. It is a unique primary key to join to the Inspections, Mine Address, Accidents, Annual Employ/Prod and Qrtly Employ/Prod tables."
1,MINES,CURRENT_MINE_NAME,VARCHAR2,50,Name of the mine as designated on the Legal ID Form (LID) or Mine Information Form (MIF).
2,MINES,COAL_METAL_IND,VARCHAR2,1,Identifies if the mine is a Coal or Metal/Non-Metal mine.
3,MINES,CURRENT_MINE_TYPE,VARCHAR2,20,"From the Legal ID (LID) form. The types are Facility, Surface or Underground."
4,MINES,CURRENT_MINE_STATUS,VARCHAR2,50,"Current status of the mine. Values are Abandoned, Abandoned and Sealed, Active, Intermittent, New Mine, NonProducing and Temporarily Idled."


In [4]:
# Columns to load from Mines.txt; every other column is skipped on read.
columns_to_read = [
    'MINE_ID',
    'CURRENT_MINE_NAME',
    'COAL_METAL_IND',
    'CURRENT_OPERATOR_ID',
    'CURRENT_OPERATOR_NAME',
    'CURRENT_MINE_STATUS',
    'LONGITUDE',
    'LATITUDE',
    'NO_EMPLOYEES',
]

In [5]:
# Read Mines.txt out of a local Mines.zip (expected in the working directory).
# To watch the download in a terminal:
#   du -h Mines.zip
with ZipFile('Mines.zip', 'r') as zf:
    # List the archive members as a quick sanity check of the download.
    print(zf.namelist())
    with zf.open('Mines.txt') as fp:
        # parse_dates=True was removed: as a bare boolean it only asks pandas
        # to parse the index, and no index column is read here, so it did
        # nothing. Line continuations are implicit inside the parentheses.
        mines = pd.read_table(fp, encoding='latin-1', sep='|',
                              usecols=columns_to_read)

['Mines.txt']


In [6]:
# (rows, columns) of the frame just read from Mines.txt
mines.shape

(86880, 9)

In [7]:
# Column labels of the loaded frame
mines.columns

Index(['MINE_ID', 'CURRENT_MINE_NAME', 'COAL_METAL_IND', 'CURRENT_MINE_STATUS',
       'CURRENT_OPERATOR_ID', 'CURRENT_OPERATOR_NAME', 'NO_EMPLOYEES',
       'LONGITUDE', 'LATITUDE'],
      dtype='object')

In [8]:
# Keep only the mines whose current status is exactly "Active",
# then report the reduced size.
mines = mines.loc[mines.CURRENT_MINE_STATUS.eq("Active")]
mines.shape

(6352, 9)

In [9]:
# Keep only rows flagged "M" in COAL_METAL_IND
# (presumably Metal/Non-Metal rather than Coal - confirm against the definition file).
mines = mines.loc[mines['COAL_METAL_IND'].eq("M")]
mines.shape

(5482, 9)

In [18]:
# MINE_ID values of the specific mines to keep (a list, matched as integers).
mine_ids = [
    1400411, 1400412, 1400413, 1600358,
    1600509, 1600970, 2000552, 2901042,
    2901043, 2901785, 3000663, 3003255,
    3301993, 3301994, 4101776, 4102478,
]

In [19]:
# Narrow the frame to the rows whose MINE_ID appears in mine_ids.
mines = mines.loc[mines.MINE_ID.isin(mine_ids)]

In [20]:
# Preview the first five selected mines without truncating text columns.
with pd.option_context('display.max_colwidth', 9999):
    display(mines.head(n=5))

Unnamed: 0,MINE_ID,CURRENT_MINE_NAME,COAL_METAL_IND,CURRENT_MINE_STATUS,CURRENT_OPERATOR_ID,CURRENT_OPERATOR_NAME,NO_EMPLOYEES,LONGITUDE,LATITUDE
17203,1400411,Independent Salt Company,M,Active,L11317,Independent Salt Company,58.0,-98.127222,38.720833
17204,1400412,Hutchinson Salt Company,M,Active,L11799,Hutchinson Salt Company,52.0,-97.871111,38.046389
17205,1400413,Lyons Salt,M,Active,L11902,Lyons Salt Company,47.0,-98.19,38.333056
31986,1600358,Cote Blanche Mine,M,Active,0136233,"Compass Minerals Louisiana, Inc.",161.0,-91.723611,39.750833
32018,1600509,AVERY ISLAND,M,Active,L00028,Cargill Deicing Technology,178.0,-91.910278,29.895


In [13]:
# Drop duplicate rows, by default keeping the first in each set of duplicates.
# drop_duplicates() returns a new frame rather than modifying in place, so the
# result has to be assigned back; a bare call leaves `mines` unchanged.
mines = mines.drop_duplicates()
mines.shape

(16, 9)

In [14]:
# Make every longitude negative (degrees west).
# -abs() instead of `* -1` keeps this cell idempotent: re-running it no longer
# flips the sign back to positive. assign() builds a new frame, which avoids
# the SettingWithCopyWarning raised when writing a column into a filtered frame.
# NOTE(review): like the original unconditional sign flip, this assumes every
# mine lies in the western hemisphere - confirm for the full data set.
mines = mines.assign(LONGITUDE=-mines['LONGITUDE'].abs())
with pd.option_context('display.max_colwidth', 9999):
    display(mines.head(100))

Unnamed: 0,MINE_ID,CURRENT_MINE_NAME,COAL_METAL_IND,CURRENT_MINE_STATUS,CURRENT_OPERATOR_ID,CURRENT_OPERATOR_NAME,NO_EMPLOYEES,LONGITUDE,LATITUDE
17203,1400411,Independent Salt Company,M,Active,L11317,Independent Salt Company,58.0,-98.127222,38.720833
17204,1400412,Hutchinson Salt Company,M,Active,L11799,Hutchinson Salt Company,52.0,-97.871111,38.046389
17205,1400413,Lyons Salt,M,Active,L11902,Lyons Salt Company,47.0,-98.19,38.333056
31986,1600358,Cote Blanche Mine,M,Active,0136233,"Compass Minerals Louisiana, Inc.",161.0,-91.723611,39.750833
32018,1600509,AVERY ISLAND,M,Active,L00028,Cargill Deicing Technology,178.0,-91.910278,29.895
32162,1600970,Weeks Island Mine and Mill,M,Active,0114165,Morton Salt Inc,227.0,-91.811389,29.805556
34545,2000552,DETROIT SALT MINE,M,Active,L15686,Detroit Salt Co LLC,41.0,-83.148889,42.282778
44463,2901042,CARLSBAD LAKE PLANT,M,Active,L00708,United Salt Carlsbad LLC,40.0,-103.973056,32.3925
44464,2901043,CARLSBAD PLANT,M,Active,L00708,United Salt Carlsbad LLC,38.0,-103.989167,32.552222
44704,2901785,New Mexico Salt & Minerals Corp,M,Active,L06884,New Mexico Salt & Minerals Corp,1.0,-104.041111,32.304444


In [15]:
# Write the munged subset to mines.csv in the working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column - confirm the consuming application expects that.
mines.to_csv("mines.csv")

In [16]:
# select rows that contain one or more words
#mines[mines["CURRENT_MINE_NAME"]
#           .str.contains("Fairport|Windsor|Baker", na = False)].head()