# MSHA Mines Data Munging

The data sets are on the [MSHA data sets page](https://arlweb.msha.gov/opengovernmentdata/ogimsha.asp).

This notebook munges the MSHA mines data, reducing its columns and rows to only those required by a separate application.

In [1]:
from zipfile import ZipFile

import pandas as pd
from IPython.display import display

Read the mines list (`Mines.txt`) from the `Mines.zip` archive, loading only the columns that are needed.

In [2]:
# Mines.zip is opened from the current working directory, so it must already be there.
# To watch the download in a terminal:
#   du -h Mines.zip

# Only these columns are loaded; every other column in the file is skipped.
MINES_COLUMNS = [
    "MINE_ID",
    "CURRENT_MINE_NAME",
    "COAL_METAL_IND",
    "CURRENT_OPERATOR_ID",
    "CURRENT_OPERATOR_NAME",
    "STATE",
    "CURRENT_MINE_STATUS",
]

with ZipFile('Mines.zip', 'r') as zf:
    print(zf.namelist())  # list the archive members
    with zf.open('Mines.txt') as fp:
        # The file is read as pipe-delimited, Latin-1 encoded text.
        # parse_dates=True was removed: as a bare boolean it only asks pandas to
        # try parsing the index as dates, and no index column is set here.
        # NOTE(review): MINE_ID is left to dtype inference; the displayed values
        # (e.g. 100003) suggest it is parsed as an integer, so leading zeros in
        # the file, if any, would be lost -- confirm whether dtype={"MINE_ID": str}
        # is wanted.
        mines = pd.read_table(fp, encoding='latin-1', sep='|',
                              usecols=MINES_COLUMNS)

['Mines.txt']


In [3]:
# (rows, columns) of the frame as loaded, before any filtering.
mines.shape

(86880, 7)

In [4]:
# Column labels of the loaded frame.
mines.columns

Index(['MINE_ID', 'CURRENT_MINE_NAME', 'COAL_METAL_IND', 'CURRENT_MINE_STATUS',
       'CURRENT_OPERATOR_ID', 'CURRENT_OPERATOR_NAME', 'STATE'],
      dtype='object')

In [5]:
# Keep only the rows whose CURRENT_MINE_STATUS is exactly "Active".
is_active = mines["CURRENT_MINE_STATUS"] == "Active"
mines = mines.loc[is_active]
mines.shape

(6352, 7)

In [6]:
# Keep only the rows whose COAL_METAL_IND is exactly "M".
has_m_indicator = mines["COAL_METAL_IND"] == "M"
mines = mines.loc[has_m_indicator]
mines.shape

(5482, 7)

In [7]:
# Show the first two rows with the column-width limit raised to 9999
# characters so long text values are not truncated in the rendered table.
first_rows = mines.head(2)
with pd.option_context('display.max_colwidth', 9999):
    display(first_rows)

Unnamed: 0,MINE_ID,CURRENT_MINE_NAME,COAL_METAL_IND,CURRENT_MINE_STATUS,CURRENT_OPERATOR_ID,CURRENT_OPERATOR_NAME,STATE
0,100003,O'Neal Quarry & Mill,M,Active,L13586,Lhoist North America,AL
1,100004,Brierfield Quarry,M,Active,L13586,"Lhoist North America of Alabama, LLC",AL


In [8]:
# Show the last two rows of the filtered frame.
mines.tail(2)

Unnamed: 0,MINE_ID,CURRENT_MINE_NAME,COAL_METAL_IND,CURRENT_MINE_STATUS,CURRENT_OPERATOR_ID,CURRENT_OPERATOR_NAME,STATE
86826,5500008,Brookman Quarry,M,Active,72683,Heavy Materials LLC,VI
86831,5500013,Aggregate Inc,M,Active,51892,Aggregate Inc,VI


In [9]:
# drop duplicate rows, by default keeping the first in each set of duplicates
mines.drop_duplicates()
mines.shape

(5482, 7)

In [10]:
# Write the filtered frame to mines.csv. index=True is the to_csv default,
# spelled out here because it means the row index is written as the first column.
mines.to_csv("mines.csv", index=True)

In [11]:
# Disabled example: select rows whose CURRENT_MINE_NAME contains one or more
# of the given words (missing names are treated as non-matching via na=False).
#mines[mines["CURRENT_MINE_NAME"]
#           .str.contains("Fairport|Windsor|Baker", na = False)].head()