# MSHA Mines + Violations

The data sets are available from the [MSHA data sets page](https://arlweb.msha.gov/opengovernmentdata/ogimsha.asp).

In [None]:
from datetime import datetime
# Record the wall-clock start time; the final cell subtracts it from the
# end time to report the notebook's total run time in seconds.
start_time = datetime.now()

In [None]:
import pandas as pd
from IPython.display import display

Import the [mine definition file](https://arlweb.msha.gov/opengovernmentdata/DataSets/Mines_Definition_File.txt).

In [None]:
# import from local storage
#mines_definition = pd.read_table('Mines_Definition_File.txt', sep='|')

In [None]:
# Load the mine definition file straight from the MSHA website; the file is
# pipe-delimited.
mines_definition = pd.read_csv(
    'https://arlweb.msha.gov/opengovernmentdata/DataSets/Mines_Definition_File.txt',
    sep='|',
)

In [None]:
mines_definition.shape

In [None]:
# Temporarily raise the column-width limit so long text values are not
# truncated in the preview.
with pd.option_context('display.max_colwidth', 9999):
    display(mines_definition.head(2))

Import the [violations definition file](https://arlweb.msha.gov/opengovernmentdata/DataSets/violations_Definition_File.txt).

In [None]:
#read file from local storage
#violations_definition = pd.read_table('violations_Definition_File.txt',
#                                 sep='|',
#                                 index_col='COLUMN_NAME')

In [None]:
# Load the violations definition file straight from the MSHA website. The
# file is pipe-delimited; COLUMN_NAME becomes the index so a definition can
# be looked up by column name.
violations_definition = pd.read_csv(
    'https://arlweb.msha.gov/opengovernmentdata/DataSets/violations_Definition_File.txt',
    sep='|',
    index_col='COLUMN_NAME',
)

In [None]:
# Temporarily raise the column-width limit so long text values are not
# truncated in the preview.
with pd.option_context('display.max_colwidth', 9999):
    display(violations_definition.head(2))

Read the mines list file.

In [None]:
from os import access, R_OK, unlink

import requests


def download(url, *, to, _chunk_size=8 << 10):
    '''
    Stream the contents of url to the file at path to.

    url: address to fetch with an HTTP GET request.
    to: path of the local file to (over)write; keyword-only.
    _chunk_size: number of bytes per write. 8 KiB was chosen arbitrarily
        and could be tuned after profiling.

    Raises requests.HTTPError if the server answers with an error status
    (nothing is written in that case). Any error raised while the body is
    being written removes the partial file and is then re-raised, so a
    failed download is neither reported as a success nor left on disk.
    '''
    # The context manager releases the connection even when the body is not
    # read to the end.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        try:
            with open(to, 'wb') as fp:
                for chunk in r.iter_content(chunk_size=_chunk_size):
                    fp.write(chunk)
        except BaseException:
            # Also covers KeyboardInterrupt: a truncated file must not
            # survive, because the calling code treats an existing,
            # readable file as a complete download.
            try:
                unlink(to)
            except FileNotFoundError:
                # open() itself failed, so there is nothing to remove.
                pass
            raise


#try:
#    st = stat('Mines.zip')
#    # TODO: Which GMT
#    # Locales can mess this up, but let's assume only en_US for now.
#    headers = {'If-Modified-Since':
#                   st.mtime
#                     .astimezone()
#                     .strftime('%a, %d %b %Y %H:%M:%S GMT')}

# Archives that are already present and readable in the working directory
# (for example, downloaded by hand) are used as they are; anything missing
# is fetched from MSHA.
for url, path in (
    ('https://arlweb.msha.gov/opengovernmentdata/DataSets/Mines.zip',
     'Mines.zip'),
    ('https://arlweb.msha.gov/opengovernmentdata/DataSets/Violations.zip',
     'Violations.zip'),
):
    if not access(path, R_OK):
        download(url, to=path)

In [None]:
from zipfile import ZipFile

# Read Mines.txt directly out of the archive without extracting it to disk.
with ZipFile('Mines.zip') as archive:
    # List the archive's members before opening the expected one.
    print(archive.namelist())
    with archive.open('Mines.txt') as member:
        mines = pd.read_csv(member, sep='|', encoding='latin-1',
                            parse_dates=True)

# To watch the archive grow while it downloads, run this in a terminal:
#     du -h Mines.zip

In [None]:
#from zipfile import ZipFile
#
#with ZipFile('Mines.zip', 'r') as zf:
#    print(zf.namelist())
#    with zf.open('Mines.txt') as fp:
#        mines = pd.read_table(fp, encoding='latin-1', sep='|', 
#                              usecols=["MINE_ID", "CURRENT_MINE_NAME"],
#                              parse_dates=True)

In [None]:
mines.shape

In [None]:
mines.columns

In [None]:
#mines = pd.read_table('Mines.txt', encoding='latin-1', sep='|', 
#                       usecols=["MINE_ID", "CURRENT_MINE_NAME"])
#mines.head(5)

In [None]:
#mines.head(2)
# Temporarily raise the column-width limit so long text values are not
# truncated in the preview.
with pd.option_context('display.max_colwidth', 9999):
    display(mines.head(2))

In [None]:
mines.tail(2)

In [None]:
#pd.crosstab([mines.STATE], mines.CURRENT_MINE_TYPE, margins=True)

In [None]:
#from IPython.display import display
#with pd.option_context('display.max_rows', 1000):
#    display(pd.crosstab([mines.STATE, mines.CURRENT_MINE_STATUS],
#                         mines.CURRENT_MINE_TYPE, margins=True))

In [None]:
# use this to read the data and delete the last row, if it is consistent every week
#violations = pd.read_table('Violations.txt', encoding='latin-1', sep='|', \
#                           parse_dates=True, skip_footer=1)

In [None]:
#violations = pd.read_csv('Violations.txt', encoding='latin-1', sep='|', parse_dates=True)

In [None]:
#violations = pd.read_csv('Violations.txt', encoding='latin-1', sep='|', 
#                           parse_dates=["INSPECTION_BEGIN_DT"], 
#                           usecols=["MINE_ID", "INSPECTION_BEGIN_DT", "MINE_NAME", 
#                                    "PROPOSED_PENALTY", "PART_SECTION", "EVENT_NO"])

In [None]:
%%time
# Read Violations.txt directly out of the archive, loading only a subset of
# its columns and parsing the inspection start date while reading.
with ZipFile('Violations.zip') as archive, \
        archive.open('Violations.txt') as member:
    violations = pd.read_csv(
        member,
        sep='|',
        encoding='latin-1',
        usecols=["MINE_ID", "INSPECTION_BEGIN_DT", "MINE_NAME",
                 "PROPOSED_PENALTY", "PART_SECTION", "EVENT_NO"],
        parse_dates=["INSPECTION_BEGIN_DT"],
        infer_datetime_format=True,
    )

# To watch the archive grow while it downloads, run this in a terminal:
#     du -h Violations.zip

In [None]:
violations.shape

In [None]:
violations.head()

In [None]:
# The raw file stores INSPECTION_BEGIN_DT as MM/DD/YYYY text, which does not
# sort chronologically as a string. The column was parsed as dates when the
# file was read (parse_dates), so sorting on it orders the rows by date.
violations = violations.sort_values(by="INSPECTION_BEGIN_DT")
violations.head()

In [None]:
# Keep only the rows whose inspection began on or after 2017-01-01.
violations = violations.loc[violations["INSPECTION_BEGIN_DT"] >= "2017-01-01"]
violations.head()

In [None]:
violations.shape

In [None]:
# identify duplicate rows, by default keep the first in each set of duplicates
violations.duplicated().value_counts()

In [None]:
# drop duplicate rows, by default keeping the first in each set of duplicates
#violations.drop_duplicates()

In [None]:
# Preview the rows whose mine name contains any of the listed words; rows
# with a missing name count as non-matching (na=False).
violations.loc[
    violations["MINE_NAME"].str.contains("Fairport|Windsor|Baker", na=False)
].head()

In [None]:
violations.index

In [None]:
#for column_index in (0,25,34,36,37,46,55,56,58):
#    column_name = violations.columns[column_index]
#    with pd.option_context('display.max_colwidth', 9999):
#        display(violations_definition.loc[column_name],
#                set(map(type,
#                        violations[column_name].unique())))

In [None]:
violations.columns

In [None]:
# Temporarily raise the column-width limit so long text values are not
# truncated in the preview.
with pd.option_context('display.max_colwidth', 9999):
    display(violations.head(2))

In [None]:
violations.tail(2)

In [None]:
# Delete the last row.
# NOTE(review): by this point the frame has been sorted by
# INSPECTION_BEGIN_DT and filtered to dates on or after 2017-01-01, so the
# last row is no longer the last line of Violations.txt. If the intent was
# to discard a trailer line from the raw file, confirm this still removes
# the right row.
violations = violations.drop(violations.tail(1).index)

In [None]:
violations.tail(2)

In [None]:
# Number of rows per EVENT_NO value; value_counts orders the result from the
# most frequent value to the least frequent.
violation_counts = violations['EVENT_NO'].value_counts()
violation_counts.head()

In [None]:
# counts for column content
#violations['CIT_ORD_SAFE'].value_counts()

In [None]:
# counts for column content
#violations['PART_SECTION'].value_counts()

In [None]:
#pd.options.display.max_rows=1000
#pd.crosstab([violations['MINE_ID']], violations['PART_SECTION'], margins=True)

In [None]:
#violations.insert(loc=9, column='CURRENT_OPERATOR_NAME', value='')

In [None]:
#violations.head(2)

In [None]:
# Column names that appear in both tables.
set(violations.columns).intersection(mines.columns)

In [None]:
%time df = pd.crosstab([violations.MINE_ID], violations.PART_SECTION, margins=True)

In [None]:
# Look up each mine's current operator by MINE_ID and attach it to the
# crosstab as a regular column.
rhs = mines[['MINE_ID', 'CURRENT_OPERATOR_NAME']].set_index('MINE_ID')
# df[[]] is the crosstab with no columns, i.e. just its index, so the join
# result carries only the looked-up operator name for each index entry.
df['CURRENT_OPERATOR_NAME'] = df[[]].join(rhs, how='left')['CURRENT_OPERATOR_NAME']

In [None]:
# Move the operator name out of the columns and into the index, next to the
# existing index level (append=True keeps that level).
df = df.set_index(keys='CURRENT_OPERATOR_NAME', append=True)
df.head()

In [None]:
from sys import getsizeof

# Report the size of every object in the notebook namespace, largest first.
# Sizes are divided by 1024**3, so the column is labelled in GB, not bytes.
# sys.getsizeof is shallow: it does not follow references into contained
# objects, so plain containers may be under-reported.
# list(...) takes a snapshot first, because the namespace dict must not
# change size while it is being walked. The snapshot is a temporary, so it
# does not keep deleted objects alive afterwards.
memory_report = pd.DataFrame.from_records(
    [(name, getsizeof(obj) / 1024**3)
     for name, obj in list(locals().items())],
    columns=('name', 'size (GB)'),
    index='name',
).sort_values('size (GB)', ascending=False)
memory_report

We really should `del` unused objects, such as `df`.

In [None]:
%reset -f in out

In [None]:
del df

In [None]:
from sys import getsizeof

# Report the size in GB (bytes / 1024**3) of every object in the notebook
# namespace, largest first.
# sys.getsizeof is shallow: it does not follow references into contained
# objects, so plain containers may be under-reported.
# list(...) takes a snapshot first, because the namespace dict must not
# change size while it is being walked. The snapshot is a temporary, so it
# does not keep deleted objects alive afterwards.
memory_report = pd.DataFrame.from_records(
    [(name, getsizeof(obj) / 1024**3)
     for name, obj in list(locals().items())],
    columns=('name', 'size (GB)'),
    index='name',
).sort_values('size (GB)', ascending=False)
memory_report

In [None]:
# Wall-clock time, in seconds, elapsed since start_time was recorded in the
# notebook's first cell.
end_time = datetime.now()
(end_time - start_time).total_seconds()