# West Virginia Production dataset - WVGES version



In [1]:
# python/pandas preamble
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import zipfile, os

# different sets contain input files that are formatted similarly
#set1_dir = './sources/set1/'
#set2_dir = './sources/set2/'
#set3_dir = './sources/set3/'
#set4_dir = './sources/set4/'
datadir = './sources/'
outdir = './out/'

# Production data from WVGES

See website <http://www.wvgs.wvnet.edu/www/datastat/devshales.htm>  for lots of resources.  This includes
an excel sheet with lots of metadata on each unconventional well. We use that here to filter out conventional wells and for lat/lon, as well as monthly production.

**Note that the original file is in read-only lock.  You must make an explicit copy within Excel to successfully read in these data.**

In [2]:
klst = ['API','YEAR','G_JAN','G_FEB','G_MAR','G_APR','G_MAY','G_JUN','G_JUL','G_AUG','G_OCT','G_NOV','G_DEC',
        'O_JAN','O_FEB','O_MAR','O_APR','O_MAY','O_JUN','O_JUL','O_AUG','O_OCT','O_NOV','O_DEC',
        'NGL_JAN','NGL_FEB','NGL_MAR','NGL_APR','NGL_MAY','NGL_JUN','NGL_JUL','NGL_AUG','NGL_OCT','NGL_NOV','NGL_DEC',]
prod = pd.read_excel(datadir+'Copy of WVGES Marcellus Wells.xls',sheet_name='WV Marcellus Prod',
                     usecols=klst, skiprows=1)

The dataset has some records back to 1979 for working wells that are fracked in the early 2000's.  Presumably, the back records are included to demonstrate the difference between conventional and non-conventional.   I am leaving off years before 2002 (the earliest spud date) to keep the dataset from being swamped with empty cells.

In [3]:
prod = prod[prod.YEAR>2001]
prod.head()

Unnamed: 0,API,YEAR,G_JAN,G_FEB,G_MAR,G_APR,G_MAY,G_JUN,G_JUL,G_AUG,...,NGL_FEB,NGL_MAR,NGL_APR,NGL_MAY,NGL_JUN,NGL_JUL,NGL_AUG,NGL_OCT,NGL_NOV,NGL_DEC
0,4700102885,2008,0.0,0.0,956.0,1082.0,1397.0,1158.0,1179.0,1013.0,...,,,,,,,,,,
1,4700102885,2009,758.0,1104.0,774.0,881.0,437.0,711.0,408.0,346.0,...,,,,,,,,,,
2,4700102885,2010,1797.0,1385.0,1270.0,1024.0,899.0,1170.0,952.0,747.0,...,,,,,,,,,,
3,4700102885,2011,748.0,717.0,617.0,590.0,590.0,689.0,636.0,512.0,...,,,,,,,,,,
4,4700102885,2012,589.0,636.0,719.0,602.0,677.0,433.0,583.0,259.0,...,,,,,,,,,,


In [4]:
prod.groupby("YEAR").count()

Unnamed: 0_level_0,API,G_JAN,G_FEB,G_MAR,G_APR,G_MAY,G_JUN,G_JUL,G_AUG,G_OCT,...,NGL_FEB,NGL_MAR,NGL_APR,NGL_MAY,NGL_JUN,NGL_JUL,NGL_AUG,NGL_OCT,NGL_NOV,NGL_DEC
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002,53,53,53,53,53,53,53,53,53,53,...,0,0,0,0,0,0,0,0,0,0
2003,55,55,55,55,55,55,55,55,55,55,...,0,0,0,0,0,0,0,0,0,0
2004,58,58,58,58,58,58,58,58,58,58,...,0,0,0,0,0,0,0,0,0,0
2005,126,126,126,126,126,126,126,126,126,126,...,0,0,0,0,0,0,0,0,0,0
2006,380,380,380,380,380,380,380,380,380,380,...,0,0,0,0,0,0,0,0,0,0
2007,734,734,734,734,734,734,734,734,734,734,...,0,0,0,0,0,0,0,0,0,0
2008,1218,1218,1218,1218,1218,1218,1218,1218,1218,1218,...,0,0,0,0,0,0,0,0,0,0
2009,1378,1378,1378,1378,1378,1378,1378,1378,1378,1378,...,0,0,0,0,0,0,0,0,0,0
2010,1563,1563,1563,1563,1563,1563,1563,1563,1563,1563,...,0,0,0,0,0,0,0,0,0,0
2011,1802,1802,1802,1802,1802,1802,1802,1802,1802,1802,...,0,0,0,0,0,0,0,0,0,0


In [4]:
xlate = {'JAN':'01', 'FEB':'02', 'MAR':'03', 'APR':'04', 'MAY':'05', 'JUN':'06',
         'JUL':'07', 'AUG':'08', 'SEP':'09', 'OCT':'10', 'NOV':'11', 'DEC':'12'}

def getColName(row):
    yr = str(row.YEAR)
    mo = xlate[row.variable[-3:]]
    return f'Y{yr}M{mo}'
    

mprod = prod.melt(id_vars=['API','YEAR'])
mprod = mprod.groupby(['API','YEAR','variable'],as_index=False)['value'].max()

mprod['typ'] = np.where(mprod.variable.str[0]=='G','GAS',np.nan)
mprod.typ = np.where(mprod.variable.str[0]=='O','OIL',mprod.typ)
mprod.typ = np.where(mprod.variable.str[0]=='N','NGL',mprod.typ)
mprod['yrmo'] = mprod.apply(lambda x: getColName(x),axis=1)
mprod.tail()

Unnamed: 0,API,YEAR,variable,value,typ,yrmo
855487,4710903065,2017,O_JUN,,OIL,Y2017M06
855488,4710903065,2017,O_MAR,,OIL,Y2017M03
855489,4710903065,2017,O_MAY,,OIL,Y2017M05
855490,4710903065,2017,O_NOV,,OIL,Y2017M11
855491,4710903065,2017,O_OCT,,OIL,Y2017M10


In [6]:
mprod.to_pickle(outdir+'ver2.pkl')

# Metadata from WVGES

In [6]:
klst = ['API Number','County Name','Operator at Completion',
        'Surface Loc Longitude','Surface Loc Latitude','Spud Date']
meta = pd.read_excel(datadir+'Copy of WVGES Marcellus Wells.xlsx',sheet_name='Completed Marcellus Wells',
                     usecols=klst, skiprows=1)
meta.columns = ['API','CountyName','OperatorName',
        'Surf_Long','Surf_Lat','Spud Date']

meta.head()



Unnamed: 0,API,CountyName,OperatorName,Surf_Long,Surf_Lat,Spud Date
0,4700102885,Barbour,Seneca-Upshur Petroleum Corp.,-79.97391,39.008158,8/13/2007
1,4700102981,Barbour,"EXCO - North Coast Energy Eastern, Inc.",-80.108525,39.045433,8/30/2008
2,4700102985,Barbour,Dominion Exploration & Production,-80.020647,39.20418,7/30/2009
3,4700102994,Barbour,Petroleum Development Corp.,-80.016268,39.185397,1/17/2009
4,4700103013,Barbour,Petroleum Development Corp.,-80.170851,39.232015,7/15/2009


In [7]:
print(f'Len meta: {len(meta)}, unique API: {len(meta.API.unique())}')

Len meta: 3718, unique API: 3703


In [8]:
#  Need to take most recent meta record:
meta = meta.groupby('API',as_index=False).last()

In [9]:
print(f'Len meta: {len(meta)}, unique API: {len(meta.API.unique())}')

Len meta: 3703, unique API: 3703


# Make the output files

In [10]:
def makeFile(data,prodName):
    mprod = data[data.typ==prodName].filter(['API','yrmo','value'],axis=1)
    pivot = mprod.pivot(index='API',columns='yrmo')
    pivot.columns = pivot.columns.get_level_values(1)
    #pivot.head()
    mg = pd.merge(meta,pivot,on='API',how='right',validate='1:1')
    #mg.head()
    mg.to_excel(outdir+'WV_'+prodName+'_production_through_2018.xlsx')

In [11]:
makeFile(mprod,'GAS')

In [12]:
makeFile(mprod,'OIL')

In [13]:
makeFile(mprod,'NGL')