## DUC DATA PRODUCTION INFO TO TIME SERIES

### WORKFLOW

    - Import Well Production Info
    - Group Production file by EPAssetsId and ProdType and pivot to get a time series of Production and Hours
    - Sum Hours to cumulative hours to find the first month where cumulative Hours > 1400, to infer a completion date for wells with missing completion data
    - Get time series for each fluid type and BOE in monthly and cumulative tables
    - Get an inferred completion date when there are 3 months of reported production (for wells missing hours)
    - Get a last month of production date and add as a feature
    - Check total "volumes" (volumes and hours) against the original file
    - Save Wells dataframe with cumulative production info
    

In [None]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [None]:
# Load file
# Read the well production records into the Prodn DataFrame.
Prodn = pd.read_csv('WellProduction_No_Duplicates.csv')

In [None]:
# Preview the first rows of the raw production table.
Prodn.head()

In [None]:
# Remove the 'Unnamed: 0' column from Prodn in place.
Prodn.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# Column dtypes and non-null counts of the production table.
Prodn.info()

### Get number of wells present in each ProdType

In [None]:
# Number of distinct wells that have 'Production Hours' records.
print(
    'Well count with Hours',
    len(set(Prodn.loc[Prodn['ProdType'] == 'Production Hours', 'EPAssetsId'])),
)

In [None]:
# Number of distinct wells that have 'Oil Production (Bbls)' records.
print(
    'Well count with Oil Production',
    len(set(Prodn.loc[Prodn['ProdType'] == 'Oil Production (Bbls)', 'EPAssetsId'])),
)

In [None]:
# Number of distinct wells that have 'Gas Production (MMcf)' records.
print(
    'Well count with Gas Production',
    len(set(Prodn.loc[Prodn['ProdType'] == 'Gas Production (MMcf)', 'EPAssetsId'])),
)

In [None]:
# Number of distinct wells that have 'Condensate Production (Bbls)' records.
print(
    'Well count with Condensate Production',
    len(set(Prodn.loc[Prodn['ProdType'] == 'Condensate Production (Bbls)', 'EPAssetsId'])),
)

In [None]:
# Number of distinct wells that have 'Water Production (Bbls)' records.
print(
    'Well count with Water Production',
    len(set(Prodn.loc[Prodn['ProdType'] == 'Water Production (Bbls)', 'EPAssetsId'])),
)

## Get Time Series for Hours

### Hours

In [None]:
# Wide table of production hours: one row per EPAssetsId, one column per
# ProdPeriod, filled from the Volume field of the 'Production Hours' records.
P_Hours = (
    Prodn.loc[Prodn['ProdType'] == 'Production Hours']
    .pivot(index='EPAssetsId', columns='ProdPeriod', values='Volume')
    .reset_index()
)

P_Hours.head()

In [None]:
# (rows, columns) of the hours table: wells x (EPAssetsId + period columns).
P_Hours.shape

In [None]:
# Well/period combinations with no record are missing after the pivot; treat them as 0 hours.
P_Hours.fillna(0, inplace = True)
P_Hours.head()

In [None]:
# Cumulative hours per well: EPAssetsId followed by a left-to-right running
# total of the monthly hours columns.
# The month columns are taken from P_Hours itself instead of a hard-coded first
# label and range(2, 62), so the cell works for any number of reporting periods.
# 'Inferred_Comp_Date' is added to P_Hours further down the notebook; it is
# skipped here so that re-running this cell does not try to sum date strings.
month_cols = [c for c in P_Hours.columns if c not in ('EPAssetsId', 'Inferred_Comp_Date')]

P_Cum_Hours = pd.concat(
    [P_Hours[['EPAssetsId']], P_Hours[month_cols].cumsum(axis=1)],
    axis=1,
)

P_Cum_Hours.head()

In [None]:
# Threshold table with the same layout as P_Hours: True where a well's
# cumulative hours exceed 1400 in that period. The first period is fixed at 0.
P_Comp_Flag_by_Hours = pd.DataFrame(columns=P_Hours.columns)
P_Comp_Flag_by_Hours['EPAssetsId'] = P_Hours['EPAssetsId']
P_Comp_Flag_by_Hours['2015-01-31 00:00:00.000'] = 0

# Positions 2..61 are hard-coded (assumes EPAssetsId at 0 and the first period at 1).
for position in range(2, 62):
    cumulative_hours = P_Cum_Hours.iloc[:, position]
    P_Comp_Flag_by_Hours.iloc[:, position] = cumulative_hours > 1400

P_Comp_Flag_by_Hours.head()

In [None]:
# Status-change table: a 1 marks the period in which P_Comp_Flag_by_Hours
# switches from "not above 1400 cumulative hours" to "above"; every other
# period holds 0. That period is the basis for the effective completion month.
P_Comp_Flag = pd.DataFrame(columns=P_Hours.columns)
P_Comp_Flag['EPAssetsId'] = P_Hours['EPAssetsId']
P_Comp_Flag.fillna(0, inplace=True)

for position in range(2, 62):
    before = P_Comp_Flag_by_Hours.iloc[:, position - 1]
    after = P_Comp_Flag_by_Hours.iloc[:, position]
    # The flag rose between consecutive periods -> status changed here.
    P_Comp_Flag.iloc[:, position] = (before < after) * 1
P_Comp_Flag.head()

In [None]:
# Convert the status-change flag into an inferred completion month: the period
# two columns before the one in which the flag is set.
#   If there is no change in status the effective completion is set to Dec 2020
#   (the '2020-12-31' sentinel below).
# Only the period columns are searched, and the two-period look-back is clamped
# to the first period. Previously a change flagged in the second period resolved
# to column 0 and stored the label 'EPAssetsId' as the completion date.
CompDates = []
Months = P_Comp_Flag.columns
month_cols = [c for c in Months if c not in ('EPAssetsId', 'Inferred_Comp_Date')]
status_changed = (P_Comp_Flag[month_cols] == 1).to_numpy()
for row_changes in status_changed:
    hits = np.flatnonzero(row_changes)
    if hits.size:
        # hits[0] is the 0-based period index of the first status change.
        CompDates.append(month_cols[max(int(hits[0]) - 2, 0)])
    else:
        CompDates.append('2020-12-31 00:00:00.000')
CompDates[0:5]

In [None]:
# Spot-check a few inferred completion dates further down the list.
CompDates[195:200]

In [None]:
# Store the inferred completion date next to the status-change flags.
P_Comp_Flag['Inferred_Comp_Date'] = CompDates

In [None]:
# Preview the flag table, now including Inferred_Comp_Date.
P_Comp_Flag.head()

## Add the Inferred Completion Date to the Hours and Cumulative Hours Time Series and Save

In [None]:
# Attach the inferred completion date to the monthly hours table and write it to CSV.
P_Hours['Inferred_Comp_Date'] = CompDates
P_Hours.to_csv('ProdHours_Time_Series.csv')

In [None]:
# Attach the inferred completion date to the cumulative hours table and write it to CSV.
P_Cum_Hours['Inferred_Comp_Date'] = CompDates
P_Cum_Hours.to_csv('Cum_ProdHours_Time_Series.csv')

In [None]:
# Write the status-change flags (with Inferred_Comp_Date) to CSV.
P_Comp_Flag.to_csv('ProdHours_Inferred_Completion_Flag.csv')

## Get Time Series for Gas

In [None]:
# Wide table of gas production: one row per EPAssetsId, one column per
# ProdPeriod, filled from the Volume field of the 'Gas Production (MMcf)' records.
P_Gas = (
    Prodn.loc[Prodn['ProdType'] == 'Gas Production (MMcf)']
    .pivot(index='EPAssetsId', columns='ProdPeriod', values='Volume')
    .reset_index()
)

P_Gas.head()

In [None]:
# (rows, columns) of the gas table.
P_Gas.shape

In [None]:
# Well/period combinations with no record are missing after the pivot; treat them as 0.
P_Gas.fillna(0, inplace = True)
P_Gas.head()

In [None]:
# Cumulative gas per well: EPAssetsId followed by a left-to-right running total
# of the monthly gas columns.
# The month columns are taken from P_Gas itself instead of a hard-coded first
# label and range(2, 62), so the cell works for any number of reporting periods.
month_cols = [c for c in P_Gas.columns if c != 'EPAssetsId']

P_Cum_Gas = pd.concat(
    [P_Gas[['EPAssetsId']], P_Gas[month_cols].cumsum(axis=1)],
    axis=1,
)

P_Cum_Gas.head()

## Get Time Series for Oil

In [None]:
# Wide table of oil production: one row per EPAssetsId, one column per
# ProdPeriod, filled from the Volume field of the 'Oil Production (Bbls)' records.
P_Oil = (
    Prodn.loc[Prodn['ProdType'] == 'Oil Production (Bbls)']
    .pivot(index='EPAssetsId', columns='ProdPeriod', values='Volume')
    .reset_index()
)

P_Oil.head()

In [None]:
# Preview the last rows of the oil table.
P_Oil.tail()

In [None]:
# (rows, columns) of the oil table.
P_Oil.shape

In [None]:
# Well/period combinations with no record are missing after the pivot; treat them as 0.
P_Oil.fillna(0, inplace = True)
P_Oil.head()

In [None]:
# Cumulative oil per well: EPAssetsId followed by a left-to-right running total
# of the monthly oil columns.
# The month columns are taken from P_Oil itself instead of a hard-coded first
# label and range(2, 62), so the cell works for any number of reporting periods.
month_cols = [c for c in P_Oil.columns if c != 'EPAssetsId']

P_Cum_Oil = pd.concat(
    [P_Oil[['EPAssetsId']], P_Oil[month_cols].cumsum(axis=1)],
    axis=1,
)

P_Cum_Oil.head()

## Get Time Series for Condensate

In [None]:
# Wide table of condensate production: one row per EPAssetsId, one column per
# ProdPeriod, filled from the Volume field of the 'Condensate Production (Bbls)' records.
P_Cond = (
    Prodn.loc[Prodn['ProdType'] == 'Condensate Production (Bbls)']
    .pivot(index='EPAssetsId', columns='ProdPeriod', values='Volume')
    .reset_index()
)

P_Cond.head()

In [None]:
# (rows, columns) of the condensate table.
P_Cond.shape

In [None]:
# Well/period combinations with no record are missing after the pivot; treat them as 0.
P_Cond.fillna(0, inplace = True)
P_Cond.head()

In [None]:
# Cumulative condensate per well: EPAssetsId followed by a left-to-right
# running total of the monthly condensate columns.
# The month columns are taken from P_Cond itself; this replaces the
# condensate-specific hard-coded start label ('2015-02-28') and range(2, 61),
# so the cell uses the same code as the other fluids whatever periods are present.
month_cols = [c for c in P_Cond.columns if c != 'EPAssetsId']

P_Cum_Cond = pd.concat(
    [P_Cond[['EPAssetsId']], P_Cond[month_cols].cumsum(axis=1)],
    axis=1,
)

P_Cum_Cond.head()

## Get Time Series for Water

In [None]:
# Wide table of water production: one row per EPAssetsId, one column per
# ProdPeriod, filled from the Volume field of the 'Water Production (Bbls)' records.
P_Water = (
    Prodn.loc[Prodn['ProdType'] == 'Water Production (Bbls)']
    .pivot(index='EPAssetsId', columns='ProdPeriod', values='Volume')
    .reset_index()
)

P_Water.head()

In [None]:
# (rows, columns) of the water table.
P_Water.shape

In [None]:
# Well/period combinations with no record are missing after the pivot; treat them as 0.
P_Water.fillna(0, inplace = True)
P_Water.head()

In [None]:
# Cumulative water per well: EPAssetsId followed by a left-to-right running
# total of the monthly water columns.
# The month columns are taken from P_Water itself instead of a hard-coded first
# label and range(2, 62), so the cell works for any number of reporting periods.
month_cols = [c for c in P_Water.columns if c != 'EPAssetsId']

P_Cum_Water = pd.concat(
    [P_Water[['EPAssetsId']], P_Water[month_cols].cumsum(axis=1)],
    axis=1,
)

P_Cum_Water.head()

## Save Monthly and Cumulative Production for Gas, Oil, Condensate & Water

In [None]:
# Write the monthly and cumulative gas tables to CSV.
P_Gas.to_csv('GasProd_Time_Series.csv')
P_Cum_Gas.to_csv('Cum_GasProd_Time_Series.csv')

In [None]:
# Write the monthly and cumulative oil tables to CSV.
P_Oil.to_csv('OilProd_Time_Series.csv')
P_Cum_Oil.to_csv('Cum_OilProd_Time_Series.csv')

In [None]:
# Write the monthly and cumulative condensate tables to CSV.
P_Cond.to_csv('CondProd_Time_Series.csv')
P_Cum_Cond.to_csv('Cum_CondProd_Time_Series.csv')

In [None]:
# Write the monthly and cumulative water tables to CSV.
P_Water.to_csv('WaterProd_Time_Series.csv')
P_Cum_Water.to_csv('Cum_WaterProd_Time_Series.csv')

## Check total sum of all volumes - matches  total of original Production file

In [None]:
# Grand total of the cumulative values in the '2020-01-31' column across
# hours, oil, gas, condensate and water, for comparison with the source file.
col = '2020-01-31 00:00:00.000'
S = sum(P_Cum_Hours[col])
S = S + sum(P_Cum_Oil[col])
S = S + sum(P_Cum_Gas[col])
S = S + sum(P_Cum_Cond[col])
S = S + sum(P_Cum_Water[col])
S

## Get Total Production BOE

### There are 10,386 Wells in the Production Data, but only 10,374 have Gas production and 10,196 have Hours recorded.  Fewer instances in Oil, Condensate, & Water

#### So all EPAssetId's are not represented in all subsets.

## Convert Gas to BOE

In [None]:
# Gas expressed as BOE: same layout as P_Gas, every period column scaled by
# 1000/6 (presumably MMcf -> Mcf, then 6 Mcf per BOE — confirm the factor).
P_Gas_BOE = pd.DataFrame(columns=P_Gas.columns)
P_Gas_BOE['EPAssetsId'] = P_Gas['EPAssetsId']
for period in P_Gas.columns:
    if period == 'EPAssetsId':
        continue  # the id column is copied above, not converted
    P_Gas_BOE[period] = P_Gas[period] * 1000 / 6

P_Gas_BOE.head()

In [None]:
# Save Gas as BOE
# Writes the converted monthly gas table to CSV.
P_Gas_BOE.to_csv('P_Gas_as_BOE.csv')

In [None]:
# Stack the oil and condensate tables row-wise into one liquids table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result (rows of P_Oil, then P_Cond, fresh 0..n-1
# index) is the same.
Liq = pd.concat([P_Oil, P_Cond], ignore_index=True)
Liq.head()

In [None]:
# Column dtypes and non-null counts of the stacked liquids table.
Liq.info()

In [None]:
# Stack gas-as-BOE rows under the liquids rows.
# pd.concat replaces DataFrame.append (removed in pandas 2.0) with the same result.
BOE = pd.concat([Liq, P_Gas_BOE], ignore_index=True)

In [None]:
# Total BOE per well and period: the stacked oil, condensate and gas-as-BOE
# rows are summed within each EPAssetsId, which becomes the index of P_BOE.
P_BOE = BOE.groupby('EPAssetsId').sum()
P_BOE.head()

In [None]:
# Replace the EPAssetsId group index with a plain 0..n-1 positional index.
# NOTE(review): the well ids are not kept as a column here, so after this cell
# P_BOE holds only the period columns — confirm that dropping the ids is intended.
P_BOE.index = range(P_BOE.shape[0])
P_BOE.head()

In [None]:
# Column dtypes and non-null counts of the monthly BOE table.
P_BOE.info()

In [None]:
# Save P_BOE by month
# Writes the monthly BOE table to CSV.
P_BOE.to_csv('BOEProd_Time_Series.csv')

### Get BOE as Cumulative BOE

In [None]:
# Cumulative BOE per well: EPAssetsId followed by a left-to-right running total
# of the monthly BOE columns.
#
# P_BOE comes from groupby('EPAssetsId').sum() followed by an index overwrite,
# so it normally has no 'EPAssetsId' column and P_BOE['EPAssetsId'] raised
# KeyError. The ids are recovered from BOE instead: groupby sorts its keys and
# drops missing ones, so the sorted unique non-null ids line up with P_BOE's rows.
# If P_BOE does carry an id column, that column is used directly.
month_cols = [c for c in P_BOE.columns if c != 'EPAssetsId']
if 'EPAssetsId' in P_BOE.columns:
    well_ids = P_BOE['EPAssetsId']
else:
    # Raises ValueError if the number of ids does not match P_BOE's row count.
    well_ids = pd.Series(
        np.sort(BOE['EPAssetsId'].dropna().unique()),
        index=P_BOE.index,
    )

P_Cum_BOE = pd.concat(
    [well_ids.rename('EPAssetsId'), P_BOE[month_cols].cumsum(axis=1)],
    axis=1,
)

P_Cum_BOE.head()

In [None]:
# Preview the last rows of the cumulative BOE table.
P_Cum_BOE.tail()

In [None]:
#Save Cumulative BOE 
# Writes the cumulative BOE table to CSV.
P_Cum_BOE.to_csv('Cum_BOEProd_Time_Series.csv')

## Get Inferred Completion Month from Cumulative Production for missing date for Wells Without Hours

In [None]:
# Production indicator per well and period, derived from cumulative BOE:
# the first period is True when cumulative BOE is already above 0, and each
# later period is True when cumulative BOE grew relative to the period before.
P_Comp_Flag_by_BOE = pd.DataFrame(columns=P_Cum_BOE.columns)
P_Comp_Flag_by_BOE['EPAssetsId'] = P_Cum_BOE['EPAssetsId']
P_Comp_Flag_by_BOE['2015-01-31 00:00:00.000'] = P_Cum_BOE['2015-01-31 00:00:00.000'] > 0

# Positions 2..61 are hard-coded (assumes EPAssetsId at 0 and the first period at 1).
for position in range(2, 62):
    this_period = P_Cum_BOE.iloc[:, position]
    prior_period = P_Cum_BOE.iloc[:, position - 1]
    P_Comp_Flag_by_BOE.iloc[:, position] = this_period > prior_period

P_Comp_Flag_by_BOE.head()

In [None]:
# Preview the last rows of the production-indicator table.
P_Comp_Flag_by_BOE.tail()

In [None]:

# Turn the True/False production indicators into a running count, in place:
# after this cell each period holds the number of producing periods so far.
# Running the cell a second time would count the already-counted values again.
P_Comp_Flag_by_BOE['2015-01-31 00:00:00.000'] = (P_Comp_Flag_by_BOE['2015-01-31 00:00:00.000'] == True) * 1

for position in range(2, 62):
    produced_now = P_Comp_Flag_by_BOE.iloc[:, position] == True
    count_so_far = P_Comp_Flag_by_BOE.iloc[:, position - 1]
    P_Comp_Flag_by_BOE.iloc[:, position] = produced_now * 1 + count_so_far

P_Comp_Flag_by_BOE.head()

In [None]:
# Preview the last rows of the running production count.
P_Comp_Flag_by_BOE.tail()

In [None]:
# Convert the period where the running production count first reaches 3 into an
# inferred completion month: the period two columns earlier.
#   If a well never reaches 3 producing periods, the effective completion is
#   set to Dec 2020 (the '2020-12-31' sentinel below).
# Only the period columns are searched. Previously the membership test skipped
# the id column but the position lookup did not, so an EPAssetsId equal to 3
# would have been picked up as the match.
CompDates_by_BOE = []
Months = P_Comp_Flag_by_BOE.columns
month_cols = [c for c in Months if c not in ('EPAssetsId', 'Inferred_Comp_Date', 'Last_Prod')]
reached_three = (P_Comp_Flag_by_BOE[month_cols] == 3).to_numpy()
for row_hits in reached_three:
    hits = np.flatnonzero(row_hits)
    if hits.size:
        # hits[0] is the 0-based period index where the count first equals 3.
        CompDates_by_BOE.append(month_cols[max(int(hits[0]) - 2, 0)])
    else:
        CompDates_by_BOE.append('2020-12-31 00:00:00.000')
CompDates_by_BOE[0:5]

In [None]:
# Sanity check: presumably True would mean a look-back landed on the id column
# label instead of a period label.
'EPAssetsId' in CompDates_by_BOE

In [None]:
# Store the BOE-based inferred completion date next to the running counts.
P_Comp_Flag_by_BOE['Inferred_Comp_Date'] = CompDates_by_BOE
P_Comp_Flag_by_BOE.head()

In [None]:
# Preview the last rows, now including Inferred_Comp_Date.
P_Comp_Flag_by_BOE.tail()

In [None]:
# Last month of production per well: the first period at which the running
# production count reaches its maximum, i.e. the last period where cumulative
# BOE increased.
#   If there is no production, last prod'n is set to Dec 2020 (the '2020-12-31'
#   sentinel below).
# Fixes: the no-production branch used to append to CompDates_by_BOE, which left
# LastProd_by_BOE shorter than the table and corrupted the completion-date list.
# The search is also limited to the period columns so the id and date columns
# can never be matched.
LastProd_by_BOE = []
Months = P_Comp_Flag_by_BOE.columns
month_cols = [c for c in Months if c not in ('EPAssetsId', 'Inferred_Comp_Date', 'Last_Prod')]
running_counts = P_Comp_Flag_by_BOE[month_cols].to_numpy(dtype=float)
for row_counts in running_counts:
    if (row_counts == 1).any():
        # argmax returns the first position holding the row maximum.
        LastProd_by_BOE.append(month_cols[int(np.argmax(row_counts))])
    else:
        LastProd_by_BOE.append('2020-12-31 00:00:00.000')
LastProd_by_BOE[0:5]

In [None]:
# Sanity check: presumably True would mean the id column label was picked
# instead of a period label.
'EPAssetsId' in LastProd_by_BOE

In [None]:
# Store the last-production month as a feature column; the list must have one entry per row.
P_Comp_Flag_by_BOE['Last_Prod']=LastProd_by_BOE

In [None]:
# Write the running counts with Inferred_Comp_Date and Last_Prod to CSV.
P_Comp_Flag_by_BOE.to_csv('Prod_by_BOE_Inferred_Completion_Flag.csv')

## Final check  on Volumes

In [None]:
# The per-ProdType totals were already checked against the source file, so this
# cross-checks that oil + gas (scaled by 1000/6) + condensate at the
# '2020-01-31' column equals the cumulative BOE total at that column.

col = '2020-01-31 00:00:00.000'
oil_total = sum(P_Cum_Oil[col])
gas_total = sum(P_Cum_Gas[col])
cond_total = sum(P_Cum_Cond[col])
S = oil_total + gas_total*1000/6 + cond_total
S, sum(P_Cum_BOE[col])

## Volumes match