## DUC DATA GROUP PRODUCTION INFO


### WORKFLOW

    - Import Well Header with completion info
    - Import Well Production Info from Time Series grouping
    - Use merge to add total hours, C5+, gas, oil & water production from cumulative time series files 
    - Add early & late production date features and production features to Wells file
    - Add a production flag
    - Add a production hours flag for wells with production hours (some wells have production and no hours)
    - Record and save wells without production hours; record and save wells with production but without production hours
    - Save Wells dataframe with cumulative production info

In [None]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [None]:
# Read the two primary inputs: the well header table (with completion
# info) and the cumulative BOE production time series.
Cum_BOE = pd.read_csv('Cum_BOEProd_Time_Series.csv')
Wells = pd.read_csv('WellHeader_with_Completions.csv')

In [None]:
# Preview the first rows of the well header table.
Wells.head()

In [None]:
# List the column labels available on Wells.
Wells.columns

In [None]:
# Summarize dtypes and non-null counts per column of Wells.
Wells.info()

In [None]:
# Preview the first rows of the cumulative BOE time series.
Cum_BOE.head()

In [None]:
# (rows, columns) of the cumulative BOE time series.
Cum_BOE.shape

In [None]:
# Load the per-well production / inferred-completion flag table and preview it.
# Later cells read its date-labelled columns plus 'Last_Prod' and
# 'Inferred_Comp_Date' from this file.
P_Comp_Flag_by_BOE = pd.read_csv('Prod_by_BOE_Inferred_Completion_Flag.csv')
P_Comp_Flag_by_BOE.head()

In [None]:
# Load the five cumulative time-series tables (condensate, gas, oil,
# production hours, water) in one pass; the order of the file names
# matches the order of the target names.
Cum_Cond, Cum_Gas, Cum_Oil, Cum_Hours, Cum_Water = map(
    pd.read_csv,
    (
        'Cum_CondProd_Time_Series.csv',
        'Cum_GasProd_Time_Series.csv',
        'Cum_OilProd_Time_Series.csv',
        'Cum_ProdHours_Time_Series.csv',
        'Cum_WaterProd_Time_Series.csv',
    ),
)

In [None]:
# Preview the cumulative condensate series.
Cum_Cond.head()

In [None]:
# Preview the cumulative gas series.
Cum_Gas.head()

In [None]:
# Preview the cumulative oil series.
Cum_Oil.head()

In [None]:
# Preview the cumulative production-hours series.
Cum_Hours.head()

In [None]:
# Preview the cumulative water series.
Cum_Water.head()

In [None]:
# Copy the value in the 2020-01-31 column (presumably the final cumulative
# total - confirm against the time-series export) under a descriptive name,
# then outer-join it onto Wells by EPAssetsId.
Cum_Cond['Condensate Production (Bbls)'] = Cum_Cond['2020-01-31 00:00:00.000']
Wells = Wells.merge(
    Cum_Cond[['EPAssetsId', 'Condensate Production (Bbls)']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Expose the 2020-01-31 gas value under a descriptive column name and
# outer-join it onto Wells by EPAssetsId.
Cum_Gas['Gas Production (MMcf)'] = Cum_Gas['2020-01-31 00:00:00.000']
Wells = Wells.merge(
    Cum_Gas[['EPAssetsId', 'Gas Production (MMcf)']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Expose the 2020-01-31 oil value under a descriptive column name and
# outer-join it onto Wells by EPAssetsId.
Cum_Oil['Oil Production (Bbls)'] = Cum_Oil['2020-01-31 00:00:00.000']
Wells = Wells.merge(
    Cum_Oil[['EPAssetsId', 'Oil Production (Bbls)']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Expose the 2020-01-31 production-hours value under a descriptive column
# name and outer-join it onto Wells by EPAssetsId.
Cum_Hours['Production Hours'] = Cum_Hours['2020-01-31 00:00:00.000']
Wells = Wells.merge(
    Cum_Hours[['EPAssetsId', 'Production Hours']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Expose the 2020-01-31 water value under a descriptive column name and
# outer-join it onto Wells by EPAssetsId.
Cum_Water['Water Production (Bbls)'] = Cum_Water['2020-01-31 00:00:00.000']
Wells = Wells.merge(
    Cum_Water[['EPAssetsId', 'Water Production (Bbls)']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Check that the production columns were attached to Wells.
Wells.head()

In [None]:
# Prod_Flag is 1 where the 2020-01-31 BOE value is positive, else 0.
# Wells absent from Cum_BOE end up with NaN after the outer join.
Cum_BOE['Prod_Flag'] = 1 * (Cum_BOE['2020-01-31 00:00:00.000'] > 0)
Wells = Wells.merge(
    Cum_BOE[['EPAssetsId', 'Prod_Flag']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Sanity check: producing rows in Cum_BOE, wells flagged as producing,
# and wells whose Prod_Flag is still missing.
(
    int((Cum_BOE['2020-01-31 00:00:00.000'] > 0).sum()),
    int((Wells['Prod_Flag'] == 1).sum()),
    int(Wells['Prod_Flag'].isnull().sum()),
)

### Add Prod_Flag for wells with only water production

In [None]:
# Wells with no Prod_Flag after the outer merge are flagged 1 when they
# report positive water production, otherwise 0.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment and is not guaranteed to
# modify Wells (it stops working under pandas Copy-on-Write).  The fill
# values are 0/1 integers so the column does not mix booleans with 1/0.
Wells['Prod_Flag'] = Wells['Prod_Flag'].fillna(1 * (Wells['Water Production (Bbls)'] > 0))

In [None]:
# Same three counts as before the fill: producing rows in Cum_BOE, wells
# flagged as producing, and wells with Prod_Flag still missing.
(
    int((Cum_BOE['2020-01-31 00:00:00.000'] > 0).sum()),
    int((Wells['Prod_Flag'] == 1).sum()),
    int(Wells['Prod_Flag'].isnull().sum()),
)

### Add Hours_Flag to indicate wells that have hours in the production information and wells that do not.

In [None]:
# Hours_Flag is 1 where the 2020-01-31 production-hours value is positive,
# else 0; wells absent from Cum_Hours get NaN from the outer join.
Cum_Hours['Hours_Flag'] = 1 * (Cum_Hours['2020-01-31 00:00:00.000'] > 0)
Wells = Wells.merge(
    Cum_Hours[['EPAssetsId', 'Hours_Flag']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

### Add First and Last Production Dates

In [None]:
# Drop Unnamed column if needed
#P_Comp_Flag_by_BOE.drop(['Unnamed: 0'], axis = 1, inplace = True)


# Convert the status flags to the first month of production: the label of
# the first column (within columns 1..61) whose value is 1.
#   If there is no production, First_Prod is set to the placeholder
#   2020-12-31 (after the last data column used elsewhere, 2020-01-31).
# The position is looked up in the same slice that is tested for membership.
# Searching the whole row instead would return the label of column 0
# whenever that column (an id or an 'Unnamed: 0' index) happens to equal 1.
FirstProd_by_BOE = []
Months = P_Comp_Flag_by_BOE.columns
for row in range(len(P_Comp_Flag_by_BOE)):
    month_flags = list(P_Comp_Flag_by_BOE.iloc[row, 1:62])
    if 1 in month_flags:
        # +1 because month_flags starts at column position 1
        FirstProd_by_BOE.append(Months[1 + month_flags.index(1)])
    else:
        FirstProd_by_BOE.append('2020-12-31 00:00:00.000')
FirstProd_by_BOE[0:5]

In [None]:
# Store the per-row first-production label as a new column (row order of
# the list matches the row order of P_Comp_Flag_by_BOE) and preview it.
P_Comp_Flag_by_BOE['First_Prod'] = FirstProd_by_BOE
P_Comp_Flag_by_BOE.head()

In [None]:
# Attach First_Prod to Wells by EPAssetsId (outer join, original row order).
Wells = Wells.merge(
    P_Comp_Flag_by_BOE[['EPAssetsId', 'First_Prod']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Attach Last_Prod to Wells by EPAssetsId.  Last_Prod is taken as-is from
# the flag table; it is not computed in the visible cells of this notebook.
Wells = Wells.merge(
    P_Comp_Flag_by_BOE[['EPAssetsId', 'Last_Prod']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Check that First_Prod and Last_Prod were attached to Wells.
Wells.head()

### Add Profile Label 

In [None]:
# Encode the WellProfile labels as integer codes stored in Profile_C.
# The fitted encoder is kept in encoder_Profile so the codes can be mapped
# back to labels later.
from sklearn.preprocessing import LabelEncoder

encoder_Profile = LabelEncoder().fit(Wells['WellProfile'])
Wells['Profile_C'] = encoder_Profile.transform(Wells['WellProfile'])

### Add a Flag for Wells with production but without hours - HoursEst_Flag. Use it to estimate total hours for average daily BOE

In [None]:
# Wells missing from the hours table count as "no hours" (0).  Assign the
# filled column back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and may not update Wells.
Wells['Hours_Flag'] = Wells['Hours_Flag'].fillna(0)
# HoursEst_Flag is True for wells that produce but have no recorded hours;
# these are the wells whose hours have to be estimated.  '&' is the boolean
# AND that the original bool * bool product was standing in for.
Wells['HoursEst_Flag'] = (Wells['Prod_Flag'] == 1) & (Wells['Hours_Flag'] == 0)
# Number of wells that need an hours estimate.
sum(Wells['HoursEst_Flag']==1)

In [None]:
# Distribution of HoursEst_Flag values across wells.
Wells['HoursEst_Flag'].value_counts()

## Estimate production hours for wells on HoursEst_Flag

In [None]:
# Subset of wells flagged for hours estimation, keeping only the id, the
# recorded hours and the flag itself.
df = Wells.loc[
    Wells['HoursEst_Flag'] == 1,
    ['EPAssetsId', 'Production Hours', 'HoursEst_Flag'],
]
df.head()

In [None]:
# Missing / positive Production Hours counts, first over all wells and then
# over the flagged subset df.
(
    int(Wells['Production Hours'].isnull().sum()),
    int((Wells['Production Hours'] > 0).sum()),
    int(df['Production Hours'].isnull().sum()),
    int((df['Production Hours'] > 0).sum()),
)

In [None]:
# Estimate production hours from the value in the 2020-01-31 column:
# 700 hours per unit, minus 350.
# NOTE(review): presumably that column is a running count of producing
# months and 350 h approximates a half first month - confirm against the
# file that generates Prod_by_BOE_Inferred_Completion_Flag.csv.
# NOTE(review): a value of 0 yields -350 hours; check whether such rows can
# reach the Production Hours fill that uses this estimate.
P_Comp_Flag_by_BOE['Inferred_Hours_Est'] = P_Comp_Flag_by_BOE['2020-01-31 00:00:00.000']*700-350
P_Comp_Flag_by_BOE.head()

In [None]:
# Attach the estimated hours to Wells by EPAssetsId.
Wells = Wells.merge(
    P_Comp_Flag_by_BOE[['EPAssetsId', 'Inferred_Hours_Est']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# How many wells lack an estimate, and how many have a positive one.
(
    int(Wells['Inferred_Hours_Est'].isnull().sum()),
    int((Wells['Inferred_Hours_Est'] > 0).sum()),
)

In [None]:
# Use the estimated hours only where no recorded hours exist.  The filled
# column is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is chained assignment and may leave Wells
# unchanged.
Wells['Production Hours'] = Wells['Production Hours'].fillna(Wells['Inferred_Hours_Est'])

In [None]:
# After the fill: wells still missing hours, wells with positive hours,
# and wells with non-negative hours (the difference between the last two
# counts is the number of wells with exactly zero hours).
(
    int(Wells['Production Hours'].isnull().sum()),
    int((Wells['Production Hours'] > 0).sum()),
    int((Wells['Production Hours'] >= 0).sum()),
)

###  Added production hours to 190 wells that were missing hours

## Add Cumulative BOE and Daily Average BOE

In [None]:
# Expose the 2020-01-31 BOE value as TotalProd(BOE) and outer-join it onto
# Wells by EPAssetsId.
Cum_BOE['TotalProd(BOE)'] = Cum_BOE['2020-01-31 00:00:00.000']
Wells = Wells.merge(
    Cum_BOE[['EPAssetsId', 'TotalProd(BOE)']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Average daily rate: total BOE divided by producing days (hours / 24).
# NOTE(review): rows with zero Production Hours produce inf (or NaN when
# the total is also zero) - confirm downstream consumers handle that.
Wells['DailyAvgProd(BOEpd)'] = 24 * Wells['TotalProd(BOE)'] / Wells['Production Hours']

## Get Days Since Last Production

In [None]:
# Convert Date Features in Wells to datetime
date_cols = ['SpudDate', 'FinalDrillDate','RigReleaseDate','StatusDate', 'Early_Comp', 'Late_Comp', 'First_Prod', 'Last_Prod']
for col in date_cols:
    Wells[col] = pd.to_datetime(Wells[col], infer_datetime_format=True)

In [None]:
# Elapsed time between the reference date 2020-02-01 and each well's last
# production date.  Requires Last_Prod to already be datetime; missing
# Last_Prod yields NaT.
Wells['DaysSinceProdn'] = pd.to_datetime('2020-02-01 00:00:00.000') -  Wells['Last_Prod']

In [None]:
# Wells with no last-production date get a zero-length timedelta so the
# column can be converted to integer days.  pd.Timedelta(0) is the same
# value the original built by subtracting a timestamp from itself.  The
# result is assigned back because fillna(inplace=True) on a column
# selection is chained assignment and may not update Wells.
Wells['DaysSinceProdn'] = Wells['DaysSinceProdn'].fillna(pd.Timedelta(0))

In [None]:
# Reduce the timedelta to whole days stored as int16 (max 32767 days).
# The integer cast fails if any NaT is still present, so the missing values
# must have been filled beforehand.
Wells['DaysSinceProdn'] = Wells['DaysSinceProdn'].dt.days.astype('int16')

In [None]:
# Number of wells whose DaysSinceProdn is 0 (the fill value for wells with
# no last-production date).
int((Wells['DaysSinceProdn'] == 0).sum())

In [None]:
# Replace the 0 placeholder with -1 so "never produced" is distinguishable
# from a real day count; every other value is kept unchanged.
Wells['DaysSinceProdn'] = Wells['DaysSinceProdn'].map(
    lambda days: days if days != 0 else -1
)

In [None]:
# Verify the replacement: count of remaining zeros and count of -1 values.
(
    int((Wells['DaysSinceProdn'] == 0).sum()),
    int((Wells['DaysSinceProdn'] == -1).sum()),
)

### Get Inferred Completion Flag and Get Inferred Completion Date and enter as Early_Comp and Late_Comp

### Use inferred completion derived from Cumulative Production since there are more wells with production than with hours.

In [None]:
# InferComp_Flag is 1 for wells that produce but have no recorded
# completion, so a completion date can be inferred from production.
# Comp_Flag may still contain NaN at this point (the notebook only fills it
# with 0 in a later cell).  NaN == 0 is False, so without treating missing
# as 0 here those wells would silently get InferComp_Flag = 0.
Wells['InferComp_Flag'] = 1 * ((Wells['Prod_Flag'] == 1) & (Wells['Comp_Flag'].fillna(0) == 0))
# Counts of flagged, unflagged and missing InferComp_Flag values.
sum(Wells['InferComp_Flag']==1), sum(Wells['InferComp_Flag']==0), sum(Wells['InferComp_Flag'].isnull())

In [None]:
# Attach Inferred_Comp_Date (read as-is from the flag table) to Wells by
# EPAssetsId.
Wells = Wells.merge(
    P_Comp_Flag_by_BOE[['EPAssetsId', 'Inferred_Comp_Date']],
    on='EPAssetsId',
    how='outer',
    sort=False,
)

In [None]:
# Parse Inferred_Comp_Date as datetime so it can fill the datetime
# Early_Comp / Late_Comp columns.  infer_datetime_format is omitted because
# it is deprecated and has no effect in pandas 2.x.
Wells['Inferred_Comp_Date'] = pd.to_datetime(Wells['Inferred_Comp_Date'])

In [None]:
# Where no completion date was recorded, fall back to the inferred one for
# both the early and late completion dates.  Each filled column is assigned
# back: fillna(inplace=True) on a column selection is chained assignment
# and may leave Wells unchanged.
Wells['Early_Comp'] = Wells['Early_Comp'].fillna(Wells['Inferred_Comp_Date'])
Wells['Late_Comp'] = Wells['Late_Comp'].fillna(Wells['Inferred_Comp_Date'])

## Suspect there are 2 wells with only water production

In [None]:
# Wells that report positive production hours but have no last-production
# date, shown with the columns needed to judge whether they are water-only.
Wells.loc[
    (Wells['Production Hours'] > 0) & Wells['Last_Prod'].isnull(),
    [
        'EPAssetsId',
        'Formation',
        'Field',
        'SpudDate',
        'Production Hours',
        'First_Prod',
        'Water Production (Bbls)',
        'TotalProd(BOE)',
        'DailyAvgProd(BOEpd)',
    ],
].head()

## Check Other Flags

In [None]:
# Wells with a completion flag of 1, and wells whose flag is missing.
(
    int((Wells['Comp_Flag'] == 1).sum()),
    int(Wells['Comp_Flag'].isnull().sum()),
)

In [None]:
# Missing completion flag means "no completion recorded" (0).  Assign the
# filled column back; fillna(inplace=True) on a column selection is chained
# assignment and may not update Wells.
Wells['Comp_Flag'] = Wells['Comp_Flag'].fillna(0)

In [None]:
# Hours_Flag value counts for 0, 1, -1 and missing.
(
    int((Wells['Hours_Flag'] == 0).sum()),
    int((Wells['Hours_Flag'] == 1).sum()),
    int((Wells['Hours_Flag'] == -1).sum()),
    int(Wells['Hours_Flag'].isnull().sum()),
)

In [None]:
# HoursEst_Flag value counts for 0, 1, -1 and missing.
(
    int((Wells['HoursEst_Flag'] == 0).sum()),
    int((Wells['HoursEst_Flag'] == 1).sum()),
    int((Wells['HoursEst_Flag'] == -1).sum()),
    int(Wells['HoursEst_Flag'].isnull().sum()),
)

In [None]:
# Prod_Flag value counts for 0, 1, -1 and missing.
(
    int((Wells['Prod_Flag'] == 0).sum()),
    int((Wells['Prod_Flag'] == 1).sum()),
    int((Wells['Prod_Flag'] == -1).sum()),
    int(Wells['Prod_Flag'].isnull().sum()),
)

## Reset Hours Flag  so:
#### Flag is: 1 for Well Hours in WellProduction (10386 EPAssetsId's), 0 for No Info, but Production, -1 for No Hours and No Production

In [None]:
# Recode Hours_Flag into three states:
#    1 -> producing and hours were not estimated (HoursEst_Flag == 0)
#   -1 -> not producing (HoursEst_Flag is 0 for these wells)
#    0 -> producing but hours had to be estimated (HoursEst_Flag == 1)
# The bracketed term is +1 / -1 by Prod_Flag; multiplying by the
# HoursEst_Flag == 0 mask zeroes it for wells with estimated hours.
Wells['Hours_Flag'] = (Wells['HoursEst_Flag'] == 0) * (
    1 * (Wells['Prod_Flag'] == 1) - 1 * (Wells['Prod_Flag'] == 0)
)

In [None]:
# Hours_Flag value counts for 0, 1, -1 and missing after the recode.
(
    int((Wells['Hours_Flag'] == 0).sum()),
    int((Wells['Hours_Flag'] == 1).sum()),
    int((Wells['Hours_Flag'] == -1).sum()),
    int(Wells['Hours_Flag'].isnull().sum()),
)

In [None]:
# Preview Wells with all production features and flags attached.
Wells.head()

In [None]:
# Non-null counts per column, used to spot the remaining gaps in Wells.
Wells.info()

### Fill Days Drilling NaN's 

In [None]:
# Fill missing DaysDrilling with the whole days between SpudDate and
# FinalDrillDate (both already datetime).  .dt.days replaces the
# per-element lambda and gives NaN where either date is missing.  The
# filled column is assigned back because fillna(inplace=True) on a column
# selection is chained assignment and may not update Wells.
Wells['DaysDrilling'] = Wells['DaysDrilling'].fillna(
    (Wells['FinalDrillDate'] - Wells['SpudDate']).dt.days.astype(float)
)

In [None]:
# Re-check non-null counts after filling DaysDrilling.
Wells.info()

In [None]:
# Number of Wells Missing Completion Info that have Production
sum(Wells['Prod_Flag'][Wells['Comp_Flag']==0])

In [None]:
# Number of Wells with Production that are Missing Hours from the Production data
sum(Wells['Prod_Flag'][Wells['Hours_Flag']==0])

In [None]:
# Number of Wells with Production Hours over 1400 that are Missing Completion Information
sum(Wells['Prod_Flag'][Wells['Production Hours']>1400][Wells['Comp_Flag']==0])