## DUC DATA CLEAN PRODUCTION INFO OF DUPLICATES (AMENDMENTS)

### WORKFLOW

    - Import Well Production Info
    - Find duplicates
    - Check the hours total by adding the duplicate period entries.  If the total is not greater than 744 (24 hours x 31 days), they are likely amendment entries
    - If amendments, add the oil, gas, C5+, & water volumes for a total and drop the duplicate entry.
   
    

In [None]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [None]:
# Load the raw well production file into a DataFrame.
# Later cells reference the columns EPAssetsId, ProdPeriod, ProdType, Volume
# and WellHeader.Match, so the CSV is expected to provide them.
Prodn = pd.read_csv('WellProduction.csv')

In [None]:
# Preview the first rows of the loaded table
Prodn.head()

In [None]:
# Column dtypes and non-null counts
Prodn.info()

### Get number of wells present in each ProdType

In [None]:
# Distinct wells that have at least one 'Production Hours' row
hours_well_ids = Prodn.loc[Prodn['ProdType'] == 'Production Hours', 'EPAssetsId']
print('Well count with Hours', len(set(hours_well_ids)))

In [None]:
# Distinct wells that have at least one 'Oil Production (Bbls)' row
oil_well_ids = Prodn.loc[Prodn['ProdType'] == 'Oil Production (Bbls)', 'EPAssetsId']
print('Well count with Oil Production', len(set(oil_well_ids)))

In [None]:
# Distinct wells that have at least one 'Gas Production (MMcf)' row
gas_well_ids = Prodn.loc[Prodn['ProdType'] == 'Gas Production (MMcf)', 'EPAssetsId']
print('Well count with Gas Production', len(set(gas_well_ids)))

In [None]:
# Distinct wells that have at least one 'Condensate Production (Bbls)' row
condensate_well_ids = Prodn.loc[Prodn['ProdType'] == 'Condensate Production (Bbls)', 'EPAssetsId']
print('Well count with Condensate Production', len(set(condensate_well_ids)))

In [None]:
# Distinct wells that have at least one 'Water Production (Bbls)' row
water_well_ids = Prodn.loc[Prodn['ProdType'] == 'Water Production (Bbls)', 'EPAssetsId']
print('Well count with Water Production', len(set(water_well_ids)))

## Prodn contains duplicate index entries.  Fix this issue first so the data can be pivoted in the next notebook

In [None]:
# Leave out Volume and WellHeader.Match so that rows are compared only on the
# remaining columns when searching for duplicates
Dups = Prodn.drop(columns=['Volume', 'WellHeader.Match'])

In [None]:
# Flag every row that repeats an earlier row; the first occurrence stays False
repeat_flags = Dups.duplicated(keep='first')
Dups['is_duplicated'] = repeat_flags
Dups.shape

In [None]:
# Copy the duplicate flag onto Prodn; Dups was derived from Prodn, so the
# assignment lines up on the shared row index
Prodn['is_duplicated'] = Dups['is_duplicated']

### Hours

In [None]:
# Preview the flagged rows that carry production hours.
# Both conditions go into one mask; chaining two boolean lookups makes pandas
# re-align the second mask against the already-filtered frame.
Dups[Dups['is_duplicated'] & (Dups['ProdType'] == 'Production Hours')].head()


In [None]:
# Number of flagged rows that carry production hours (single combined mask
# instead of two chained boolean lookups)
len(Dups[Dups['is_duplicated'] & (Dups['ProdType'] == 'Production Hours')])

In [None]:
# Number of flagged rows across all production types
int(Dups['is_duplicated'].sum())

### Examine duplicates for two wells

In [None]:
# All entries for well 1167456 in the period ending 2019-01-31
# (one combined mask rather than two chained boolean lookups)
Prodn[(Prodn['EPAssetsId'] == 1167456) & (Prodn['ProdPeriod'] == '2019-01-31 00:00:00.000')]

In [None]:
# All entries for well 1169598 in the period ending 2019-08-31
# (one combined mask rather than two chained boolean lookups)
Prodn[(Prodn['EPAssetsId'] == 1169598) & (Prodn['ProdPeriod'] == '2019-08-31 00:00:00.000')]

### Work with a smaller df of the duplicated hours to verify that, when duplicates are merged, the hours do not exceed 744 in a month.

In [None]:
# Flagged rows for production hours only.
# .copy() makes H_Dup an independent frame, so the columns added to it in
# later cells are written to H_Dup itself and not to a slice of Dups.
H_Dup = Dups[Dups['is_duplicated'] & (Dups['ProdType'] == 'Production Hours')].copy()

In [None]:
# Add the three hour columns as zero-filled placeholders
for hours_col in ('Dup_Hours', 'Non_Dup_Hours', 'All_Hours'):
    H_Dup[hours_col] = 0

In [None]:
# Preview H_Dup with the placeholder hour columns
H_Dup.head()

In [None]:
# For every flagged hours row, fetch the hours recorded on the flagged entry
# and on the first (unflagged) entry with the same EPAssetsId, ProdPeriod and
# ProdType, then add the two.
# Keyed lookups replace filtering the whole of Prodn several times per row,
# and columns are addressed by name instead of by position.
key_cols = ['EPAssetsId', 'ProdPeriod', 'ProdType']
flagged = Prodn['is_duplicated']

# Hours on flagged entries, totalled per key so that a key with more than one
# flagged entry is handled too; min_count=1 keeps an all-NaN key as NaN.
flagged_by_key = Prodn[flagged].groupby(key_cols)['Volume'].sum(min_count=1)
# Hours on the first entry of each key (unflagged rows are unique per key)
first_by_key = Prodn[~flagged].set_index(key_cols)['Volume']

row_keys = pd.MultiIndex.from_frame(H_Dup[key_cols])
dup_hours = flagged_by_key.reindex(row_keys).to_numpy(dtype=float)
hours = first_by_key.reindex(row_keys).to_numpy(dtype=float)

H_Dup['Dup_Hours'] = dup_hours
H_Dup['Non_Dup_Hours'] = hours
H_Dup['All_Hours'] = H_Dup['Dup_Hours'] + H_Dup['Non_Dup_Hours']


In [None]:
# Preview H_Dup with the hour columns filled in
H_Dup.head()

In [None]:
# (rows, columns) of the flagged hours table
H_Dup.shape

In [None]:
# Line plot of the combined hours for each flagged row
fig, ax = plt.subplots()
ax.plot(H_Dup['All_Hours'])
plt.show()

In [None]:
# Largest combined hours value among the flagged rows
max(H_Dup['All_Hours'])

### It appears the hours are not double entries but amendments.  Still, two entries for hours or production for a well in a month will throw an error on a pivot

In [None]:
# Column dtypes and non-null counts of the flagged hours table
H_Dup.info()

## Now get duplicates for all volumes & hours.

In [None]:
# Every flagged row, for all production types.
# .copy() makes All_Dup an independent frame, so the volume columns added to
# it in a later cell are written to All_Dup itself and not to a slice of Dups.
All_Dup = Dups[Dups['is_duplicated']].copy()

In [None]:
# Column dtypes and non-null counts of the flagged rows
All_Dup.info()

In [None]:
# Number of flagged rows per production type
All_Dup['ProdType'].value_counts()

In [None]:
# For every flagged row, fetch the volume recorded on the flagged entry and on
# the first (unflagged) entry with the same EPAssetsId, ProdPeriod and
# ProdType, then add the two.
# Keyed lookups replace filtering the whole of Prodn several times per row,
# and columns are addressed by name instead of by position.
key_cols = ['EPAssetsId', 'ProdPeriod', 'ProdType']
flagged = Prodn['is_duplicated']

# Volume on flagged entries, totalled per key so that a key with more than one
# flagged entry is handled too; min_count=1 keeps an all-NaN key as NaN.
flagged_by_key = Prodn[flagged].groupby(key_cols)['Volume'].sum(min_count=1)
# Volume on the first entry of each key (unflagged rows are unique per key)
first_by_key = Prodn[~flagged].set_index(key_cols)['Volume']

row_keys = pd.MultiIndex.from_frame(All_Dup[key_cols])
# dup_vols and vols stay defined because a later cell reports their lengths
dup_vols = flagged_by_key.reindex(row_keys).to_numpy(dtype=float)
vols = first_by_key.reindex(row_keys).to_numpy(dtype=float)

All_Dup['Dup_Vols'] = dup_vols
All_Dup['Non_Dup_Vols'] = vols
All_Dup['All_Vols'] = All_Dup['Dup_Vols'] + All_Dup['Non_Dup_Vols']

In [None]:
# The two looked-up volume sequences should be as long as All_Dup
len(dup_vols), len(vols), len(All_Dup)

In [None]:
# Totals over the flagged rows: flagged-entry volume, first-entry volume, combined
sum(All_Dup['Dup_Vols']), sum(All_Dup['Non_Dup_Vols']), sum(All_Dup['All_Vols'])

In [None]:
# Volume totals over Prodn: flagged rows, unflagged rows, and all rows
sum(Prodn['Volume'][Prodn['is_duplicated']==True]), sum(Prodn['Volume'][Prodn['is_duplicated']==False]),sum(Prodn['Volume'])

### The duplicated volume (is_duplicated = True) matches

### Now, replace the volumes in Prodn where is_duplicated = False with the volume summation.

### Then drop is_duplicated = True from Prodn

In [None]:
# Preview the flagged rows in Prodn
Prodn[Prodn['is_duplicated']==True].head()

In [None]:
# Entries for well 1167456 in the period ending 2019-01-31, before the volumes
# are replaced (one combined mask rather than two chained boolean lookups)
Prodn[(Prodn['EPAssetsId'] == 1167456) & (Prodn['ProdPeriod'] == '2019-01-31 00:00:00.000')]

In [None]:
# Preview the flagged rows together with their looked-up volumes
All_Dup.head()

In [None]:
# Take each All_Vols value in All_Dup and write it onto the Prodn row that has
# the matching EPAssetsId, ProdPeriod and ProdType and is_duplicated False.
# Rows are addressed by index label and the column by name; the previous code
# passed an index label to iloc and relied on fixed column positions.
key_cols = ['EPAssetsId', 'ProdPeriod', 'ProdType']
first_rows = Prodn[~Prodn['is_duplicated']]

# Map each key to the index label of its first (unflagged) row
first_label_by_key = pd.Series(
    first_rows.index.to_numpy(),
    index=pd.MultiIndex.from_frame(first_rows[key_cols]),
)
target_labels = first_label_by_key.reindex(pd.MultiIndex.from_frame(All_Dup[key_cols]))
if target_labels.isna().any():
    raise KeyError('Flagged row without a matching unflagged entry in Prodn')

# Values are assigned in row order, one per target label
Prodn.loc[target_labels.to_numpy(), 'Volume'] = All_Dup['All_Vols'].to_numpy(dtype=float)

In [None]:
# Entries for well 1167456 in the period ending 2019-01-31, after the volumes
# were replaced (one combined mask rather than two chained boolean lookups)
Prodn[(Prodn['EPAssetsId'] == 1167456) & (Prodn['ProdPeriod'] == '2019-01-31 00:00:00.000')]

### Now the Total volume or hours is adjacent to the False Flag 

In [None]:
# (rows, columns) of Prodn before the flagged rows are removed
Prodn.shape

In [None]:
# Keep only the unflagged rows.
# .copy() makes Prodn_Clean an independent frame, so dropping columns from it
# later does not operate on a slice of Prodn.
Prodn_Clean = Prodn[~Prodn['is_duplicated']].copy()
Prodn_Clean.shape

In [None]:
# Total Volume of the kept rows, and the number of flagged rows still present
# (the sum of the boolean is_duplicated column)
sum(Prodn_Clean['Volume']), sum(Prodn_Clean['is_duplicated'])

### The total of Volume (hours and volumes) matches the total in Prodn with multiple time period entries (amendments) to 5 decimal places.

In [None]:
# Remove the helper columns and renumber the rows 0..n-1.
# The previous code referenced reset_index without calling it, so the index
# was never reset.  Results are assigned back instead of modified in place.
Prodn_Clean = Prodn_Clean.drop(columns=['WellHeader.Match', 'is_duplicated'])
Prodn_Clean = Prodn_Clean.reset_index(drop=True)
Prodn_Clean.head()

In [None]:
# Save the de-duplicated table; with to_csv defaults the row index is written
# as the first column of the file
Prodn_Clean.to_csv('WellProduction_No_Duplicates.csv')