In [135]:
from bs4 import BeautifulSoup
from zipfile import ZipFile
import re

import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode(connected=True)

import plotly.express as px

In [136]:
# A KMZ file is a zip archive wrapping a KML document. Read the document's raw
# bytes and parse them with BeautifulSoup's XML parser.
with ZipFile('../data/raw/Pozos Productores Activos.kmz', 'r') as kmz:
    member_names = kmz.namelist()
    # The archive may bundle other resources next to the document, so pick the
    # first .kml member instead of assuming it sits at position 0. Fall back to
    # the first member to keep the previous behaviour for unusual archives.
    kml_member = next(
        (name for name in member_names if name.lower().endswith('.kml')),
        member_names[0],
    )
    # ZipFile.read returns the member's bytes without leaving a handle open.
    kml = kmz.read(kml_member)
soup = BeautifulSoup(kml, 'xml')

In [137]:
# Collect one (name, coordinate field 1, coordinate field 0) tuple per placemark.
wells_data = []

for placemark in soup.find_all("kml:Placemark"):
    # The placemark text is split on newlines: the second line holds the well
    # name and the third-from-last line holds the comma-separated coordinates.
    text_lines = placemark.text.split('\n')
    coordinate_fields = text_lines[-3].split(',')
    # NOTE(review): field 1 is stored as "x" and field 0 as "y". If the file
    # follows the usual KML lon,lat ordering, "x" is the latitude — confirm.
    wells_data.append((text_lines[1], coordinate_fields[1], coordinate_fields[0]))

In [138]:
# Tabulate the parsed placemarks and convert both coordinate columns from the
# strings extracted from the KML text to floats.
df_wells_location_data = pd.DataFrame(
    wells_data,
    columns=['well_id', 'coor_x', 'coor_y'],
).astype({'coor_x': float, 'coor_y': float})

In [140]:
# Export the parsed well locations; index=False keeps the positional index out of the CSV.
df_wells_location_data.to_csv('../data/external/well_locations.csv', index=False)

In [5]:
# Inspect the location row of well E-175.
df_wells_location_data.loc[df_wells_location_data['well_id'].eq('E-175')]

Unnamed: 0,well_id,coor_x,coor_y
134,E-175,-45.759404,-67.757817


In [6]:
# Inspect the wells whose ids are numerically adjacent to E-175.
adjacent_ids = ['E-173', 'E-174', 'E-176', 'E-177']
df_wells_location_data.loc[df_wells_location_data['well_id'].isin(adjacent_ids)]

Unnamed: 0,well_id,coor_x,coor_y
132,E-173,-45.759454,-67.760902
133,E-174,-45.754529,-67.730324
135,E-176,-45.756508,-67.757112


In [7]:
# Sum of absolute coordinate differences between every well and E-175. The
# reference values are E-175's coordinates (absolute values) as displayed
# above; the result is in coordinate units, not a ground distance.
e175_x, e175_y = 45.759404, 67.757817
offset_x = (df_wells_location_data['coor_x'].abs() - e175_x).abs()
offset_y = (df_wells_location_data['coor_y'].abs() - e175_y).abs()
df_wells_location_data['diff_e175'] = offset_x + offset_y

In [8]:
# Five wells with the smallest coordinate difference to E-175 (E-175 itself first).
df_wells_location_data.sort_values(by='diff_e175').iloc[:5]

Unnamed: 0,well_id,coor_x,coor_y,diff_e175
134,E-175,-45.759404,-67.757817,0.0
156,E-200,-45.759167,-67.756944,0.00111
132,E-173,-45.759454,-67.760902,0.003135
135,E-176,-45.756508,-67.757112,0.003601
82,E-102,-45.760147,-67.754927,0.003633


In [9]:
# Load the raw measurements table. Later cells read its 'pozo', 'zona', 'fecha',
# 'porcentaje agua' and 'produccion petroleo' columns.
df = pd.read_csv('../data/raw/dinamometrias.csv')

In [10]:
# Distinct well names present in the measurements table.
wells_codes = df.pozo.unique()

In [11]:
# Translate measurement well names into the id format of the location table:
# - names containing 'COMBINADA' become first character + first run of digits;
# - every other name has its spaces replaced by hyphens ('E 175' -> 'E-175').
# NOTE(review): the COMBINADA branch yields no hyphen (e.g. 'E175'), unlike the
# 'X-NNN' ids shown in the location table — confirm this is intended.
wells_codes_new = [
    w[0] + re.findall(r'\d+', w)[0] if 'COMBINADA' in w else w.replace(' ', '-')
    for w in wells_codes
]

In [12]:
# Keep only the located wells that also appear in the measurements. The copy
# lets later cells add columns without touching the full location table.
has_measurements = df_wells_location_data['well_id'].isin(wells_codes_new)
df_wells_location_data_filtered = df_wells_location_data.loc[has_measurements].copy()

In [13]:
# For every well with measurements, find up to five other located wells whose
# summed absolute coordinate difference is below 0.003. Each non-empty group is
# printed and stored in `similar_plots` using the measurement naming
# (hyphens replaced by spaces).
compared_wells = []
similar_plots = []

for w in wells_codes_new:
    compared_wells.append(w)
    reference = df_wells_location_data_filtered.loc[
        df_wells_location_data_filtered['well_id'] == w
    ]

    # Wells without a known location cannot be compared.
    if reference.empty:
        continue

    ref_x = reference['coor_x'].abs().values[0]
    ref_y = reference['coor_y'].abs().values[0]

    # The 'distance' column is overwritten on every iteration with the
    # difference to the current reference well.
    df_wells_location_data_filtered['distance'] = (
        (df_wells_location_data_filtered['coor_x'].abs() - ref_x).abs()
        + (df_wells_location_data_filtered['coor_y'].abs() - ref_y).abs()
    )
    neighbours = df_wells_location_data_filtered[
        df_wells_location_data_filtered['distance'] < 0.003
    ].copy()

    # Drop the reference well itself and keep the five closest candidates.
    neighbours = neighbours[neighbours['well_id'] != w].sort_values('distance').head(5)

    # Drop wells already used as a reference, so each pair is reported once.
    neighbours = neighbours[~neighbours['well_id'].isin(compared_wells)]

    if neighbours.empty:
        continue

    print('*' * 20)
    print(w)
    print(neighbours)
    well_group = [w.replace('-', ' ')]
    well_group.extend(name.replace('-', ' ') for name in neighbours['well_id'].values)
    similar_plots.append(well_group)

********************
E-147
    well_id     coor_x     coor_y  diff_e175  distance
104   E-136 -45.756045 -67.774709   0.020251  0.002422
********************
E-180
    well_id     coor_x     coor_y  diff_e175  distance
162    E-92 -45.751038 -67.742041   0.024142  0.002368
********************
E-198
    well_id     coor_x  coor_y  diff_e175  distance
153   E-196 -45.758611 -67.745    0.01361  0.001667
********************
L-83
    well_id     coor_x     coor_y  diff_e175  distance
492    L-73 -45.687935 -67.731783   0.097503  0.002807
505    L-90 -45.690075 -67.733885   0.093261  0.002845
********************
E-176
    well_id  coor_x     coor_y  diff_e175  distance
145   E-188 -45.755 -67.758333    0.00492  0.002729
********************
H-125
    well_id     coor_x     coor_y  diff_e175  distance
276   H-123 -45.713246 -67.731734   0.072241  0.002599


In [14]:
# Show the groups of nearby wells (reference well first, then its neighbours).
similar_plots

[['E 147', 'E 136'],
 ['E 180', 'E 92'],
 ['E 198', 'E 196'],
 ['L 83', 'L 73', 'L 90'],
 ['E 176', 'E 188'],
 ['H 125', 'H 123']]

In [15]:
# Parse the measurement dates so they can be sorted and resampled.
df['fecha'] = pd.to_datetime(df.fecha)
# Convert the water percentage to numbers; unparseable entries become NaN.
df['porcentaje agua'] = pd.to_numeric(df['porcentaje agua'], errors='coerce')

In [121]:
def plot_nearby_wells(wells, plot_production=True):
    """Plot the time series of the given wells, one line per well.

    Reads the module-level ``df`` frame, keeps the rows whose ``pozo`` is in
    ``wells`` and draws 'porcentaje agua' against 'fecha'. When
    ``plot_production`` is true a second figure with 'produccion petroleo'
    is drawn as well.

    Parameters
    ----------
    wells : list of str
        Well names as they appear in ``df.pozo``.
    plot_production : bool, default True
        Whether to also show the oil production figure.
    """
    wells_history = df[df.pozo.isin(wells)].sort_values(['pozo', 'fecha'])
    wells_label = ', '.join(wells)

    # (column to plot, title prefix) for each figure, in display order.
    panels = [('porcentaje agua', 'Porcentaje de agua pozos: ')]
    if plot_production:
        panels.append(('produccion petroleo', 'Producción de petróleo pozos: '))

    for column, title_prefix in panels:
        fig = px.line(wells_history,
                      x='fecha',
                      y=column,
                      color='pozo',
                      title=title_prefix + wells_label)
        fig.show()

In [17]:
# Plot every group of nearby wells, printing a separator line after each group.
for well_group in similar_plots:
    plot_nearby_wells(well_group)
    print('#' * 20)

####################


####################


####################


####################


####################


####################


In [39]:
# Daily mean of the water percentage for well L 73; days without measurements
# are filled by second-order polynomial interpolation.
l73_water = df[df.pozo == 'L 73'].set_index('fecha')['porcentaje agua']
_df = l73_water.resample('1d').mean().interpolate(method='polynomial', order=2)

In [40]:
# Raw measurements of well L 73, in chronological order.
l73_measurements = df[df.pozo == 'L 73'].sort_values(['pozo', 'fecha'])
fig = px.line(
    l73_measurements,
    x='fecha',
    y='porcentaje agua',
    color='pozo',
    title='Porcentaje de agua pozo',
)
fig.show()

# The same series after daily resampling and interpolation, for comparison.
l73_resampled = _df.reset_index()
fig = px.line(
    l73_resampled,
    x='fecha',
    y='porcentaje agua',
    title='Porcentaje de agua pozo (remuestreado)',
)
fig.show()

In [41]:
import numpy as np

In [None]:
# NOTE(review): scratch call left over from trying out np.corrcoef. Its result
# is not used and the cell has no execution count — candidate for removal.
np.corrcoef([])

In [110]:
def get_wells_correlation(_df, well_a, well_b, column):
    """Correlation between two wells' daily series of ``column``.

    For each well the valid measurements are averaged per day, days without
    data are filled with a second-order polynomial interpolation, and
    ``np.corrcoef`` is evaluated over the date range covered by both wells.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a ``pozo`` column (well name), a datetime ``fecha`` column
        and the numeric ``column`` to compare.
    well_a, well_b : str
        Well names as they appear in ``_df.pozo``.
    column : str
        Name of the numeric column to correlate.

    Returns
    -------
    float
        The off-diagonal entry of ``np.corrcoef`` for the overlapping window,
        or ``0`` when either well has fewer than 12 valid measurements or the
        overlapping window is shorter than 360 days.
    """
    def _daily_series(well):
        # Drop missing values before counting and resampling. Interpolation
        # does not fill gaps before the first or after the last valid point,
        # so a NaN at either end would turn the whole correlation into NaN.
        samples = _df.loc[_df.pozo == well, ['fecha', column]].dropna(subset=[column])
        if len(samples) < 12:
            return None
        return (samples.set_index('fecha')[column]
                .resample('1D')
                .mean()
                .interpolate(method='polynomial', order=2))

    series_a = _daily_series(well_a)
    series_b = _daily_series(well_b)
    if series_a is None or series_b is None:
        return 0

    # Window covered by both wells. It is negative-length when the wells never
    # overlap, which the duration check below rejects.
    min_date = max(series_a.index.min(), series_b.index.min())
    max_date = min(series_a.index.max(), series_b.index.max())
    if (max_date - min_date).days < 360:
        return 0

    # Both series sit on the same daily grid, so the slices line up day by day.
    values_a = series_a.loc[min_date:max_date].values
    values_b = series_b.loc[min_date:max_date].values
    if min(len(values_a), len(values_b)) < 12:
        return 0

    corr = np.corrcoef([values_a, values_b])

    return corr[1, 0]

In [111]:
import itertools

In [112]:
# Correlate the water percentage of every pair of wells that share a zone.
wells_correlations = []

for zone in df.zona.unique():
    zone_history = df[df.zona == zone]
    zone_wells = zone_history.pozo.unique()

    for first_well, second_well in itertools.combinations(zone_wells, 2):
        pair_corr = get_wells_correlation(zone_history, first_well, second_well, 'porcentaje agua')
        wells_correlations.append((first_well, second_well, pair_corr))

In [113]:
# One row per well pair: the two well names and their correlation value.
df_correlations = pd.DataFrame(wells_correlations, columns=['well_a', 'well_b', 'wells_corr'])

In [116]:
# Rank pairs by correlation strength regardless of sign and show the top 20.
df_correlations['wells_corr_abs'] = df_correlations['wells_corr'].abs()
df_correlations.sort_values(by='wells_corr_abs', ascending=False).iloc[:20]

Unnamed: 0,well_a,well_b,wells_corr,wells_corr_abs
1199,E 180,F 270,0.856053,0.856053
517,E 209,E 186,-0.834207,0.834207
443,E 189,E 213,-0.820379,0.820379
539,E 223,A 51,-0.77826,0.77826
784,A 50,E 213,-0.766903,0.766903
424,E 147,E 196,0.751395,0.751395
440,E 189,A 50,0.741643,0.741643
833,E 176,E 213,-0.722358,0.722358
859,E 213,E 186,-0.702459,0.702459
446,E 189,E 186,0.696846,0.696846


In [117]:
# Keep the 20 pairs with the largest absolute correlation for plotting.
_df = df_correlations.sort_values('wells_corr_abs', ascending=False).head(20)

In [122]:
# Print each selected pair's correlation and plot the two wells' water
# percentage together (production plot disabled).
for first_well, second_well, pair_corr in zip(_df['well_a'], _df['well_b'], _df['wells_corr']):
    print('Correlación: %.4f' % pair_corr)
    plot_nearby_wells([first_well, second_well], False)

Correlación: 0.8561


Correlación: -0.8342


Correlación: -0.8204


Correlación: -0.7783


Correlación: -0.7669


Correlación: 0.7514


Correlación: 0.7416


Correlación: -0.7224


Correlación: -0.7025


Correlación: 0.6968


Correlación: 0.6949


Correlación: -0.6815


Correlación: 0.6781


Correlación: 0.6753


Correlación: 0.6742


Correlación: 0.6703


Correlación: 0.6654


Correlación: -0.6617


Correlación: 0.6591


Correlación: -0.6590


In [133]:
# Look up the locations of the most correlated pair (E 180 / F 270).
top_pair_ids = ['E-180', 'F-270']
df_wells_location_data.loc[df_wells_location_data['well_id'].isin(top_pair_ids)]

Unnamed: 0,well_id,coor_x,coor_y,diff_e175
138,E-180,-45.752523,-67.742924,0.021774


In [134]:
# List the located wells whose id contains the letter 'F'.
df_wells_location_data.loc[df_wells_location_data['well_id'].str.contains('F')]

Unnamed: 0,well_id,coor_x,coor_y,diff_e175
168,F-104,-45.735203,-67.678743,0.103275
169,F-11,-45.731316,-67.67272,0.113185
170,F-118,-45.736177,-67.67518,0.105864
171,F-120,-45.7408,-67.705715,0.070706
172,F-124,-45.740524,-67.710925,0.065772
173,F-142,-45.738318,-67.675624,0.103279
174,F-151,-45.727563,-67.692471,0.097187
175,F-154,-45.740641,-67.713534,0.063046
176,F-161,-45.74694,-67.705517,0.064764
177,F-163,-45.728474,-67.702061,0.086686


In [128]:
# Display the location table (well id, both coordinates and the E-175 difference).
# NOTE(review): this shows the whole frame; .head() or .sample() would be a lighter preview.
df_wells_location_data

Unnamed: 0,well_id,coor_x,coor_y,diff_e175
0,A-106,-45.771733,-67.675479,0.094667
1,A-113,-45.775784,-67.690876,0.083321
2,A-114,-45.775366,-67.685375,0.088404
3,A-118,-45.772633,-67.688638,0.082408
4,A-119,-45.771097,-67.677427,0.092083
...,...,...,...,...
547,TAY-2009,-45.701417,-67.423651,0.392153
548,TAY-2010,-45.703972,-67.437116,0.376133
549,TAY-2011,-45.703463,-67.434253,0.379505
550,TAY-2012,-45.700984,-67.426889,0.389348
