## Setup 

In [1]:
%%capture
!pip install kmapper

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn

## Cargando datos (DB Actualizada)

In [3]:
url = 'https://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices'

# Read the data into a Pandas DataFrame
data = pd.read_csv(url, header=0, delim_whitespace=True)
# Print the first few rows of the data
print(data.head())

     YR  MON  NINO1+2  ANOM  NINO3  ANOM.1  NINO4  ANOM.2  NINO3.4  ANOM.3
0  1982    1    24.28 -0.24  25.84    0.17  28.01   -0.21    26.65    0.08
1  1982    2    25.38 -0.72  26.26   -0.11  27.99   -0.11    26.54   -0.20
2  1982    3    25.22 -1.38  26.92   -0.25  28.18   -0.05    27.09   -0.14
3  1982    4    24.57 -1.16  27.52   -0.05  28.61    0.10    27.83    0.02
4  1982    5    24.00 -0.62  27.70    0.49  29.19    0.40    28.37    0.49


In [4]:
datetime_col = data[["YR", "MON"]]
datetime_col.columns = ["year", "month"]
data['Date'] = pd.to_datetime(datetime_col.assign(DAY=1))

data = data[["Date", "YR", "MON", "NINO3.4", "ANOM.3"]]
data.head()

Unnamed: 0,Date,YR,MON,NINO3.4,ANOM.3
0,1982-01-01,1982,1,26.65,0.08
1,1982-02-01,1982,2,26.54,-0.2
2,1982-03-01,1982,3,27.09,-0.14
3,1982-04-01,1982,4,27.83,0.02
4,1982-05-01,1982,5,28.37,0.49


## Mapper

In [5]:
import kmapper as km

# Lluvia California Dataset

In [6]:
rain_ca = pd.read_csv("/content/drive/Shareddrives/Reto Topologia/data/rain_cali_processed.csv")

In [7]:
rain_ca["Date"] = pd.to_datetime(rain_ca["Date"])

In [8]:
rain_ca

Unnamed: 0,Date,Value
0,1895-01-01,9.25
1,1895-02-01,2.56
2,1895-03-01,2.52
3,1895-04-01,1.25
4,1895-05-01,1.41
...,...,...
1535,2022-12-01,6.41
1536,2023-01-01,8.25
1537,2023-02-01,3.38
1538,2023-03-01,7.11


In [9]:
# Slice para filtrar a las mismas fechas que existen en los datos de ENSO.
dates_to_filter = data['Date']
dates_list = dates_to_filter.to_list()
rain_ca = rain_ca[rain_ca['Date'].isin(dates_list)]
rain_ca

Unnamed: 0,Date,Value
1044,1982-01-01,4.24
1045,1982-02-01,2.95
1046,1982-03-01,5.78
1047,1982-04-01,3.64
1048,1982-05-01,0.22
...,...,...
1535,2022-12-01,6.41
1536,2023-01-01,8.25
1537,2023-02-01,3.38
1538,2023-03-01,7.11


In [10]:
# Probar con distintos shifts para la variable lluvia
mean_rain_ca = rain_ca["Value"].mean()
shifted_rain_ca = rain_ca["Value"].shift(4)
shifted_rain_ca = shifted_rain_ca.fillna(mean_rain_ca)

In [11]:
shifted_rain_ca

1044    1.850524
1045    1.850524
1046    1.850524
1047    1.850524
1048    4.240000
          ...   
1535    0.400000
1536    0.830000
1537    0.170000
1538    2.000000
1539    6.410000
Name: Value, Length: 496, dtype: float64

In [12]:
# Utilizados para la función de color en Mapper
year = data["YR"]
month = data["MON"]
temp = data["NINO3.4"]

In [13]:
data_mapper = data[["NINO3.4", "ANOM.3"]]
data_mapper

Unnamed: 0,NINO3.4,ANOM.3
0,26.65,0.08
1,26.54,-0.20
2,27.09,-0.14
3,27.83,0.02
4,28.37,0.49
...,...,...
491,25.81,-0.84
492,25.88,-0.69
493,26.29,-0.44
494,27.23,-0.01


In [14]:
mapper = km.KeplerMapper(verbose=1)
# Proyección en la variable indice ENSO
projected_data = mapper.fit_transform(X=data_mapper.values, projection=[1])

KeplerMapper(verbose=1)
..Composing projection pipeline of length 1:
	Projections: [1]
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (496, 2)

..Projecting data using: [1]

..Scaling with: MinMaxScaler()



In [15]:
covering=km.Cover(n_cubes=15, perc_overlap=0.6)

In [None]:
G = mapper.map(lens=projected_data, 
               X=data_mapper.values,
               clusterer=sklearn.cluster.KMeans(n_clusters=3),
               cover=covering)

In [17]:
PROJ_ENSO_RAIN_PATH = "rain_cubes_15_overlap_0.6_shift_0.html"

mapper.visualize(G, 
                title='ENSO',
                color_values = shifted_rain_ca.values,
                custom_tooltips = shifted_rain_ca.values,
                color_function_name = 'RainFall',
                node_color_function=np.array(['average','std','sum','max','min']),
                path_html=PROJ_ENSO_RAIN_PATH)

Wrote visualization to: rain_cubes_15_overlap_0.6_shift_0.html


'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>ENSO | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-size: 13px;\n  font-weight: 700;\n

## Hyperparam Tuning

In [18]:
from itertools import product

n_cubes_list = list(range(10, 16, 2))
perc_overlaps_list = [0.4, 0.5]
shifts_list = list(range(4,15, 1))

prods = list(product(n_cubes_list, perc_overlaps_list, shifts_list))

In [None]:
# for i,  prod in enumerate(prods):
#     n_cubes = prod[0]
#     perc_overlap = round(prod[1], 2)
#     shift = prod[2]

#     FILE_PATH = f"rain_cubes_{n_cubes}_overlap_{perc_overlap}_shift_{shift}.html"
#     PATH = "/content/drive/Shareddrives/Reto Topologia/mappers/" + FILE_PATH


#     mean_rain_ca = rain_ca["Value"].mean()
#     shifted_rain_ca = rain_ca["Value"].shift(shift)
#     shifted_rain_ca = shifted_rain_ca.fillna(mean_rain_ca)

#     covering=km.Cover(n_cubes=n_cubes, perc_overlap=perc_overlap)


#     G = mapper.map(lens=projected_data, 
#                 X=data_mapper.values,
#                 clusterer=sklearn.cluster.KMeans(n_clusters=3),
#                 cover=covering)

#     mapper.visualize(G,
#                     title='ENSO',
#                     color_values = shifted_rain_ca.values,
#                     custom_tooltips = month.values,
#                     color_function_name = 'RainFall',
#                     node_color_function=np.array(['average','std','sum','max','min']),
#                     path_html=PATH)
    
#     print("Saved ", i+1)

#  Squid Dataset

In [21]:
df_squid = pd.read_csv("/content/drive/Shareddrives/Reto Topologia/data/df_squid.csv")
df_squid

Unnamed: 0,PERIOD,VALUE
0,1950,2720.000
1,1951,5617.000
2,1952,1665.000
3,1953,4045.000
4,1954,3699.000
...,...,...
67,2017,66325.698
68,2018,36044.286
69,2019,28254.318
70,2020,35633.049


In [22]:
enso_squid = pd.merge(data, df_squid, left_on="YR", right_on="PERIOD", how="inner")
enso_squid["VALUE"] = enso_squid["VALUE"] / 12

In [25]:
mapper = km.KeplerMapper(verbose=1)
# Proyección en la variable indice ENSO
X = enso_squid[["NINO3.4", "ANOM.3"]].values
projected_data = mapper.fit_transform(X=X, projection=[1])

KeplerMapper(verbose=1)
..Composing projection pipeline of length 1:
	Projections: [1]
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (480, 2)

..Projecting data using: [1]

..Scaling with: MinMaxScaler()



In [26]:
mean_squid = enso_squid["VALUE"].mean()
shift = 5
shifted_squid = enso_squid["VALUE"].shift(shift)
shifted_squid = shifted_squid.fillna(mean_rain_ca)

In [None]:
covering=km.Cover(n_cubes=15, perc_overlap=0.6)

G = mapper.map(lens=projected_data, 
               X=X,
               clusterer=sklearn.cluster.KMeans(n_clusters=3),
               cover=covering)

In [None]:
PROJ_ENSO_SQUID_PATH = "squid_cubes_15_overlap_0.6_shift_4.html"

mapper.visualize(G, 
                title='ENSO',
                color_values = enso_squid.VALUE.values,
                custom_tooltips = enso_squid.VALUE.values,
                color_function_name = 'Squid',
                node_color_function=np.array(['average','std','sum','max','min']),
                path_html=PROJ_ENSO_SQUID_PATH)

### Hyperparam tuning

In [None]:
# for shift in range(0, 12, 2):

#     n_cubes = 12
#     perc_overlap = 0.6

#     FILE_PATH = f"squid_cubes_{n_cubes}_overlap_{perc_overlap}_shift_{shift}.html"
#     PATH = "/content/drive/Shareddrives/Reto Topologia/mappers/" + FILE_PATH


#     mapper = km.KeplerMapper(verbose=1)
#     # Proyección en la variable indice ENSO
#     X = enso_squid[["NINO3.4", "ANOM.3"]].values
#     projected_data = mapper.fit_transform(X=X, projection=[1])

#     mean_squid = enso_squid["VALUE"].mean()
#     shifted_squid = enso_squid["VALUE"].shift(shift)
#     shifted_squid = shifted_squid.fillna(mean_squid)

#     covering=km.Cover(n_cubes=n_cubes, perc_overlap=perc_overlap)

#     G = mapper.map(lens=projected_data, 
#                 X=X,
#                 clusterer=sklearn.cluster.KMeans(n_clusters=3),
#                 cover=covering)

#     mapper.visualize(G,
#                     title='ENSO',
#                     color_values = shifted_squid.values,
#                     custom_tooltips = shifted_squid.values,
#                     color_function_name = 'Squid',
#                     node_color_function=np.array(['average','std','sum','max','min']),
#                     path_html=PATH)