In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.cluster import DBSCAN

# Render inline matplotlib figures in the high-resolution 'retina' format
%config InlineBackend.figure_format = 'retina'
# Default figure size for every plot in this notebook: 12 wide by 6 high
plt.rcParams['figure.figsize'] = [12, 6]

In [2]:
# Directory (relative to this notebook) that holds the original .csv files
data_path = "../data/origineel"

# Fail early when the data directory is missing, before any file is read
data_dir_found = Path(data_path).is_dir()
if data_dir_found is False:
    raise Warning("Data path does not exist")

In [3]:
# List of .csv files in the data directory.
# Path.iterdir() yields entries in arbitrary, platform-dependent order, so the
# paths are sorted to make positional indexing into this list reproducible
# across machines and runs.
data_file_paths = sorted(p for p in Path(data_path).iterdir() if p.suffix == ".csv")

In [4]:
# Read the three files of the circuit as pandas DataFrames (semicolon-separated).
# NOTE(review): positions 3, 4 and 5 are assumed to be the cable-config,
# partial-discharge and warning files of one circuit -- confirm against the
# actual contents of data_file_paths.
cableconfig, partialdischarges, warning = [
    pd.read_csv(data_file_paths[position], sep=";") for position in (3, 4, 5)
]

In [5]:
# Names of the three columns, in file order: time, location, charge
time_col, location_col, charge_col = partialdischarges.columns

# NaN masks for the two measurement columns, computed once and reused below
location_missing = np.isnan(partialdischarges[location_col])
charge_missing = np.isnan(partialdischarges[charge_col])

# Check that each row has location and charge either both undefined or both
# defined; stop at the first row (by position) that violates this.
inconsistent_rows = np.arange(len(partialdischarges))[location_missing != charge_missing]
if len(inconsistent_rows) > 0:
    i = inconsistent_rows[0]
    raise Warning(f"Row {i} has missing values")

# Boolean mask: True for the timestamps at which a partial discharge (PD) occurred
pd_occured = ~location_missing

# The three columns, restricted to timestamps with a PD
locations = partialdischarges.loc[pd_occured, location_col]
charges = partialdischarges.loc[pd_occured, charge_col]
# Timestamps are stored as ISO-format strings; parse them into datetime objects
times = partialdischarges.loc[pd_occured, time_col].apply(datetime.datetime.fromisoformat)

In [14]:

# Sort the rows by location exactly once, so that the clustering input and the
# rows that receive the labels share one ordering. mergesort is stable, and rows
# without a location are explicitly placed last.
partialdischarges_sorted = (
    partialdischarges
    .sort_values(location_col, kind="mergesort", na_position="last")
    .reset_index(drop=True)
)

# Put the data in the right shape: only rows that have a location are clustered,
# and DBSCAN expects a 2-D array with one row per sample.
sorted_locations = partialdischarges_sorted[location_col].dropna()
data = np.array(sorted_locations)
nice_data = data.reshape(-1, 1)

# Run DBSCAN on the 1-D locations. eps is expressed in the units of the location
# column; points that belong to no cluster get the label -1.
labels = DBSCAN(eps=5, min_samples=800).fit(nice_data).labels_

# Add the DBSCAN labels to the dataframe. The Series carries the index of exactly
# the rows that were clustered, so the assignment does not depend on where the
# location-less rows ended up; those rows get NaN (hence a float label column).
labels_series = pd.Series(labels, index=sorted_locations.index)
partialdischarges_sorted["label"] = labels_series
partialdischarges_sorted

Unnamed: 0,Date/time (UTC),Location in meters (m),Charge (picocoulomb),label
0,2017-08-22 00:25:00,0.0,25141.5,-1.0
1,2018-07-24 02:35:00,0.0,1201.0,-1.0
2,2018-05-14 12:33:00,0.0,11328.0,-1.0
3,2018-07-24 05:47:00,0.0,1387.5,-1.0
4,2018-07-24 06:41:00,0.0,14008.5,-1.0
5,2018-02-18 10:34:00,0.0,15909.5,-1.0
6,2018-07-24 08:16:00,0.0,16675.5,-1.0
7,2018-07-24 21:18:00,0.0,25267.5,-1.0
8,2017-12-06 20:14:00,0.0,14483.5,-1.0
9,2017-12-11 22:01:00,0.0,24609.5,-1.0


In [15]:
# Mean of the numeric columns per DBSCAN label (rows with a NaN label are left
# out by groupby). The timestamp column holds strings, for which a mean is
# undefined: numeric_only=True skips it explicitly instead of relying on pandas
# dropping it silently (pandas >= 2.0 raises a TypeError in that case).
partialdischarges_sorted.groupby("label").mean(numeric_only=True)

Unnamed: 0_level_0,Location in meters (m),Charge (picocoulomb)
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,870.529385,9091.547148
0.0,349.010939,5988.147044
1.0,605.765194,4970.828622
