In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
import datetime

pd.plotting.register_matplotlib_converters()

%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = [12, 6]

In [5]:
data_path = "../../data/origineel"
if not Path(data_path).is_dir():
    raise Warning("Data path does not exist")

In [6]:
# Lijst van .csv-bestanden
data_file_paths = [path for path in Path(data_path).iterdir() if path.suffix==".csv"]

# Geef het circuitnummer (4 cijfers) dat in de naam van een Path staat
circuitnum_from_file_path = lambda file_path: int(file_path.name.split('-')[0])

# Drie dictionaries, met als keys de circuitnummers (int), als value de Pandas DataFrame van de .csv.
cable_config       = {circuitnum_from_file_path(fp): pd.read_csv(fp, sep=";") for fp in data_file_paths if 'cableconfig' in fp.name}
partial_discharges = {circuitnum_from_file_path(fp): pd.read_csv(fp, sep=";") for fp in data_file_paths if 'pd' in fp.name}
warning            = {circuitnum_from_file_path(fp): pd.read_csv(fp, sep=";") for fp in data_file_paths if 'warning' in fp.name}

# Sla de keys op van alle kabels waarvoor PD data bestaat.
circuits = list(cable_config)
circuits.sort()
print(circuits)

# Kies een circuit
circuit = circuits[1]
cc = cable_config[circuit]
pardis = partial_discharges[circuit]

[1512, 2063, 2145, 2806, 2870, 2979, 2980, 3010]


In [7]:
# voor het gemak: de kolomnamen
pardis.columns
# je zou deze namen ook kunnen opslaan, maar ik heb ervoor gekozen om
# de daadwerkelijke kolomnaam-string te gebruiken
# opslaan is waarschijnlijk beter aangezien namen kunnen veranderen

Index(['Date/time (UTC)', 'Location in meters (m)', 'Charge (picocoulomb)'], dtype='object')

In [8]:
# print een kolom van de tabel
pardis["Date/time (UTC)"] # Blijkbaar is print() hiervoor niet nodig,
print(pardis["Location in meters (m)"]) # maar het kan wel

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
          ..
368981   NaN
368982   NaN
368983   NaN
368984   NaN
368985   NaN
368986   NaN
368987   NaN
368988   NaN
368989   NaN
368990   NaN
368991   NaN
368992   NaN
368993   NaN
368994   NaN
368995   NaN
368996   NaN
368997   NaN
368998   NaN
368999   NaN
369000   NaN
369001   NaN
369002   NaN
369003   NaN
369004   NaN
369005   NaN
369006   NaN
369007   NaN
369008   NaN
369009   NaN
369010   NaN
Name: Location in meters (m), Length: 369011, dtype: float64


In [9]:
# print twee van de drie kolommen
pardis.loc[:, ["Date/time (UTC)", "Location in meters (m)"]]

Unnamed: 0,Date/time (UTC),Location in meters (m)
0,2017-08-21 14:39:00,
1,2017-08-21 14:40:00,
2,2017-08-21 14:41:00,
3,2017-08-21 14:42:00,
4,2017-08-21 14:43:00,
5,2017-08-21 14:44:00,
6,2017-08-21 14:45:00,
7,2017-08-21 14:46:00,
8,2017-08-21 14:47:00,
9,2017-08-21 14:48:00,


In [10]:
# Begintijd en eindtijd
pardis["Date/time (UTC)"][pardis.index[0]] # begintijd
pardis["Date/time (UTC)"][pardis.index[-1]] # eindtijd
# -1 want python arrays werken modulo het aantal elementen
# Deze oplossing maakt ervan gebruik dat de data op tijd gesorteerd is

'2018-11-13 11:28:00'

In [11]:
# Gemiddelde lading
pardis["Charge (picocoulomb)"].mean()

6755.31446811563

In [12]:
# Gemiddelde lading voor pds met lading > 2000
high_charge = pardis["Charge (picocoulomb)"] > 2000 # Bepaal waar die ladingen zijn
pardis["Charge (picocoulomb)"][high_charge].mean() # Bereken het gemiddelde
# Dit kan op 1 regel, maar op 2 vind ik overzichtelijker

7733.465520134228

In [13]:
# Print 10 willekeurige rijen
pardis.iloc[np.random.randint(pardis.index[-1], size = 10)]

Unnamed: 0,Date/time (UTC),Location in meters (m),Charge (picocoulomb)
164546,2018-04-03 06:24:00,,
198253,2018-05-09 13:23:00,,
110467,2018-02-05 07:47:00,,
195028,2018-05-06 07:13:00,,
21786,2017-09-18 14:26:00,,
325244,2018-09-22 12:18:00,,
190867,2018-05-02 00:02:00,,
277247,2018-07-27 07:11:00,,
145425,2018-03-13 21:07:00,349.006593,7898.5
341507,2018-10-11 11:41:00,,


In [21]:
# Totale lading van alle pds in september tussen 100m en 200m
loc = [100 < place < 200 for place in pardis["Location in meters (m)"]]
times = pardis["Date/time (UTC)"].apply(convert_times)
convert_times = lambda date_string: datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
september = [dt.month == 9 for dt in times]
wanted_rows = np.logical_and(loc, september)
pardis["Charge (picocoulomb)"][wanted_rows].sum()

605684.5

In [20]:
# Gemiddelde afstand tussen moffen
joints = cc["Component type"].apply(lambda x: "Joint" in x)
print(np.diff(cc["Cumulative length (m)"][joints]))

[ 16.  84. 242.   9. 240.  15. 240.   3. 244. 242.  26.   5. 146.   6.
 146.  17.  49.  97. 189.  15.  73.  12. 166.]


In [17]:
# Totale lading PDs tussen 18:00 en 21:00 en tussen 02:00 en 05:00
convert_times = lambda date_string: datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
times = pardis["Date/time (UTC)"].apply(convert_times)
late = [dt.hour > 18 and dt.hour < 21 for dt in times]
early = [dt.hour > 2 and dt.hour < 5 for dt in times]
print(pardis["Charge (picocoulomb)"][late].sum())
print(pardis["Charge (picocoulomb)"][early].sum())

5355956.5
12147171.0
