In [None]:
import math
import os
from urllib.parse import quote

import pandas
import sqlalchemy
print(pandas.__version__)

import geopandas as gpd
import folium
#from folium.plugins import MarkerCluster # for clustering the markers
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
# 
# Show every column when a wide frame is displayed. The fully qualified key is
# required: the abbreviated 'max_columns' also matches 'styler.render.max_columns'
# on newer pandas releases and is rejected as ambiguous.
pandas.set_option('display.max_columns', None)

In [None]:
# Directory holding the SQL query files and the name of the query that is
# executed to load the data.
SQL_PATH = "./sqlCommands/"
SQL_DATA = "loadMastrSolar.sql"

# Full (relative) path of the query file handed to the loader.
SQL_DATA_PATH = f"{SQL_PATH}{SQL_DATA}"

# Parameters to create a connection to the MaStR-postgreSQL DB.
# Every value can be overridden with a MASTR_DB_* environment variable.
# The password deliberately has no default in the notebook: credentials must not
# be committed together with the analysis. Set MASTR_DB_PASSWORD before starting
# the kernel, otherwise the connection attempt fails with an authentication error.
CONN_PARAM_DICT = {
    "host": os.environ.get("MASTR_DB_HOST", "10.0.0.102"),
    "dbname": os.environ.get("MASTR_DB_NAME", "mastr"),
    "user": os.environ.get("MASTR_DB_USER", "uba_user"),
    "password": os.environ.get("MASTR_DB_PASSWORD", ""),
    "port": os.environ.get("MASTR_DB_PORT", "5432"),
}

# Maps the numeric Bundesland codes found in the data to the state names that
# are used as group keys and plot labels.
bundesländer = {
    1400: "Brandenburg",
    1401: "Berlin",
    1402: "Baden-Württemberg",
    1403: "Bayern",
    1404: "Bremen",
    1405: "Hessen",
    1406: "Hamburg",
    1407: "Mecklenburg-Vorpommern",
    1408: "Niedersachsen",
    1409: "Nordrhein-Westfalen",
    1410: "Rheinland-Pfalz",
    1411: "Schleswig-Holstein",
    1412: "Saarland",
    1413: "Sachsen",
    1414: "Sachsen-Anhalt",
    1415: "Thüringen",
}

In [None]:
def build_postgres_conn_string (param:dict) -> str:
    """Build the SQLAlchemy URL for a PostgreSQL database accessed via psycopg2.

    Args:
        param: Mapping with the keys "user", "password", "host", "port" and
            "dbname".

    Returns:
        A URL of the form
        ``postgresql+psycopg2://user:password@host:port/dbname``.
    """
    # User and password are percent-encoded so that characters with a meaning
    # inside a URL ('@', ':', '/', ...) cannot break or redirect the connection.
    user = quote(str(param["user"]), safe="")
    password = quote(str(param["password"]), safe="")
    return f'postgresql+psycopg2://{user}:{password}@{param["host"]}:{param["port"]}/{param["dbname"]}'

def create_postgres_engine (param:dict) -> sqlalchemy.engine.base.Engine:
    """Create a SQLAlchemy engine for the database described by ``param``.

    The connection URL comes from ``build_postgres_conn_string``; the engine is
    created with ``pool_recycle=3600``.
    """
    return sqlalchemy.create_engine(
        build_postgres_conn_string(param),
        pool_recycle=3600,
    )

def read_postgres_from_queryfile (sqlpath:str, postgresLogin:dict) -> pandas.DataFrame:
    """Run the SQL query stored in a file and return the result as a DataFrame.

    Args:
        sqlpath: Path of the text file containing the SQL query.
        postgresLogin: Connection parameters as expected by
            ``create_postgres_engine``.

    Returns:
        The query result loaded with ``pandas.read_sql``.
    """
    # The context manager closes the query file even if reading fails.
    with open(sqlpath, 'r') as scriptFile:
        script = scriptFile.read()

    engine = create_postgres_engine(postgresLogin)
    try:
        df = pandas.read_sql(script, engine)
    finally:
        # The engine is private to this call; release its pooled connections.
        engine.dispose()

    return df


def filter_wrong_bundeslaender(df:pandas.DataFrame) -> pandas.DataFrame:
    """Replace non-string "Bundesland" entries with the name used by their municipality.

    A "Bundesland" value is treated as wrong when it is neither a string nor
    missing. Each wrong value is replaced by the most frequent *string* value
    among the rows that share the same "Gemeindeschluessel". Rows without a
    "Gemeindeschluessel", or whose municipality has no string value at all,
    are left untouched.

    Args:
        df: Frame with the columns "Bundesland" and "Gemeindeschluessel".

    Returns:
        The same frame object; the "Bundesland" column is corrected in place.
    """
    bundesland = df["Bundesland"]
    is_name = bundesland.map(lambda value: isinstance(value, str)).astype(bool)
    is_wrong = ~is_name & bundesland.notna()
    if not is_wrong.any():
        return df

    # Only proper names may win the vote, otherwise a municipality dominated by
    # wrong codes would simply get the wrong code assigned again.
    valid = df.loc[is_name, ["Gemeindeschluessel", "Bundesland"]]

    # One lookup per affected municipality instead of one per affected row.
    for schluessel in df.loc[is_wrong, "Gemeindeschluessel"].dropna().unique():
        candidates = valid.loc[valid["Gemeindeschluessel"] == schluessel, "Bundesland"]
        if candidates.empty:
            continue
        same_gemeinde = df["Gemeindeschluessel"] == schluessel
        df.loc[is_wrong & same_gemeinde, "Bundesland"] = candidates.value_counts().index[0]

    return df

In [None]:
# Load the query result, add a constant counter column ("Anlagenzahl" = 1 per
# row) and translate the values of "Bundesland" through the bundesländer mapping.
dfMastrSolar = (
    read_postgres_from_queryfile(sqlpath=SQL_DATA_PATH, postgresLogin=CONN_PARAM_DICT)
    .assign(Anlagenzahl=1)
    .replace({"Bundesland": bundesländer})
)

dfMastrSolar.columns

In [None]:
# Sum of "Nettonennleistung" per (weekday of "Inbetriebnahmedatum", Bundesland).
inbetriebnahme_wochentag = dfMastrSolar['Inbetriebnahmedatum'].dt.weekday
dfg = (
    filter_wrong_bundeslaender(dfMastrSolar)
    .groupby([inbetriebnahme_wochentag, "Bundesland"])["Nettonennleistung"]
    .sum()
)

# One bar chart per weekday (weekday level moved to the columns), log-scaled y axis.
leistung_je_wochentag = dfg.unstack(level=0)
ax = leistung_je_wochentag.plot(
    kind='bar',
    subplots=True,
    rot=90,
    figsize=(20, 10),
    layout=(2, 4),
    sharey=True,
    logy=True,
)
plt.tight_layout()

In [None]:
# Display the grouped sums computed above (indexed by weekday and Bundesland).
dfg

In [None]:
# German day names keyed by the weekday number as float (0.0 -> "Montag").
wochentage = {
    float(nummer): name
    for nummer, name in enumerate(
        ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"]
    )
}

sns.set_style("whitegrid")

# Sum of "Nettonennleistung" per (weekday of "Inbetriebnahmedatum", Bundesland).
inbetriebnahme_wochentag = dfMastrSolar['Inbetriebnahmedatum'].dt.weekday
dfg = (
    filter_wrong_bundeslaender(dfMastrSolar)
    .groupby([inbetriebnahme_wochentag, "Bundesland"])["Nettonennleistung"]
    .sum()
)

# Long-format table with the weekday numbers replaced by their names.
data = dfg.reset_index().replace({"Inbetriebnahmedatum": wochentage})

# One facet per Bundesland, bars per weekday (linear y axis; log=True is not set here).
g = sns.catplot(
    kind='bar',
    data=data,
    x='Inbetriebnahmedatum',
    y='Nettonennleistung',
    col='Bundesland',
    col_wrap=5,
    palette="Paired",
    height=3,
    aspect=1.2,
)
g.set_xticklabels(rotation=90)
g.set(xlabel=None)

In [None]:
# Sum of "Nettonennleistung" per (weekday of "Inbetriebnahmedatum", Bundesland).
inbetriebnahme_wochentag = dfMastrSolar['Inbetriebnahmedatum'].dt.weekday
dfg = (
    filter_wrong_bundeslaender(dfMastrSolar)
    .groupby([inbetriebnahme_wochentag, "Bundesland"])["Nettonennleistung"]
    .sum()
)

# Long-format table with the weekday numbers replaced through `wochentage`.
data = dfg.reset_index().replace({"Inbetriebnahmedatum": wochentage})

# One facet per weekday, bars per Bundesland, log-scaled bars.
g = sns.catplot(
    kind='bar',
    data=data,
    x='Bundesland',
    y='Nettonennleistung',
    col='Inbetriebnahmedatum',
    log=True,
    col_wrap=4,
    palette="Paired",
    height=3,
    aspect=1.2,
)
g.set_xticklabels(rotation=90)
g.set(xlabel=None)

In [None]:
# Coordinates of all rows commissioned after "2010" whose "Nettonennleistung"
# lies strictly between 400 and 500.
spalten = ['Laengengrad', 'Breitengrad']

datum = dfMastrSolar["Inbetriebnahmedatum"] > "2010"
leistung_bot = dfMastrSolar["Nettonennleistung"].gt(400)
leistung_top = dfMastrSolar["Nettonennleistung"].lt(500)

auswahl = leistung_bot & leistung_top & datum
dfMastrSolar.loc[auswahl, spalten]

In [None]:
def classify_Leistung(NennLeistung, threshklasse, default=None):
    """Return the lower bound of the size class that ``NennLeistung`` falls into.

    The thresholds define the half-open classes ``(t[i], t[i+1]]``; the last
    threshold opens an unbounded top class ``(t[-1], inf)``.

    Args:
        NennLeistung: Value to classify.
        threshklasse: Sequence of class thresholds, e.g. ``(0, 10, 40, 100, 1000)``.
        default: Returned when no class matches (value at or below the first
            threshold, NaN, or empty thresholds). Previously this case raised
            ``UnboundLocalError``.

    Returns:
        The matching lower threshold, or ``default``.
    """
    Klasse = default
    last_index = len(threshklasse) - 1
    for n, lower in enumerate(threshklasse):
        if n == last_index:
            # The top class is identified by position, not by value, so that
            # repeated threshold values cannot be mistaken for the last entry.
            if NennLeistung > lower:
                Klasse = lower
        elif lower < NennLeistung <= threshklasse[n + 1]:
            Klasse = lower
    return Klasse

#dummy= [0,10,40,100,1000]
#dfMastrSolar["Groessenklasse"]=dfMastrSolar.apply(lambda x: classify_Leistung(NennLeistung=x["Nettonennleistung"],threshklasse= dummy), axis=1)
#dfMastrSolar.loc[:,["Nettonennleistung", "Groessenklasse"]]

In [None]:
# Total "Nettonennleistung" per size class.
# No visible cell creates the "Groessenklasse" column (the assignment above is
# commented out), so it is derived here when missing. The classes are (0, 10],
# (10, 40], (40, 100], (100, 1000] and (1000, inf), each labelled by its lower
# bound. NOTE(review): thresholds taken from the commented-out `dummy` list - confirm.
if "Groessenklasse" in dfMastrSolar.columns:
    groessenklasse = dfMastrSolar["Groessenklasse"]
else:
    klassen_grenzen = [0, 10, 40, 100, 1000]
    groessenklasse = pandas.cut(
        dfMastrSolar["Nettonennleistung"],
        bins=klassen_grenzen + [math.inf],
        labels=klassen_grenzen,
    ).rename("Groessenklasse")

# Only the plotted column is summed; summing every column is unnecessary and
# fails for columns that cannot be added up.
(
    dfMastrSolar["Nettonennleistung"]
    .groupby(groessenklasse, observed=False)
    .sum()
    .reset_index()
    .plot.bar(y="Nettonennleistung", x="Groessenklasse")
)

### Dask implementation 

In [None]:
import dask
from dask.distributed import Client

# Start a local Dask client with the worker `daemon` option switched off.
# The key must be spelled "distributed.worker.daemon"; the former spelling
# "deamon" named no existing option, so the setting had no effect.
with dask.config.set({"distributed.worker.daemon": False}):
    client = Client()

In [None]:
import time, random, sys

def load(x):
    """Sleep for a random time below one second, then return ``x`` unchanged."""
    delay = random.random()
    time.sleep(delay)
    return x

def preprocess(x):
    """Sleep for a random time below one second, then return ``x`` unchanged."""
    delay = random.random()
    time.sleep(delay)
    return x

def scary_function(x):
    """Exit the process with status 1 in about 40 % of calls, else sleep and return ``x``.

    When the first random draw is at least 0.4, the function sleeps for a random
    time below four seconds and hands ``x`` back; otherwise it calls
    ``sys.exit(1)``.
    """
    if random.random() >= 0.4:
        time.sleep(random.random() * 4)
        return x
    sys.exit(1)

def save(x):
    """Sleep for a random time below one second, then return ``x`` unchanged."""
    delay = random.random()
    time.sleep(delay)
    return x

# Submit the four stages as chained futures: load the numbers 0..19, then feed
# the futures of each stage into the next one.
data = client.map(load, range(20))
for stage in (preprocess, scary_function, save):
    data = client.map(stage, data)

In [None]:
# Collect the results of the futures submitted above.
# NOTE(review): scary_function calls sys.exit(1) in roughly 40 % of its calls,
# so this presumably demonstrates failure handling and may raise - confirm.
client.gather(data)

In [None]:
# Restart the client's workers before continuing with the DataFrame experiments.
client.restart()

In [None]:
from dask import dataframe as dd
from typing import List

In [None]:
# Wrap the pandas frame in a Dask DataFrame split into 16 partitions.
ddMastrSolar = dd.from_pandas(dfMastrSolar, npartitions=16)

In [None]:
def classify_Leistung(raw, threshklasse:List[int], default=None) -> int:
    """Return the lower bound of the size class that ``raw`` falls into.

    The thresholds define the half-open classes ``(t[i], t[i+1]]``; the last
    threshold opens an unbounded top class ``(t[-1], inf)``.

    Args:
        raw: The value to classify (the "Nettonennleistung" of one row).
        threshklasse: Class thresholds, e.g. ``[0, 5, 10, 100, 500]``.
        default: Returned when no class matches (value at or below the first
            threshold, NaN, or empty thresholds). Previously this case raised
            ``UnboundLocalError``.

    Returns:
        The matching lower threshold, or ``default``.
    """
    NennLeistung = raw

    Klasse = default
    last_index = len(threshklasse) - 1
    for n, lower in enumerate(threshklasse):
        if n == last_index:
            # The top class is identified by position, not by value, so that
            # repeated threshold values cannot be mistaken for the last entry.
            if NennLeistung > lower:
                Klasse = lower
        elif lower < NennLeistung <= threshklasse[n + 1]:
            Klasse = lower
    return Klasse


# Thresholds of the size classes used for the Dask run.
dummy = [0, 5, 10, 100, 500]

# Classify every "Nettonennleistung" value. The function is mapped over that one
# column: DataFrame.applymap works element by element on *all* columns, so the
# lambda received scalars and `x["Nettonennleistung"]` could not work.
# `meta` declares the name and dtype of the result so that Dask does not have to
# infer them by calling the function on dummy data.
grklass = ddMastrSolar["Nettonennleistung"].map(
    lambda leistung: classify_Leistung(raw=leistung, threshklasse=dummy),
    meta=("Groessenklasse", "float64"),
)
grklass.compute()

In [None]:
# Small demo frame: integer column 'x' and float column 'y', both 1..5.
df = pandas.DataFrame(
    {
        'x': list(range(1, 6)),
        'y': [float(value) for value in range(1, 6)],
    }
)

# The same data as a Dask DataFrame with two partitions.
ddf = dd.from_pandas(df, npartitions=2)
df

In [None]:
def myadd(row, a, b=1):
    """Return the sum of ``row`` plus ``a`` squared plus ``b``."""
    a_squared = a * a
    return row.sum() + a_squared + b

# Apply myadd row by row with a=5 (positional) and b=1.5 (keyword); `meta`
# declares the result as a float series named 'x'.
res = ddf.apply(
    myadd,
    axis=1,
    args=(5,),
    b=1.5,
    meta=('x', 'f8'),
)
res.compute()