In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Dropout, Input
from keras.losses import CategoricalCrossentropy 
import sys

def column_one_hot (dataframe, columns):
    """Replace each listed column by its one-hot (dummy) columns.

    Parameters:
        dataframe: the frame to encode; it is not modified in place.
        columns: iterable of column names; names that are not columns of
            the frame are skipped silently.

    Returns:
        A frame in which every encoded column has been removed and its
        dummy columns (named after the column's values, without a prefix)
        have been appended on the right, in the order of ``columns``.
    """
    for name in columns:
        if name not in dataframe:
            continue
        dummies = pd.get_dummies(dataframe[name])
        remaining = dataframe.drop(columns=name)
        dataframe = pd.concat([remaining, dummies], axis=1)
    return dataframe

def drop_columns (dataframe, columns):
    """Return the frame without the listed columns.

    Parameters:
        dataframe: the frame to reduce; it is not modified in place.
        columns: iterable of column names; names that are not (or no
            longer) present are ignored.

    Returns:
        The reduced frame, or the input object itself when none of the
        names matched.
    """
    remaining = dataframe
    for name in columns:
        if name not in remaining.columns:
            continue
        remaining = remaining.drop(columns=name)
    return remaining

def drop_column (dataframe, column):
    """Return the frame without ``column``.

    The input object itself is returned when ``column`` is not one of its
    columns; the input is never modified in place.
    """
    if column not in dataframe.columns:
        return dataframe
    return dataframe.drop(columns=column)

In [38]:
df = pd.read_csv('SAR_Data.csv')

# Collapse the fine-grained status values into coarser classes:
# postponed, partly open, In creation -> open;    partly closed -> closed
maskOpen = df['Status'].isin(['postponed', 'partly open', 'In creation'])
maskClosed = (df['Status'] == 'partly closed')
df.loc[maskOpen, 'Status'] = 'open'
df.loc[maskClosed, 'Status'] = 'closed'
# 'non applicable' and 'not applicable' are merged into one spelling.
df.loc[(df['Status'] == 'non applicable'), 'Status'] = 'not applicable'
status = df['Status'].unique()

# Wrong folder structure in
# /RA Application Conditions/03_PG_OCS/Service and diagnostic systems:
# one folder level is missing before the modules, so product and version
# are entered by hand for these rows.
# na=False keeps rows without a version out of the mask (a NaN inside a
# boolean mask makes .loc raise); regex=False because the pattern is a plain
# substring. The mask is built once so both assignments hit the same rows.
maskVicos = df['Version'].str.contains('VICOS_S_D', na=False, regex=False)
df.loc[maskVicos, 'Product'] = 'VICOS_S_D'
df.loc[maskVicos, 'Version'] = df['Version'].str[-5:]

In [39]:
paths = df['Path'].unique()
accessDB = pd.read_xml("X:/File/DE/bwga024a_IMORA_RM/05_Process_Management/14_Metriken & KPI/KPI-Erhebung/KPI_01-04_General/Data/Input/Input_BWG_Combined_Access.xml")

# Only "Real" entries of location "BWG" are candidates. This filter does not
# depend on the path, so it is built once instead of once per path.
candidates = accessDB.loc[(accessDB['Type'] == "Real") & (accessDB['Location'] == "BWG")]
metadata_columns = ['Project_category', 'BS', 'RU', 'ProjectYear', 'section', 'Project_name']
missing_paths = []

for path in paths:
    if path == "/ML Realization Projects Algeria":
        # Special case: this path is looked up under one fixed sub-project offset.
        offsets = ["/ML Realization Projects Algeria/20006_ML_BM_Boughezoul_MSila"]
    else:
        # The offset is accepted with or without a trailing slash.
        offsets = [str(path), str(path) + "/"]

    matches = candidates.loc[candidates['Offset'].isin(offsets)]
    if matches.empty:
        # Skip the path. Falling through here would copy the metadata of the
        # previously processed path onto these rows (or raise NameError if
        # this is the very first path).
        print(str(path) + " has no entry in the AccessDB!")
        missing_paths.append(path)
        continue
    result = matches.iloc[0]

    # Copy the metadata of the first matching entry onto all rows of the path.
    mask = df['Path'] == str(path)
    for column in metadata_columns:
        df.loc[mask, column] = result[column]

# Independent of the path loop, so it is computed once.
df['ProductVersion'] = df["Product"].str.cat(df["Version"], sep = "-")

if missing_paths:
    # Rows of unmatched paths carry no ProjectYear; the nullable integer
    # dtype keeps them as <NA> instead of failing the plain int cast.
    df['ProjectYear'] = pd.to_numeric(df['ProjectYear']).astype('Int64')
else:
    df['ProjectYear'] = df['ProjectYear'].astype('int')
df = df[['Text', 'Product', 'ProductVersion', 'Project_name', 'section', 'Project_category', 'BS', 'RU', 'ProjectYear', 'Status', 'Statement']]
#df = drop_column(df, 'Statement') # remove again later!
df.shape

(14572, 11)

In [40]:
# TEST (WORKING!) ---------------------------------------
# One-hot encode the product, then lift the flags to project level: a product
# column is set to 1 on every row of a project if at least one row of that
# project has the product, otherwise to 0.
# dropna(): get_dummies creates no column for missing values, so a NaN in
# `products` would raise a KeyError in the loop below.
products = df['Product'].dropna().unique()
df = column_one_hot(df, ['Product'])
projects = df['Project_name'].unique()
for project in projects:
    # The row mask only depends on the project; build it once per project
    # instead of twice per (project, product) pair.
    in_project = df['Project_name'] == project
    for product in products:
        df.loc[in_project, product] = 1 if df.loc[in_project, product].sum() >= 1 else 0
# NOTE(review): the dummy columns carry no prefix, so equal values in two of
# these columns would produce duplicate column names — confirm this cannot
# happen in the data.
df = column_one_hot(df, ['ProductVersion', 'Project_name', 'section',
       'Project_category', 'BS', 'RU', 'ProjectYear'])
df.shape

(14572, 180)

In [41]:
# Restrict the data to all rows that carry one specific rule text and give
# them a fresh 0..n-1 index.
text = "Zur Anschaltung des Antriebes in der Außenanlage müssen Signalkabel nach VDE 0816/2 oder Kabel mit vergleichbaren Eigenschaften verwendet werden. Die Verlegevorschriften des Kabels sind einzuhalten."
df_training = df.loc[df['Text'] == text].reset_index(drop=True)

# Hold out one row as the sample to predict. After the index reset, position
# and label coincide, so the same number selects the row (iloc) and removes
# it from the training data (drop).
test_col = 8
held_out_row = df_training.iloc[[test_col]]
test = drop_columns(held_out_row, ['Text', 'Status', 'Statement'])
df_training = df_training.drop(test_col)

# Features: everything except the rule text and the two targets.
trainX = drop_columns(df_training, ['Status', 'Text', 'Statement'])

# Targets: one-hot encoded status and statement of the remaining rows.
status_encoded = column_one_hot(df_training[['Text', 'Status']], ['Status'])
trainYStatus = drop_column(status_encoded, "Text")
statement_encoded = column_one_hot(df_training[['Text', 'Statement']], ['Statement'])
trainYStatement = drop_column(statement_encoded, "Text")

In [64]:
# Feed-forward classifier for the status: two ReLU layers (8 and 16 units)
# followed by a softmax layer with one unit per one-hot status column.
modelStatus = Sequential([
    Dense(8, input_shape=(trainX.shape[1],), activation='relu'),
    Dense(16, activation='relu'),
    Dense(trainYStatus.shape[1], activation='softmax'),
])
modelStatus.summary()
modelStatus.compile(
    loss=CategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)

# 40 % of the training rows are held back for validation.
historyStatus = modelStatus.fit(
    trainX,
    trainYStatus,
    epochs=50,
    batch_size=2,
    validation_split=0.4,
    verbose=2,
)

#modelStatement = Sequential()
#modelStatement.add(Dense(16, input_shape=(trainX.shape[1],), activation='relu'))
#modelStatement.add(Dense(8, activation='relu'))
#modelStatement.add(Dense(trainYStatement.shape[1], activation='softmax'))
#modelStatement.summary()
#modelStatement.compile(optimizer='adam',
#              loss=CategoricalCrossentropy(),
#              metrics=['accuracy'])
#
#historyStatement = modelStatement.fit(trainX, trainYStatement,
#                    batch_size=2,
#                    epochs=50,
#                    verbose=2,
#                    validation_split=0.4)

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48 (Dense)            (None, 8)                 1424      
                                                                 
 dense_49 (Dense)            (None, 16)                144       
                                                                 
 dense_50 (Dense)            (None, 3)                 51        
                                                                 
Total params: 1,619
Trainable params: 1,619
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
3/3 - 2s - loss: 1.0611 - accuracy: 0.4444 - val_loss: 1.2511 - val_accuracy: 0.1429 - 2s/epoch - 788ms/step
Epoch 2/50
3/3 - 0s - loss: 1.0474 - accuracy: 0.4444 - val_loss: 1.2556 - val_accuracy: 0.1429 - 226ms/epoch - 75ms/step
Epoch 3/50
3/3 - 0s - loss: 1.0389 - accuracy: 0.4444 - val_loss: 1.2583 

In [65]:
# Predict the status of the held-out sample and print one
# "<status> <probability>" line per class.
predictionStatus = modelStatus.predict(test)
for probabilities in predictionStatus:
    # zip starts again at the first class label for every prediction row, so
    # this also works when `test` holds more than one row (a running column
    # counter shared by all rows would run past the last label).
    for label, probability in zip(trainYStatus.columns, probabilities):
        print (label + " " + '{:.1%}'.format(probability))

print ("-----------------------------------------------------")

#predictionStatement = modelStatement.predict(test)
#index_max = np.argmax(predictionStatement)
#print (trainYStatement.columns[index_max] + " " + '{:.1%}'.format(predictionStatement[0][index_max]))
#col = 0
#for i in predictionStatement:
#    for j in i:
#        print (trainYStatement.columns[col] + " " + '{:.1%}'.format(j))
#        col += 1
#

closed 16.7%
compliant 63.3%
not applicable 20.0%
-----------------------------------------------------


In [31]:
# Functional API: one shared hidden layer feeding two softmax heads, one for
# the status and one for the statement.
# The input shape is passed as a tuple, matching the Sequential model above;
# a bare integer is not a shape tuple.
inputs = Input(shape=(trainX.shape[1],))

hidden_layer = Dense(16, activation='relu')(inputs)

# One output unit per one-hot column of the respective target.
output1 = Dense(trainYStatus.shape[1], activation='softmax', name='status')(hidden_layer)

output2 = Dense(trainYStatement.shape[1], activation='softmax', name='statement')(hidden_layer)

model = Model(inputs=inputs, outputs=[output1, output2])

model.summary()

# The loss dictionary is keyed by the names of the two output layers.
model.compile(optimizer='adam',
              loss={'status': CategoricalCrossentropy(), 'statement': CategoricalCrossentropy()},
              metrics=['accuracy'])

# The targets are passed under the same output-layer names.
history = model.fit(trainX, {'status': trainYStatus, 'statement': trainYStatement},
                    batch_size=2,
                    epochs=50,
                    verbose=2,
                    validation_split=0.4)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 177)]        0           []                               
                                                                                                  
 dense_2 (Dense)                (None, 16)           2848        ['input_3[0][0]']                
                                                                                                  
 status (Dense)                 (None, 3)            51          ['dense_2[0][0]']                
                                                                                                  
 statement (Dense)              (None, 16)           272         ['dense_2[0][0]']                
                                                                                            

In [33]:
# Functional model: run the held-out sample through both output heads.
output_status, output_statement = model.predict(test)

# Index of the largest value of each head. argmax runs over the flattened
# array, so it is a class index only while `test` holds a single row.
status_prediction = np.argmax(output_status)
statement_prediction = np.argmax(output_statement)

# Print the status probabilities, one prediction row per line.
for probabilities in output_status:
    print (probabilities)

#print("Status Probabilities:", output_status)
#print("Statement Prediction:", statement_prediction)

[0.04301697 0.758653   0.1983301 ]


In [None]:
# nur noch Infos --------------------------------------

In [None]:
# Pie chart of how many rows fall into each status value.
status = df['Status'].unique()
hauefigkeit = [len(df[df['Status'] == value]) for value in status]
fig, ax = plt.subplots()
ax.pie(hauefigkeit, labels=status, autopct='%1.1f%%')

In [None]:
# NOTE(review): cell In [40] one-hot encodes and thereby removes 'Product',
# so this cell presumably has to run before it — confirm.
print(df['Product'].unique())
# Shape of the subset whose product is "/".
slash_rows = df.loc[df['Product'] == "/"]
slash_rows.shape

In [None]:
def addlabels(x,y):
    """Write every value of ``y`` as a centred text label into the current plot.

    The label for position ``i`` is placed at x-coordinate ``i`` and
    y-coordinate ``y[i]``; ``x`` is only used for its length.
    """
    for position in range(len(x)):
        value = y[position]
        plt.text(position, value, value, ha='center')

# Number of rows per distinct rule text.
text = df['Text'].value_counts()

# Bucket the rule texts by how often they occur.
eins = int((text < 5).sum())                        # count 0:4
zwei = int(((text >= 5) & (text <= 9)).sum())       # count 5:9
drei = int(((text >= 10) & (text <= 14)).sum())     # count 10:14
vier = int((text >= 15).sum())                      # count from 15
y = [eins, zwei, drei, vier]
x = ('0-4', '5-9', '10-14', 'ab 15')


plt.bar(x, y, align='center')
plt.xticks(x)
plt.title('Anzahl Anwendungsregeln nach Häufigkeit der Bewertung')
addlabels(x, y)
plt.show()

# The messages below talk about rules that occur exactly once, so they must
# not use the 0-4 bucket. A rule occurring once contributes exactly one row,
# which makes count / number of rows its share of the data set.
einmal = int((text == 1).sum())
print ("Anzahl Anwendungsregeln, welche nur einmal bewertet wurden: " + str(einmal))
print ("Anteil am Datensatz: " + str(round(round(einmal/df.shape[0], 4) * 100, 2)) + "%")

In [None]:
# Number of rows per distinct rule text.
text = df['Text'].value_counts()
# All rows of one specific rule text, re-indexed from 0.
# NOTE(review): this rebinds `test`, the name the prediction cells pass to
# predict() — confirm that overwriting it here is intended.
test = df.loc[df['Text'] == "Zur Anschaltung des Antriebes in der Außenanlage müssen Signalkabel nach VDE 0816/2 oder Kabel mit vergleichbaren Eigenschaften verwendet werden. Die Verlegevorschriften des Kabels sind einzuhalten."].reset_index(drop=True)
test.head(17)
#df['Text'].value_counts()

In [None]:
# Training and validation loss of the status model over the epochs.
for history_key, curve_label in (('loss', 'Training loss'), ('val_loss', 'Validation loss')):
    plt.plot(historyStatus.history[history_key], label = curve_label)
plt.legend()