In [None]:
import os

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Lift pandas' display limits (None = unlimited) so wide frames are shown
# with all of their columns and long outputs with all of their rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Read Data: load the EKKO Excel export into a DataFrame.
# NOTE(review): presumably the header-level table of the EKKO/EKPO pair - confirm.
df_ekko = pd.read_excel('../Data1/EKKO.XLSX')

In [None]:
# Preview the first four rows of df_ekko.
df_ekko.head(4)

In [None]:
# Load the EKPO Excel export into a DataFrame.
# NOTE(review): presumably the item-level table of the EKKO/EKPO pair - confirm.
df_ekpo = pd.read_excel('../Data1/EKPO.XLSX')

In [None]:
# Preview the first four rows of df_ekpo.
df_ekpo.head(4)

In [None]:
# Print (rows, columns) of both source tables, df_ekko first, as a reference
# for the shape check after the merge.
for source_frame in (df_ekko, df_ekpo):
    print(source_frame.shape)

In [None]:
# Left join on EBELN: every df_ekpo row is kept and enriched with the matching
# df_ekko columns. Non-key columns present in both frames receive pandas'
# default suffixes: _x for df_ekpo, _y for df_ekko.
df_ek = df_ekpo.merge(df_ekko, how='left', on='EBELN')

In [None]:
# Sanity check: the left join keeps every df_ekpo row, so the row count should
# equal df_ekpo's (the larger of the two shapes printed above, per the author's
# note) unless EBELN values repeat in df_ekko.
df_ek.shape

In [None]:
# Preview the first five rows of the merged frame.
df_ek.head(5)

In [None]:
# Profile every column of the merged frame: dtype, number of distinct values
# and the distinct values themselves.
# Values are cast to str before np.unique so every column is compared as text.
# nunique() ignores NaN by default, while the printed value list shows missing
# entries in their string form.
for columnName in df_ek.columns:
    column = df_ek[columnName]
    print('Column Name : ', columnName)
    # This line used to be labelled as the column name although it prints the dtype.
    print('Column dtype : ', column.dtype)
    print('Number of unique values : ', column.nunique())
    print('Column unique values : ', np.unique(column.astype(str)))
    print('######')

In [None]:
import missingno as msno
%matplotlib inline
msno.matrix(df_ek)

# Data Cleansing

## initial cleaning

In [None]:
# Work on a copy so that df_ek keeps the merged data untouched by the
# cleaning steps that follow.
df = df_ek.copy()

In [None]:
# Drop columns that contain nothing but NaN.
df = df.dropna(axis=1, how='all')

# Drop columns whose values, compared as text, all equal one and the same
# placeholder (0 / X / 1 / zero amounts written with a decimal comma).
# The first comparison used to be against the integer 0, which a string never
# equals, so that filter could not remove anything; it now compares with '0'.
placeholder_values = ('0', '0,0', '0,00', '0,000', 'X', '1')
df_as_text = df.astype(str)  # converted once instead of once per placeholder
placeholder_only = np.zeros(df.shape[1], dtype=bool)  # positional, one flag per column
for placeholder in placeholder_values:
    placeholder_only |= (df_as_text == placeholder).all(axis=0).to_numpy()
df = df.loc[:, ~placeholder_only]

# Drop columns that hold a single distinct value (NaN counts as a value here).
keep_columns = [col for col in df.columns if len(df[col].unique()) > 1]
df = df[keep_columns].copy()

# Shape after the automatic cleaning.
df.shape

In [None]:
# Preview the first four rows after the automatic column removal.
df.head(4)

In [None]:
# Sanity check: (rows, columns) of df after the initial cleaning.
df.shape

# Manual cleaning of plant-dependent naming

In [None]:
# Remove the columns listed below by rebinding df to the reduced frame.
# NOTE: raises KeyError if the cell is run a second time, because the columns
# are already gone.
df = df.drop(columns=['BUKRS_x', 'BUKRS_y', 'WERKS', 'KO_PRCTR', 'EKORG'])

In [None]:
# Cut the first three characters off every value in EMATN, MATNR and MATKL.
# NOTE: not idempotent - each additional run of this cell removes three more
# characters.
df = df.assign(
    EMATN=df['EMATN'].str[3:],
    MATNR=df['MATNR'].str[3:],
    MATKL=df['MATKL'].str[3:],
)

## data types correction

In [None]:
# Summary statistics for every column (include='all' covers non-numeric
# columns too), transposed so that each column of df becomes one row.
df.describe(include='all').T

In [None]:
# Preview the first ten rows before assigning the columns to dtype groups.
df.head(10)

In [None]:
# Manual assignment of every remaining column to one group.
# The count check in the next cell expects the groups to cover all columns.
cont_Cols = [
    'EBELN',
    'EBELP',
]

# Columns that are cast to the pandas 'category' dtype further down.
cat_Cols = [
    'TXZ01', 'MATNR', 'EMATN', 'MATKL', 'INFNR',
    'MEINS', 'BPRME', 'LMEIN', 'GEWEI', 'BANFN',
    'ERNAM', 'LPONR', 'LIFNR', 'EKGRP', 'KNUMV',
]

# Numeric columns.
num_Cols = [
    'MENGE', 'NETPR', 'NETWR',
    'BRTWR', 'EFFWR', 'BONBA',
]

# Date columns.
dat_Cols = [
    'BEDAT', 'PRDAT',
    'AEDAT_x', 'AEDAT_y',
]

# Author's note ("2 Del"): BUZEI, POSN2, EBELP
# NOTE(review): presumably candidates for deletion - confirm.

In [None]:
# Sanity check: the four column groups together should account for every
# column of df, so both printed numbers are expected to match.
print(sum(len(group) for group in (cont_Cols, cat_Cols, num_Cols, dat_Cols)))
print(len(df.columns))

In [None]:
# Build the dtype-corrected frame on a copy, so df keeps its original dtypes.
df_cor = df.copy()

# Cast every column named in cat_Cols to str and then to the 'category' dtype.
# The former per-column isinstance guard was always true after the str cast,
# and its label lookup [0] could fail on an empty frame or non-default index;
# both are gone, together with an unused temporary Series.
# NOTE(review): astype(str) also turns missing values into ordinary text
# (e.g. 'nan'), so they become a regular category and no longer count as
# missing - confirm this is intended.
for columnName in cat_Cols:
    df_cor[columnName] = df_cor[columnName].astype(str).astype('category')



In [None]:
# Check 1: dtype of every column after the conversion.
df_cor.dtypes

In [None]:
# Check 2: list the columns that now carry the 'category' dtype.
categorical_columns = list(df_cor.select_dtypes('category').columns)
print(f'Categorical columns: {categorical_columns}')

In [None]:
# Check 3: (rows, columns) of df_cor after the dtype conversion.
df_cor.shape

In [None]:
# Plot missingno's matrix view of the dtype-corrected frame.
# msno comes from the notebook's import cell, and inline plotting was already
# enabled by the earlier missingno cell.
# NOTE(review): the columns in cat_Cols were cast through str, so their
# missing values may show up as filled here - confirm.
msno.matrix(df_cor);

In [None]:
# Persist the dtype-corrected frame (before the correlated columns are
# removed further down) in Feather format.
# NOTE(review): the inputs were read from ../Data1 while this writes to
# ../Data - confirm the directory is intended.
df_cor.to_feather('../Data/D3_EKKO-EKPO_raw.ftr')

# Let's check the dataset properties / correlation / distribution

In [None]:
# Pairwise correlations of df_cor's numeric columns, rounded to two decimals.
# numeric_only=True leaves out the category-typed columns.
corr = df_cor.corr(numeric_only=True).round(2)

# np.triu marks the upper triangle (diagonal included) as True; seaborn hides
# the masked cells, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(7, 7))
cmap1 = sns.cubehelix_palette(dark=0, light=1, as_cmap=True)

# Annotated square heatmap on the axes created above, colour scale fixed to [-1, 1].
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap1,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    annot=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)


In [None]:

# Same correlation heatmap for df, the frame without the category conversion.
# Columns that are numeric in df but category-typed in df_cor are therefore
# part of this matrix.
corr = df.corr(numeric_only=True).round(2)

# True in the upper triangle (diagonal included); those cells are hidden, so
# only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(7, 7))
cmap1 = sns.cubehelix_palette(dark=0, light=1, as_cmap=True)

# Annotated square heatmap on the axes created above, colour scale fixed to [-1, 1].
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap1,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    annot=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)


In [None]:
# Inspect ten randomly chosen rows (no random_state is set, so the selection
# changes from run to run).
df.sample(10)

In [None]:
# Manually remove the highly correlated features from both frames.
# The author's note also named WRBTR PSWBT VBEL2 POSN2 ERFMG AUFPL SAKNR SAKNR
# LOKKT KIDNO; none of those are dropped by this cell.
correlated_columns = ['KNUMV', 'BRTWR', 'EFFWR', 'BONBA']
df_cor = df_cor.drop(columns=correlated_columns)
df = df.drop(columns=correlated_columns)



In [None]:
# Preview df after the correlated columns were removed.
df.head()

In [None]:
# Recompute the correlation matrix of df's numeric columns now that the
# correlated columns have been dropped, rounded to two decimals.
corr = df.corr(numeric_only=True).round(2)

# True in the upper triangle (diagonal included); those cells are hidden, so
# only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(7, 7))
cmap1 = sns.cubehelix_palette(dark=0, light=1, as_cmap=True)

# Annotated square heatmap on the axes created above, colour scale fixed to [-1, 1].
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap1,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    annot=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)


In [None]:
# Preview the final dtype-corrected frame that is written to disk next.
df_cor.head()

In [None]:
# Persist the cleaned frame (correlated columns removed) in Feather format.
# NOTE(review): the inputs were read from ../Data1 while this writes to
# ../Data - confirm the directory is intended.
df_cor.to_feather('../Data/D3_EKKO-EKPO_cleaned.ftr')