In [14]:
%load_ext autoreload
%autoreload 2

## Import necessary modules
import os,sys
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from matplotlib.dates import date2num, AutoDateFormatter, AutoDateLocator, WeekdayLocator, MonthLocator, DayLocator, DateLocator, DateFormatter
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
from matplotlib.ticker import AutoMinorLocator, AutoLocator, FormatStrFormatter, ScalarFormatter
import numpy as np
import datetime, calendar
from datetime import timedelta
import matplotlib.patches as mpatches
from itertools import tee
from traitlets import traitlets

sys.path.append(os.path.abspath('/home/keuch/gits/keuch/code_box/pyt/spreadsheetparsing/entwuerfe/xls_testruns/lib/'))
from ce_funclib import determine_kernzeit as dtkz
from ce_funclib import continuity_check


from ipywidgets import widgets, interact, interactive, fixed, interact_manual, Layout
from IPython.display import display
#%matplotlib inline
%matplotlib tk

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Deklarationen

In [21]:
## Read directory: where the daily agent reports are stored
arcpth = '/home/keuch/gits/keuch/code_box/pyt/spreadsheetparsing/datenhalde/alle_agents_taeglich/'

## Write directory: where the collected pickle is to be stored
pklpth = '/home/keuch/gits/keuch/code_box/pyt/spreadsheetparsing/datenhalde/datapickles/'

## Today's date as a 'YYYY-MM-DD' string
heute = datetime.date.today().isoformat()

## Funktionen

In [15]:
######## GET A LIST OF MATCHING .xls FILES FROM THE GIVEN DIRECTORY
def collectxlfiles(arcpath):
    """Return the sorted paths of all report files found directly in ``arcpath``.

    A directory entry qualifies when its name starts with ``'CE_alle_Ag'``.
    Each qualifying name is joined onto ``arcpath``; the joined paths are
    returned as a list in ascending order.
    """
    return sorted(
        os.path.join(arcpath, entry)
        for entry in os.listdir(arcpath)
        if entry.startswith('CE_alle_Ag')
    )

In [16]:
def makeniceframe(ixframe):
    """Bring one raw, data-filled report frame into a sensible, tidy form.

    The input frame is left untouched; all work happens on an explicit copy,
    so the column assignments below never write into a slice of another frame.

    Parameters
    ----------
    ixframe : pandas.DataFrame
        Raw frame with exactly 30 columns. The first three rows are treated
        as text rows and the last row as a mangled sum row; all four are
        dropped.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``tstamp, agent, an, be, vl, ht_float,
        tt_float, date, wd, ww, mm, yy, bz, ort, id`` (in that order). The
        row index of the kept rows is carried over from ``ixframe``.

    Raises
    ------
    ValueError
        If ``ixframe`` does not have exactly 30 columns.
    """
    expected_cols = 30
    # positional column -> decent name; every column not listed here is dropped
    usefulcols = {0: 'tstamp', 1: 'agent', 3: 'an', 4: 'be', 22: 'vl', 24: 'ht_float', 29: 'tt_float'}

    if ixframe.shape[1] != expected_cols:
        raise ValueError(
            'makeniceframe expects %d columns, got %d' % (expected_cols, ixframe.shape[1])
        )

    positions = sorted(usefulcols)
    # Select the needed columns and strip the text rows / sum row in one
    # positional step, then copy: assigning columns on a bare slice is what
    # triggers pandas' SettingWithCopyWarning.
    exframe = ixframe.iloc[3:-1, positions].copy()
    exframe.columns = [usefulcols[pos] for pos in positions]

    # the raw timestamp strings carry a leading and a trailing blank
    exframe['tstamp'] = pd.to_datetime(exframe['tstamp'], format=' %d.%m.%Y %H:%M ')
    exframe['date'] = exframe['tstamp'].dt.date
    # weekday name, week number, month and year as separate columns
    exframe[['wd', 'ww', 'mm', 'yy']] = (
        exframe['tstamp'].dt.strftime('%a,%W,%m,%Y').str.split(',', expand=True)
    )
    exframe['bz'] = exframe['tstamp'].apply(dtkz)

    # split the identifier into useable columns: first character, last six
    # characters, and the part in between (which replaces the raw identifier)
    exframe['ort'] = exframe['agent'].str[0]
    exframe['id'] = exframe['agent'].str[-6:]
    exframe['agent'] = exframe['agent'].str[2:-7]

    # unify the agent name for ids listed here, whatever name was parsed
    unify_id = {'gesinst': '995887', 'stanzju': '878457', 'papkeda': '891914'}
    for agent_name, agent_id in unify_id.items():
        exframe.loc[exframe['id'] == agent_id, 'agent'] = agent_name

    # integers should be of appropriate datatype, we received them as strings
    int_cols = ['vl', 'an', 'be', 'ww', 'mm', 'yy']
    exframe[int_cols] = exframe[int_cols].astype(np.int64)

    return exframe

In [18]:
def parse_filelist_to_dataframe(liste):
    """Read every file in ``liste`` and combine the usable ones into one frame.

    Each file is read with ``pandas.read_excel`` and classified by its number
    of columns:

    * 3 columns  -> contains no call data; the path is recorded as an empty day
    * 30 columns -> cleaned with ``makeniceframe`` and collected
    * otherwise  -> a notice naming the file is printed and the file is skipped

    Progress is printed to stdout (``'. '`` per parsed file, ``'empty '`` per
    empty file).

    Parameters
    ----------
    liste : iterable of str
        Paths of the files to read.

    Returns
    -------
    (pandas.DataFrame, list)
        The combined frame, sorted by and indexed on ``tstamp``, and the list
        of paths that held no call data. If no file yielded data, the frame
        is an empty ``DataFrame``.
    """
    liste_leerer_tage = []
    frames = []

    for datei in liste:
        readframe = pd.read_excel(datei)
        spaltenzahl = len(readframe.columns)

        if spaltenzahl == 3:           # files with only 3 columns contain no call data
            print('empty ', end='')
            liste_leerer_tage.append(datei)

        elif spaltenzahl == 30:
            print('. ', end='')
            frames.append(makeniceframe(readframe))

        else:
            # name the offending file so it can actually be looked at
            print('Datei zum Nachgucken, hat weder 3 noch 30 Spalten', datei)

    if not frames:
        # nothing to concatenate or sort; there is no 'tstamp' column to index on
        return pd.DataFrame(), liste_leerer_tage

    # Concatenate once instead of appending per file: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    bigframe = (
        pd.concat(frames)
        .sort_values(by='tstamp')   # sort the whole frame by timestamp
        .set_index('tstamp')        # timestamp becomes the new index
    )

    return bigframe, liste_leerer_tage

## Filescraping und Roh-Dataframe

In [19]:
# Gather the report files from the read directory, then parse them all into
# one combined frame; the second return value lists the files without call data.
# Parsing prints one progress marker per file ('. ' or 'empty ').
xlfilelist=collectxlfiles(arcpth)
big_frame,leeretage=parse_filelist_to_dataframe(xlfilelist)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . empty . . . . . . . . . . . . . . . . . . . . . . . . . . empty . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . empty . . . . . . . . . . . . . empty . . . . . . empty . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . empty . . . . . . empty . . . . . . . . . . . . . . . . . . . . . . . . . . . empty . . . . . . . . 

In [20]:
# Persist the combined frame twice: once under a name stamped with today's
# date and once under the fixed name 'Rohdaten_Agenten_aktuell.pkl'.
big_frame.to_pickle(pklpth+'Rohdaten_Agenten-'+heute+'.pkl')
big_frame.to_pickle(pklpth+'Rohdaten_Agenten_aktuell.pkl')
# Pickle the list of files without call data as well.
# NOTE(review): this file name has no '-' before the date and no '.pkl'
# suffix, unlike the frame pickles above — confirm whether readers rely on it.
with open(pklpth+'Rohdaten_Agenten_Leertage'+heute, 'wb') as f:
    pickle.dump(leeretage, f)