In [61]:
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns

In [40]:
# function to read txt data files and convert them to proper csv files
# txtFile: input filename (including directory if applicable)
# csvFile: output filename (including directory if applicable)
# vtabchar: vertical tab character in the original file (to be replaced with newline command '\n')
# delim: delimiter character used in the original file (to be replaced with comma)
def txt2csv(txtFile, csvFile, vtabchar, delim):
    with open(txtFile, 'r') as file:
        data = file.read().replace(vtabchar, '\n').replace(delim, ',')
    with open(csvFile, 'w') as file:
        file.write(data)    
    return

Set the filename and location for each dataset

In [41]:
# original filename and directory for txt data files
K1DI2_txt = './Data/Komponente/Komponente_K1DI2.txt'
K2LE1_txt = './Data/Komponente/Komponente_K2LE1.txt' 
K2LE2_txt = './Data/Komponente/Komponente_K2LE2.txt'
K2ST1_txt = './Data/Komponente/Komponente_K2ST1.txt'
K3AG2_txt = './Data/Komponente/Komponente_K3AG2.txt'
K7_txt    = './Data/Komponente/Komponente_K7.txt'

# converted txt filename and directory
K1DI2_csv = './Data/Komponente/Komponente_K1DI2.csv'
K2LE1_csv = './Data/Komponente/Komponente_K2LE1.csv'
K2LE2_csv = './Data/Komponente/Komponente_K2LE2.csv'
K2ST1_csv = './Data/Komponente/Komponente_K2ST1.csv'
K3AG2_csv = './Data/Komponente/Komponente_K3AG2.csv'
K7_csv    = './Data/Komponente/Komponente_K7.csv'

# original filename and directory for csv data files
# component data files
K1BE1_csv = './Data/Komponente/Komponente_K1BE1.csv'
K1BE2_csv = './Data/Komponente/Komponente_K1BE2.csv'
K1DI1_csv = './Data/Komponente/Komponente_K1DI1.csv'
K2ST2_csv = './Data/Komponente/Komponente_K2ST2.csv'
K3AG1_csv = './Data/Komponente/Komponente_K3AG1.csv'
K3SG1_csv = './Data/Komponente/Komponente_K3SG1.csv'
K3SG2_csv = './Data/Komponente/Komponente_K3SG2.csv'
K4_csv    = './Data/Komponente/Komponente_K4.csv'
K5_csv    = './Data/Komponente/Komponente_K5.csv'
K6_csv    = './Data/Komponente/Komponente_K6.csv'    

In [42]:
# read and convert all the txt data files to csv
txt2csv(K1DI2_txt, K1DI2_csv, '	', '\\')
txt2csv(K2LE1_txt, K2LE1_csv, '', 'II')
txt2csv(K2LE2_txt, K2LE2_csv, '', '\\')
txt2csv(K2ST1_txt, K2ST1_csv, '', '|')
txt2csv(K3AG2_txt, K3AG2_csv, '', '\\')
txt2csv(K7_txt   , K7_csv   , '', '	')

In [43]:
# separate the data arrangements into 4 types, namely A, B, C, and D. 
# this separation is based on the column names of the datetime data type
A = ['Fehlerhaft_Datum', 'origin']
B = ['Produktionsdatum.x', 'Fehlerhaft_Datum.x', 
     'Produktionsdatum.y', 'Fehlerhaft_Datum.y']
C = ['Produktionsdatum.x', 'Fehlerhaft_Datum.x', 
     'Produktionsdatum.y', 'Fehlerhaft_Datum.y', 
     'Produktionsdatum', 'Fehlerhaft_Datum']
D = ['Produktionsdatum', 'Fehlerhaft_Datum']

# set up a function to read the csv files
def csvReader(csvFile, arr_type, delim=None):
    if delim is not None:
        dataset = pd.read_csv(csvFile, parse_dates=arr_type, 
                          low_memory=False, sep=delim)
    else:
        dataset = pd.read_csv(csvFile, parse_dates=arr_type, 
                          low_memory=False)
    return dataset

# read the converted csv files using the csvReader function
K1DI2 = csvReader(K1DI2_csv, A)
K2LE1 = csvReader(K2LE1_csv, B)
K2LE2 = csvReader(K2LE2_csv, A)
K2ST1 = csvReader(K2ST1_csv, D)
K3AG2 = csvReader(K3AG2_csv, A)
K7    = csvReader(K7_csv, A)

# read the rest of the csv files using the csvReader function
K1BE1 = csvReader(K1BE1_csv, A)
K1BE2 = csvReader(K1BE2_csv, A, ';')
K1DI1 = csvReader(K1DI1_csv, C)
K2ST2 = csvReader(K2ST2_csv, A, ';')
K3AG1 = csvReader(K3AG1_csv, C)
K3SG1 = csvReader(K3SG1_csv, B)
K3SG2 = csvReader(K3SG2_csv, A)
K4    = csvReader(K4_csv,    B, ';')
K5    = csvReader(K5_csv,    B)
K6    = csvReader(K6_csv,    A, ';')

For datasets with data arrangements of type B and C, we need to consolidate the columns and eliminate the .x and .y suffixes. For type B, the tables are separated into 2, whereas for type C, the tables are separated into 3.

In [44]:
# column names to be renamed for type B
col_names_x = {'Produktionsdatum.x':'Produktionsdatum', 
               'Herstellernummer.x':'Herstellernummer',	
               'Werksnummer.x':'Werksnummer',
               'Fehlerhaft.x':'Fehlerhaft', 
               'Fehlerhaft_Datum.x':'Fehlerhaft_Datum',
               'Fehlerhaft_Fahrleistung.x':'Fehlerhaft_Fahrleistung'}
col_names_y = {'Produktionsdatum.y':'Produktionsdatum', 
               'Herstellernummer.y':'Herstellernummer',	
               'Werksnummer.y':'Werksnummer',
               'Fehlerhaft.y':'Fehlerhaft', 
               'Fehlerhaft_Datum.y':'Fehlerhaft_Datum',
               'Fehlerhaft_Fahrleistung.y':'Fehlerhaft_Fahrleistung'}

# set up variables for the components. The components are engine (Motor), electrical components (Schaltung),
# body components (Karosserie), and seats (Sitze)
idMotor  = 'ID_Motor'        ; idSchalt  = 'ID_Schaltung'
idMotorx = 'ID_Motor.x'      ; idSchaltx = 'ID_Schaltung.x'
idMotory = 'ID_Motor.y'      ; idSchalty = 'ID_Schaltung.y'

idKaros  = 'ID_Karosserie'   ; idSitze  = 'ID_Sitze'
idKarosx = 'ID_Karosserie.x' ; idSitzex = 'ID_Sitze.x' 
idKarosy = 'ID_Karosserie.y' ; idSitzey = 'ID_Sitze.y'

# separate type B tables based on the suffixes, rename the columns, and 
# concatenate vertically, and finally extract the columns that contain the data.
# the cleaned up dataset is added with suffix '_c'
def streamlineTypeB(dataset, colx, coly, ID, IDx, IDy):
    dataset_x = dataset[dataset[IDx].notna()].rename(columns=colx)
    dataset_y = dataset[dataset[IDy].notna()].rename(columns=coly)
    dataset_x = dataset_x.rename(columns={IDx : ID})
    dataset_y = dataset_y.rename(columns={IDy : ID})
    dataset_x = dataset_x.loc[:, ID:'Fehlerhaft_Fahrleistung']
    dataset_y = dataset_y.loc[:, ID:'Fehlerhaft_Fahrleistung']
    dataset_c = pd.concat([dataset_x, dataset_y], axis=0).reset_index(drop=True)
    return dataset_c

K2LE1_c = streamlineTypeB(K2LE1, col_names_x, col_names_y, 
                            idSitze, idSitzex, idSitzey)
K3SG1_c = streamlineTypeB(K3SG1, col_names_x, col_names_y, 
                            idSchalt, idSchaltx, idSchalty)
K4_c    = streamlineTypeB(K4, col_names_x, col_names_y, 
                            idKaros, idKarosx, idKarosy)
K5_c    = streamlineTypeB(K5, col_names_x, col_names_y, 
                            idKaros, idKarosx, idKarosy)

# separate type C tables based on the suffixes, rename the columns, and 
# concatenate vertically, and finally extract the columns that contain the data.
# the cleaned up dataset is added with suffix '_c'
def streamlineTypeC(dataset, colx, coly, ID, IDx, IDy):
    dataset_x = dataset[dataset[IDx].notna()].loc[:, IDx:'Fehlerhaft_Fahrleistung.x'].rename(columns=colx)
    dataset_y = dataset[dataset[IDy].notna()].loc[:, IDy:'Fehlerhaft_Fahrleistung.y'].rename(columns=coly)
    dataset_  = dataset[dataset[ID].notna()]
    dataset_x = dataset_x.rename(columns={IDx : ID})
    dataset_y = dataset_y.rename(columns={IDy : ID})
    dataset_  = dataset_.loc[:, ID:'Fehlerhaft_Fahrleistung']    
    dataset_c = pd.concat([dataset_x, dataset_y, dataset_], axis=0).reset_index(drop=True)
    return dataset_c

K1DI1_c = streamlineTypeC(K1DI1, col_names_x, col_names_y,
                          idMotor, idMotorx, idMotory)
K3AG1_c = streamlineTypeC(K3AG1, col_names_x, col_names_y,
                          idSchalt, idSchaltx, idSchalty)
K2ST1_c = K2ST1.loc[:, idSitze:'Fehlerhaft_Fahrleistung']

In [45]:
def addProduktionsDatum(dataset):
    dataset['Produktionsdatum'] = dataset['origin'] + pd.to_timedelta(dataset['Produktionsdatum_Origin_01011970'].astype('int'), unit='days')
    return dataset

def reformatDataset(dataset, ID):
    dataset_c = pd.concat([dataset.loc[:,ID], 
                           dataset.loc[:,'Produktionsdatum'], 
                           dataset.loc[:,'Herstellernummer':'Fehlerhaft_Fahrleistung']], axis=1)
    return dataset_c

K1BE1_c = reformatDataset(addProduktionsDatum(K1BE1), idMotor)
K1BE2_c = reformatDataset(addProduktionsDatum(K1BE2), idMotor)
K1DI2_c = reformatDataset(addProduktionsDatum(K1DI2), idMotor)
K2LE2_c = reformatDataset(addProduktionsDatum(K2LE2), idSitze)
K2ST2_c = reformatDataset(addProduktionsDatum(K2ST2), idSitze)
K3AG2_c = reformatDataset(addProduktionsDatum(K3AG2), idSchalt)
K3SG2_c = reformatDataset(addProduktionsDatum(K3SG2), idSchalt)
K6_c    = reformatDataset(addProduktionsDatum(K6), idKaros)
K7_c    = reformatDataset(addProduktionsDatum(K7), idKaros)

# classify the components into list based on the types of components (Motor, Sitze, Schaltung, and Karosserie)
motorList      = [K1BE1_c, K1BE2_c, K1DI1_c, K1DI2_c]
sitzeList      = [K2LE1_c, K2LE2_c, K2ST1_c, K2ST2_c]
schaltungList  = [K3AG1_c, K3AG2_c, K3SG1_c, K3SG2_c]
karosserieList = [K4_c, K5_c, K6_c, K7_c]

motorName      = ['K1BE1', 'K1BE2', 'K1DI1', 'K1DI2']
sitzeName      = ['K2LE1', 'K2LE2', 'K2ST1', 'K2ST2']
schaltungName  = ['K3AG1', 'K3AG2', 'K3SG1', 'K3SG2']
karosserieName = ['K4', 'K5', 'K6', 'K7']

# checks for duplicates in the components datalists and drop them if preset
def dropDuplicates(datasetList):
    for i in range(len(datasetList)):
        datasetList[i] = datasetList[i].drop_duplicates()
        datasetList[i] = datasetList[i].reset_index(drop=True)
    return datasetList

motorList      = dropDuplicates(motorList)
sitzeList      = dropDuplicates(sitzeList)
schaltungList  = dropDuplicates(schaltungList)
karosserieList = dropDuplicates(karosserieList)


Now with all the datasets clean and proper, we can start doing the case analysis. In this case study, the components produced between January 1st, 2009 and December 31st, 2015 are considered. 

First, we filter out the components that is not produced within the considered time period.

In [46]:
# create a function that will extract the dataset within the specified time period
def timePeriodFilter(dataset, ID):
    startDate = '2008-01-01'
    endDate   = '2016-12-31'
    dataset_ = pd.concat([dataset[(dataset['Produktionsdatum'] >= startDate) & 
                                       (dataset['Produktionsdatum'] <= endDate)].loc[:,[ID,'Produktionsdatum']],
                                       dataset[(dataset['Produktionsdatum'] >= startDate) & 
                                       (dataset['Produktionsdatum'] <= endDate)].loc[:,'Herstellernummer':'Fehlerhaft_Fahrleistung']],
                                       axis=1).reset_index(drop=True)
    return dataset_

# the function below will slice all the components dataset to the specified time range
def component_(datasetList, ID):
    datasetlist_ = []
    for dataset in datasetList:
        datasetlist_.append(timePeriodFilter(dataset, ID))
    return datasetlist_

motorList_      = component_(motorList, idMotor)
sitzeList_      = component_(sitzeList, idSitze)
schaltungList_  = component_(schaltungList, idSchalt)
karosserieList_ = component_(karosserieList, idKaros)

Now, we create the cleaned dataset that contains all the components information for components produced during the analysis period. The columns consist component type, part number, date of production, supplier number, plant number, failure status, date of failure, and failure mileage. First, we concatenate all the components of the same type and then concatenate all the components together.

In [47]:
# create a function to concatenate the datasets of each component type, merge the component dataset with the vehicle dataset,
# and rename the ID columns of all the dataset.
def concatenateDataset(datasetList_, id):
    for i in (1, len(motorList_) - 1):
        datasetList_all = pd.concat([datasetList_[0], datasetList_[i]], axis=0)
        datasetList_[0]  = datasetList_all
    datasetList_all = datasetList_all.rename(columns={id:'ID'})
    return datasetList_all

# concatenate the components based on type
motorList_all      = concatenateDataset(motorList_, idMotor)
sitzeList_all      = concatenateDataset(sitzeList_, idSitze)
schaltungList_all  = concatenateDataset(schaltungList_, idSchalt)
karosserieList_all = concatenateDataset(karosserieList_, idKaros)

# concatenate all the components into a single dataset
komponente_all = pd.concat([motorList_all,
                                          sitzeList_all,
                                          schaltungList_all,
                                          karosserieList_all], axis=0, join='inner')

# set the Herstellernummer, Werksnummer, and Fehlerhaft columns as int
int_col = ['Herstellernummer','Werksnummer','Fehlerhaft']
komponente_all[int_col] = komponente_all[int_col].astype('int')

# save the cleaned file to the final dataset
komponente_all.to_csv('Final_dataset_group_33.csv')

The case study requirements are as the following:
1. Relative defects for components supplied by each plants
2. Find out the product groups where defects are dominant
3. Decide if product group inspection or comprehensive inspection is necessary

We start with the first requirement: Relative defects for components supplied by each plants. 

First, we list the plants, calculate the annual volumes, calculate the defective components' annual volumes produced by the plants, and calculate its relative frequency.

In [48]:
# find the number of plants that produce the components and 
# make a list of plants
plantList = komponente_all['Werksnummer'].unique()
plantList

# set the production year
productionYear = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]

In [49]:
plantProd  = []
for year in productionYear:
    for plant in plantList:
        totalVolume  = komponente_all.loc[
                                (komponente_all['Werksnummer'] == plant)
                              & (komponente_all['Produktionsdatum'].dt.year == year)].shape[0]
        defectVolume = komponente_all[
                                (komponente_all['Werksnummer'] == plant)
                              & (komponente_all['Produktionsdatum'].dt.year == year)
                              & (komponente_all['Fehlerhaft'] == 1)].shape[0]
        if (totalVolume != 0) & (defectVolume != 0):
            relativeFreq = defectVolume / totalVolume
            relativeFreqPercentage = relativeFreq * 100
        else: 
            relativeFreq == 0
        plantProd.append([year, plant, totalVolume, defectVolume, relativeFreq, relativeFreqPercentage])

plantList_relFrq = pd.DataFrame(plantProd,
                                columns=['year', 'werksnummer', 'totalVolume', 'defectVolume', 'relativeFreq', 'relativeFreqPercentage'])

In [50]:
# assign the plant name to its respective annual volumentric relative frequency dataset
plant1011 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1011]
plant1041 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1041]
plant1021 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1021]
plant1031 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1031]
plant1091 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1091]
plant1111 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1111]
plant1092 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1092]
plant1101 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1101]
plant1051 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1051]
plant1061 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1061]
plant1082 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1082]
plant1081 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1081]
plant1072 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1072]
plant1121 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1121]
plant1141 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1141]
plant1122 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1122]
plant1131 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1131]
plant1142 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1142]
plant1132 = plantList_relFrq[plantList_relFrq['werksnummer'] == 1132]

In [64]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = plant1011['year'], y = plant1011['relativeFreqPercentage'], 
                         mode='lines+markers', name='1011 Failure Relative Frequency'))
fig.add_trace(go.Scatter(x = plant1041['year'], y = plant1041['relativeFreqPercentage'], 
                         mode='lines+markers', name='1041 Failure Relative Frequency'))
fig.add_trace(go.Scatter(x = plant1021['year'], y = plant1021['relativeFreqPercentage'], 
                         mode='lines+markers', name='1021 Failure Relative Frequency'))
fig.add_trace(go.Scatter(x = plant1031['year'], y = plant1031['relativeFreqPercentage'], 
                         mode='lines+markers', name='1031 Failure Relative Frequency'))
fig.add_trace(go.Scatter(x = plant1091['year'], y = plant1091['relativeFreqPercentage'], 
                         mode='lines+markers', name='1091 Failure Relative Frequency'))
fig.show()

In [69]:
plantList_relFrq[plantList_relFrq['year'] == 2013].loc[:,'relativeFreqPercentage'].sort_values(ascending=False).head()

100    19.845147
95     17.824932
96     17.573741
103    12.741376
97     12.606702
Name: relativeFreqPercentage, dtype: float64