In [1]:
import pandas as pd
import numpy as np
import sqlite3
import warnings
import tqdm
from datetime import datetime, timedelta
import math
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

Create script that reads two files (Reliability Data and Cost Data) and renames it to the same file names that are used in the data cleaning

In [15]:
#insert code here

Read in Reliability Data

In [2]:
reliability = pd.read_excel("GT Reliability Data 2019-2022.xlsx", sheet_name = "Reliability Data", header = 2)

Fix the dates

In [3]:
dates_to_fix = ['PO Document Date','Changed On Date','Confirmed Delivery Date','Creation Date of Confirmation','Posting Date','Item Delivery Date','Scheduled-relevant delivery date']
for i in dates_to_fix:
    if i == 'Scheduled-relevant delivery date':
        reliability[i] = pd.to_datetime(reliability[i].astype(str))
    elif i == 'PO Document Date':
        reliability[i] = pd.to_datetime(reliability[i].astype(str), format = '%Y%m%d')
    else:
        reliability[i] = reliability[i].apply(lambda x: str(x))
        reliability[i] = reliability[i].apply(lambda x: datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8])) if (x != 'nan' and x!= '0.0') else np.nan)

Create an empty dataframe of all the columns and the length of the columns

In [4]:
clean_reliability = pd.DataFrame(dict(zip(reliability.columns, [[]*len(reliability.columns)])))

Create a list of tuples that stores all of the unique PO Number/PO Line pairs

In [5]:
nums_lines = reliability[['PO Number','PO Line Number']].sort_values('PO Number')
numbers_lines = set(list(zip(nums_lines['PO Number'].to_list(), nums_lines['PO Line Number'].to_list())))

Now, for each pair of PO Number/PO Line, we will get the most updated information for each respective column in the dataframe and append it to our new dataframe

In [6]:
for item in tqdm.tqdm(numbers_lines): #PO Numbers
    ponum = item[0]
    polin = item[1]
    check = reliability[(reliability['PO Number']==ponum)&(reliability['PO Line Number']==polin)]
    #most recent changed on date
    last_changed = check.sort_values(by='Changed On Date', ascending=False).iloc[0]['Changed On Date']
    #most recent error code
    error_final = check.sort_values(by='Changed On Date', ascending=False).iloc[0]['Supply Chain Error Reason Code']
    #most recent due date
    last_due_date = check.sort_values(by='Scheduled-relevant delivery date', ascending=False).iloc[0]['Scheduled-relevant delivery date']
    #most recent arrival date
    arrival = check.sort_values(by='Posting Date', ascending=False).iloc[0]['Posting Date']
    last = check.groupby(['PO Number','PO Line Number']).first()
    new_row = pd.DataFrame({
        'PO_Number': ponum,
        'PO_Line_Number': polin,
        'Supplier': last['Zsupplier___T'],
        'Vendor_number': last['Vendor Number'],
        'Material': last['Material'],
        'Material_Group': last['Material Group'],
        'PO_Document_Date': last['PO Document Date'],
        'Scheduled_relevant_delivery_date': last_due_date,
        'Posting_Date': arrival,
        'error_code': error_final,
    })
    clean_reliability = pd.concat([clean_reliability,new_row])

100%|████████████████████████████████████████████████████████████████████████████| 67284/67284 [58:49<00:00, 19.06it/s]


In [7]:
clean_reliability.head(200)

Unnamed: 0,Material,Material_Group,PO_Document_Date,PO_Line_Number,PO_Number,Posting_Date,Scheduled_relevant_delivery_date,Supplier,Vendor_number,Zsupplier___T,error_code
"(4509790558, 1)",1071767001,014,2020-07-21,1.0,4.509791e+09,2020-09-22,2020-09-23,ERVINS GROUP LLC,542319.0,,
"(4510067718, 9)",209774019,018,2021-02-17,9.0,4.510068e+09,2021-02-18,2021-02-19,MANSCO A DIVISION OF FASTENAL,529485.0,,
"(4509654970, 1)",1025207002-6053,073,2020-04-06,1.0,4.509655e+09,2020-04-14,2020-04-13,ROYAL TECHNOLOGIES CORPORATION,556264.0,,W
"(4509619787, 37)",1071601001-5U23,073,2020-03-05,37.0,4.509620e+09,2020-03-12,2020-03-12,ROYAL TECHNOLOGIES CORPORATION,556264.0,,
"(4509720903, 1)",1036670001,030,2020-06-03,1.0,4.509721e+09,2020-06-10,2020-06-10,SAMHONGSA CO LTD,557704.0,,
...,...,...,...,...,...,...,...,...,...,...,...
"(4510810767, 2)",1221649001-6249,073,2022-07-11,2.0,4.510811e+09,2022-07-25,2022-07-25,ROYAL TECHNOLOGIES CORPORATION,556264.0,,
"(4509806170, 3)",1027350002,073,2020-07-31,3.0,4.509806e+09,2020-08-11,2020-08-11,ROYAL TECHNOLOGIES CORPORATION,556264.0,,
"(4510603942, 30)",1038017001,073,2022-02-25,30.0,4.510604e+09,2022-03-04,2022-03-04,ROYAL TECHNOLOGIES CORPORATION,556264.0,,
"(4510007630, 1)",1027357001,018,2020-12-29,1.0,4.510008e+09,2021-01-04,2021-01-22,MANSCO A DIVISION OF FASTENAL,529485.0,,


Remove unnecessary columns

In [8]:
reliability_data = clean_reliability[list(new_row.columns)].reset_index()
reliability_data = reliability_data.drop(columns=['index'], axis = 1)

Fix data types of columns to ensure the merge with te Cost Data goes smoothly

In [9]:
cols_to_change = ['PO_Number','PO_Line_Number', 'Vendor_number']
reliability_data[cols_to_change] = reliability_data[cols_to_change].applymap(np.int64)
reliability_data['Vendor_number'] = reliability_data['Vendor_number'].astype(object)

Import Cost Data and rename columns

In [10]:
cost = pd.read_excel("PO Line Item GT Data.xlsx", header = 3, names = ['PO_Number', 'PO_Line_Number', 'Material', 'Material Description', 'Material_Group', 'Material Group Description', 'Vendor_number', 'Supplier', 'Invoice ($)', 'Invoice Qty', 'Purchase Order $', 'Purchase Order Qty'])

Remove "NA/" from Vendor_number and Material columns

In [11]:
cost[['Vendor_number', 'Material']] = cost[['Vendor_number','Material']].applymap(lambda x: x[3:])

Remove similar columns from Cost Data except for PO_Number and PO_Line_Number

In [12]:
cost_data = cost.drop(columns = ['Material', 'Vendor_number', 'Material_Group', 'Supplier'])

Left merge Cost Data to Reliability Data in order to preserve Supplier and Material Group data without cost data

In [13]:
merged_data = pd.merge(reliability_data,cost_data,how='left',left_on = ('PO_Number', 'PO_Line_Number'), right_on = ('PO_Number', 'PO_Line_Number'))

In [14]:
merged_data

Unnamed: 0,PO_Number,PO_Line_Number,Supplier,Vendor_number,Material,Material_Group,PO_Document_Date,Scheduled_relevant_delivery_date,Posting_Date,error_code,Material Description,Material Group Description,Invoice ($),Invoice Qty,Purchase Order $,Purchase Order Qty
0,4509790558,1,ERVINS GROUP LLC,542319,1071767001,014,2020-07-21,2020-09-23,2020-09-22,,"THREAD - MEMBRANE, BACK",Seating Uph'y Matl's,63.99,3.0,63.99,3.0
1,4510067718,9,MANSCO A DIVISION OF FASTENAL,529485,209774019,018,2021-02-17,2021-02-19,2021-02-18,,"SCREW - TPG, FL CSKH, M4 X 1.7",Fasteners,75.00,2500.0,75.00,2500.0
2,4509654970,1,ROYAL TECHNOLOGIES CORPORATION,556264,1025207002-6053,073,2020-04-06,2020-04-13,2020-04-14,W,BUSHING - ARM SEAGULL,Inj.Molded Plastics,46.80,120.0,46.80,120.0
3,4509619787,37,ROYAL TECHNOLOGIES CORPORATION,556264,1071601001-5U23,073,2020-03-05,2020-03-12,2020-03-12,,"LUMBAR ASSEMBLY, BLUE JAY, 5U23",Inj.Molded Plastics,420.00,120.0,420.00,120.0
4,4509720903,1,SAMHONGSA CO LTD,557704,1036670001,030,2020-06-03,2020-06-10,2020-06-10,,"CYLINDER - PNEU, STD",Gas/Hydr'c Clndr's,4058.40,760.0,4058.40,760.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67279,4509625927,10,ROYAL TECHNOLOGIES CORPORATION,556264,1025207002-6053,073,2020-03-09,2020-03-16,2020-03-16,,BUSHING - ARM SEAGULL,Inj.Molded Plastics,93.60,240.0,93.60,240.0
67280,4510342330,4,NEFF ENGINEERING COMPANY,514279,,,2021-08-27,2021-09-10,2021-09-10,,,,,,,
67281,4510121902,2,SPECTRUM INDUSTRIES INC,521999,1025287002-9180,012,2021-04-01,2021-04-20,2021-04-14,,"BRACKET - SEAT, RH",Plating/Sub Finisher,131.04,720.0,131.04,720.0
67282,4509500740,1,DESIGNTEX GROUP INC,501967,V00037845403,057,2019-12-26,2020-01-06,2020-01-02,,,,,,,


Create script that will replace the data existing in streamlit with this new, updated data

In [16]:
#insert code here