In [201]:
import os 
import pandas as pd
import string
from typing import List
from datetime import datetime
import re
import numpy as np
import hashlib

In [47]:
def returnSortedCSVList(csv_directory: str) -> List:
    """Collect every ``.csv`` file under ``csv_directory``, sorted per directory.

    The tree is walked top-down and sub-directories are visited in sorted
    order. Inside each directory the files are ordered by the text found
    between the first underscore of the file name and its extension; the
    comparison is a plain string comparison, not a numeric one.

    Files that do not end in ``.csv`` (case-insensitive) are skipped, so a
    stray README or checkpoint file no longer aborts the listing. A CSV whose
    name has no underscore gets an empty sort key and therefore sorts first
    within its directory.

    Args:
        csv_directory: Root directory to search.

    Returns:
        A list of paths, each joined with the directory it was found in.
        Empty when no CSV file exists under ``csv_directory``.
    """
    def sort_key(filename: str) -> str:
        # Text after the first underscore, without the extension.
        stem = os.path.splitext(filename)[0]
        return stem.partition('_')[2]

    sortedfilelist = []
    for dirpath, dirnames, filenames in os.walk(csv_directory):
        # Sorting in place makes os.walk descend into sub-directories in order.
        dirnames.sort()
        csv_files = [f for f in filenames if f.lower().endswith('.csv')]
        sortedfilelist.extend(
            os.path.join(dirpath, f) for f in sorted(csv_files, key=sort_key)
        )
    return sortedfilelist
        

In [48]:
def findFileNameTimeStamp(csv_file: str) -> int:
    """Extract the integer that follows the underscore in a bid-tab file name.

    Only the final path component is inspected, so letters or underscores in
    the directory part of ``csv_file`` cannot confuse the parse. Within the
    file name, scanning starts at the first ``'B'`` when there is one, and the
    digits are taken from just after the next underscore up to the extension.

    Args:
        csv_file: Path (or bare name) of a file such as
            ``"B1CBA2100830-0_1612345678.csv"``.

    Returns:
        The text between the underscore and the extension, converted to ``int``.

    Raises:
        ValueError: If the file name has no underscore after the starting
            point, or the extracted text is not an integer.
    """
    filename = os.path.basename(csv_file)
    stem = os.path.splitext(filename)[0]
    # Start at the first 'B' so an underscore earlier in the name is ignored.
    start = max(stem.find('B'), 0)
    shortfilename = stem[start:]
    underscore_index = shortfilename.index('_')
    return int(shortfilename[underscore_index + 1:])

In [49]:
def convertRAWcsvtoDF(csv_file: str) -> pd.DataFrame:
    """Load one raw bid-tab CSV and tag every row with its closing date.

    The closing date is taken from the number embedded in the file name (see
    ``findFileNameTimeStamp``) and converted with ``datetime.fromtimestamp``,
    which interprets it as a POSIX timestamp in the machine's local timezone.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The parsed frame with an added (or overwritten) "Closing Date" column
        holding the same datetime on every row.
    """
    bid_tab = pd.read_csv(csv_file)
    closing_moment = datetime.fromtimestamp(findFileNameTimeStamp(csv_file))
    bid_tab["Closing Date"] = closing_moment
    return bid_tab

In [116]:
def keepColumnsforFeatureVector(csvDF: pd.DataFrame) -> pd.DataFrame:
    """Return only the columns used for feature vectors, with date-only closing dates.

    The input frame is left untouched: the columns are copied first and the
    "Closing Date" conversion is applied to the copy, so re-running the cell
    on the same raw frame gives the same result.

    Args:
        csvDF: Bid-tab frame containing at least the eight columns listed in
            the body.

    Returns:
        A new frame with those eight columns, in that order, where
        "Closing Date" holds ``datetime.date`` objects.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    feature_columns = [
        "Proposal", "Item", "Quantity", "Vendor Name",
        "Bidder ID", "Unit Price", "Extension", "Closing Date",
    ]
    slimmedDF = csvDF.loc[:, feature_columns].copy()
    # Drop the time-of-day part; downstream code calls .toordinal() on the date.
    slimmedDF["Closing Date"] = pd.to_datetime(slimmedDF["Closing Date"]).dt.date
    return slimmedDF
   

In [199]:
def cleanDollarColumns(csvDF: pd.DataFrame) -> pd.DataFrame:
    """Convert the "Unit Price", "Quantity" and "Extension" columns to floats.

    Each value is rendered as text, every character other than a digit or
    ``.`` is removed (currency symbols, thousands separators, spaces), and the
    remainder is parsed as a float. Cells that end up empty after stripping,
    such as missing values and blank strings, become NaN instead of raising.

    NOTE: a minus sign is also stripped by the pattern, so negative amounts
    come out positive.

    Args:
        csvDF: Frame holding the three columns. It is modified in place.

    Returns:
        The same ``csvDF`` object, with the three columns as float.

    Raises:
        ValueError: If the stripped text is non-empty but still not a valid
            number (for example ``"1.2.3"``).
    """
    for column in ("Unit Price", "Quantity", "Extension"):
        stripped = csvDF[column].astype(str).str.replace(r'[^\d.]', '', regex=True)
        # An empty string cannot be parsed as a float; treat it as missing.
        csvDF[column] = stripped.where(stripped != '').astype(float)
    return csvDF

In [204]:
def BidderIDtoNumber(id_string: str)-> int:
    """Map a bidder ID string to a deterministic non-negative integer.

    The string is encoded (default UTF-8) and hashed with SHA-256; the first
    eight bytes of the digest, read big-endian, form the result. That is the
    same number as the first 16 hex digits of the digest, so values lie in
    ``[0, 2**64)``.

    Args:
        id_string: Bidder identifier as text.

    Returns:
        The integer derived from the hash prefix.
    """
    digest_prefix = hashlib.sha256(id_string.encode()).digest()[:8]
    return int.from_bytes(digest_prefix, "big")
def convertBidderIDColumntoNumberID(csvDF: pd.DataFrame) -> pd.DataFrame:
    """Replace the "Bidder ID" column with hash-derived integers.

    Every value is first converted to text and then passed through
    ``BidderIDtoNumber``.

    Args:
        csvDF: Frame with a "Bidder ID" column. It is modified in place.

    Returns:
        The same ``csvDF`` object.
    """
    ids_as_text = csvDF["Bidder ID"].astype(str)
    csvDF["Bidder ID"] = ids_as_text.apply(BidderIDtoNumber)
    return csvDF
    

In [61]:
def proposalVendorNameGrouping(csvDF: pd.DataFrame):
    """Group the rows of a bid-tab frame by their "Vendor Name" value.

    Args:
        csvDF: Frame with a "Vendor Name" column.

    Returns:
        The pandas ``DataFrameGroupBy`` object for that column.
    """
    by_vendor = csvDF.groupby(by='Vendor Name')
    return by_vendor

In [65]:
def VendorNameBidderIDdict(csvDF: pd.DataFrame, vendorNameBidderIDmapping: dict) -> dict:
    """Record each vendor's bidder ID in a running name-to-ID dictionary.

    For every vendor group in ``csvDF`` the "Bidder ID" of the group's first
    row is stored under the vendor name, overwriting any earlier entry.

    Args:
        csvDF: Frame with "Vendor Name" and "Bidder ID" columns.
        vendorNameBidderIDmapping: Dictionary to update in place.

    Returns:
        The same dictionary object that was passed in.
    """
    first_id_per_vendor = {
        vendor: rows["Bidder ID"].iloc[0]
        for vendor, rows in proposalVendorNameGrouping(csvDF)
    }
    vendorNameBidderIDmapping.update(first_id_per_vendor)
    return vendorNameBidderIDmapping

In [237]:
def getProposalCode(csvDF: pd.DataFrame) -> str:
    """Return the "Proposal" value of the first row of ``csvDF``."""
    proposal_column = csvDF['Proposal']
    return proposal_column.iloc[0]

def getClosingDate(csvDF: pd.DataFrame) -> int:
    """Return the first row's "Closing Date" as a proleptic Gregorian ordinal.

    The stored value must expose ``toordinal()`` (for example a
    ``datetime.date``); the ordinal is rounded before being returned.
    """
    first_closing_date = csvDF["Closing Date"].iloc[0]
    ordinal = first_closing_date.toordinal()
    return round(ordinal)

def getBidderID(csvDF: pd.DataFrame) -> int:
    """Return the "Bidder ID" value of the first row of ``csvDF``."""
    bidder_ids = csvDF["Bidder ID"]
    return bidder_ids.iloc[0]

def getContractorTotalBidAmount(csvDF: pd.DataFrame) -> float:
    """Return the sum of the "Extension" column over all rows of ``csvDF``."""
    extensions = csvDF["Extension"]
    return extensions.sum()

def getContractorName(csvDF: pd.DataFrame) -> str:
    """Return the "Vendor Name" value of the first row of ``csvDF``."""
    vendor_names = csvDF["Vendor Name"]
    return vendor_names.iloc[0]

def getWonorNot(csvDF: pd.DataFrame) -> int:
    """Return the "Won" flag stored on the first row of ``csvDF``."""
    won_flags = csvDF["Won"]
    return won_flags.iloc[0]


In [253]:
def combineDuplicatedProposalItems(csvDF: pd.DataFrame) -> pd.DataFrame:
    """Collapse repeated items per vendor and flag the lowest-total vendor.

    Rows are processed one "Vendor Name" group at a time. When a vendor lists
    the same "Item" more than once, those rows are merged into one row:
    "Quantity" and "Extension" are summed, "Unit Price" is averaged, and all
    other columns come from the first occurrence. The merged rows are placed
    after that vendor's non-repeated rows. A vendor without repeated items is
    passed through unchanged; for a vendor with repeated items, rows whose
    "Item" is missing are dropped.

    After recombining the groups, the vendor with the smallest "Extension"
    total is marked with ``Won == 1`` on all of its rows; everyone else gets 0.

    Args:
        csvDF: Frame with "Vendor Name", "Item", "Quantity", "Unit Price" and
            "Extension" columns (the numeric ones already cleaned).

    Returns:
        A new frame with a fresh 0..n-1 index and an added "Won" column.
    """
    value_columns = ("Quantity", "Unit Price", "Extension")

    def process_group(group):
        # Every row whose Item appears more than once for this vendor.
        repeated_rows = group[group["Item"].duplicated(keep=False)]
        if repeated_rows.empty:
            return group

        item_totals = repeated_rows.groupby("Item").agg({
            "Quantity": "sum",
            "Unit Price": "mean",
            "Extension": "sum"
        }).reset_index()

        # One representative row per repeated item, paired with its aggregates.
        collapsed = repeated_rows.drop_duplicates(subset="Item").merge(
            item_totals, on="Item", suffixes=('', '_new')
        )
        for column in value_columns:
            collapsed[column] = collapsed[f"{column}_new"]
        collapsed = collapsed.drop(columns=[f"{column}_new" for column in value_columns])

        first_occurrences = group[~group["Item"].duplicated(keep='first')].dropna(subset=["Item"])

        # keep="last" lets the aggregated row win over the raw first occurrence.
        return pd.concat([first_occurrences, collapsed]).drop_duplicates(subset="Item", keep="last")

    per_vendor_frames = [
        process_group(vendor_rows) for _, vendor_rows in csvDF.groupby("Vendor Name")
    ]
    finalDF = pd.concat(per_vendor_frames).reset_index(drop=True)

    # The lowest total bid wins.
    totals_by_vendor = finalDF.groupby("Vendor Name")["Extension"].sum()
    lowest_bidder = totals_by_vendor.idxmin()
    finalDF["Won"] = np.where(finalDF["Vendor Name"] == lowest_bidder, 1, 0)

    return finalDF
    


In [254]:
def flattenToVector(mergedItemDF: pd.DataFrame) -> np.array:
    """Flatten a frame's values into a 1-D array, row by row.

    Args:
        mergedItemDF: Frame whose cell values should be laid out end to end.

    Returns:
        A new 1-D NumPy array containing row 0's values, then row 1's, and so on.
    """
    return mergedItemDF.to_numpy().flatten()

In [263]:
def createFeatureVector(proposalcode, contractorname, closingdate, bidderID, wonORlost, totalBidAmount, mergedItemDF: pd.DataFrame) -> np.array:
    """Build one bid's feature vector: six header values, then the flattened items.

    The header order is proposal code, contractor name, closing date, bidder
    ID, won flag, total bid amount. Each header value is wrapped in its own
    one-element array and everything is joined with ``np.concatenate``, so the
    result's dtype is whatever NumPy promotes the mixed inputs to.

    Args:
        proposalcode: Proposal identifier.
        contractorname: Vendor name.
        closingdate: Closing date value.
        bidderID: Numeric bidder ID.
        wonORlost: Won flag.
        totalBidAmount: Total bid amount.
        mergedItemDF: Per-item frame that ``flattenToVector`` turns into the
            tail of the vector.

    Returns:
        A 1-D array of length ``6 + mergedItemDF.size``.
    """
    header_values = (
        proposalcode,
        contractorname,
        closingdate,
        bidderID,
        wonORlost,
        totalBidAmount,
    )
    pieces = [np.array([value]) for value in header_values]
    pieces.append(flattenToVector(mergedItemDF))
    return np.concatenate(pieces)

In [264]:
def mapItemCodestoGDOTItemsandCreateFeatureVector(csvDF: pd.DataFrame, payItemIndexPath: str = "../PayItemIndex_2021.csv") -> np.ndarray:
    """Build one feature vector per vendor, aligned to the pay-item index.

    For every vendor in ``csvDF`` the vendor's rows are left-joined onto the
    pay-item index ("ITEM NO." against "Item"), so each vendor's item block
    follows the index's row order. Index items the vendor did not bid on are
    filled with 0. The total bid amount is the sum of "Extension" over the
    joined rows, i.e. only items that appear in the index contribute to it.

    The proposal code and closing date are read from ``csvDF`` itself, not
    from a notebook-level variable, so the function depends only on its
    arguments and the index file.

    Args:
        csvDF: One proposal's bid tab with "Proposal", "Item", "Quantity",
            "Vendor Name", "Bidder ID", "Unit Price", "Extension",
            "Closing Date" and "Won" columns.
        payItemIndexPath: CSV holding the pay-item index. It must provide the
            "ITEM NO.", "UNITS", "LS UNITS" and "DESCRIPTION" columns.

    Returns:
        A 2-D array with one column per vendor (the stacked per-vendor vectors
        are transposed before being returned).
    """
    GADOT_ITEMS = pd.read_csv(payItemIndexPath).drop(columns=["UNITS", "LS UNITS", "DESCRIPTION"])
    proposalcode = getProposalCode(csvDF)
    closingdate = getClosingDate(csvDF)

    proposalfeaturevectors = []
    for _, group in proposalVendorNameGrouping(csvDF):
        bidderID = getBidderID(group)
        contractorname = getContractorName(group)
        wonORlost = getWonorNot(group)
        # Left join keeps every index item, so all vendors share one layout.
        mergedAllItems = pd.merge(GADOT_ITEMS, group, left_on="ITEM NO.", right_on="Item", how="left").drop(
            columns=["Proposal", "Vendor Name", "Bidder ID", "Closing Date", "Item", "ITEM NO.", "Won"]).reset_index(drop=True)
        mergedAllItems = mergedAllItems.fillna(0)
        # The total must be taken before "Extension" is dropped from the features.
        totalBidAmount = getContractorTotalBidAmount(mergedAllItems)
        mergedAllItems = mergedAllItems.drop(columns=["Extension"])
        singlefeaturevector = createFeatureVector(proposalcode, contractorname, closingdate, bidderID, wonORlost, totalBidAmount, mergedAllItems)
        proposalfeaturevectors.append(singlefeaturevector)

    return np.array(proposalfeaturevectors).T

    

In [265]:
# Driver: turn every bid-tab CSV under ../ga_csv into per-vendor feature vectors.
sortedfiles = returnSortedCSVList('../ga_csv')
# Running vendor-name -> bidder-ID lookup, extended with each file.
vendorNameIDmapping = dict()
# One entry per file; each entry is a list of per-vendor feature vectors.
allproposalsFeatureVectors = []
for file in sortedfiles:
    # Load -> select columns -> parse numeric columns -> hash bidder IDs.
    rawBidTab = convertRAWcsvtoDF(file)
    neededcolBidTab = keepColumnsforFeatureVector(rawBidTab)
    cleanedBidTab = cleanDollarColumns(neededcolBidTab)
    bidderIDConvertDF = convertBidderIDColumntoNumberID(cleanedBidTab)
    
    vendorNameIDmapping = VendorNameBidderIDdict(bidderIDConvertDF, vendorNameIDmapping)
    # These two values are not used again inside this cell.
    proposalcode = getProposalCode(bidderIDConvertDF)
    closingdate = getClosingDate(bidderIDConvertDF)
    
    
    # Merge repeated items per vendor and add the "Won" column.
    duplicatesRemovedBidTab = combineDuplicatedProposalItems(bidderIDConvertDF)
    
    singleproposalfeaturevectors = mapItemCodestoGDOTItemsandCreateFeatureVector(duplicatesRemovedBidTab)
   # print(singleproposalfeaturevectors.T.tolist()[0] == singleproposalfeaturevectors[:, 0].tolist())
    # The function returns vendors as columns; .T gives one list per vendor again.
    allproposalsFeatureVectors.append(singleproposalfeaturevectors.T.tolist())
    
    



In [266]:
# Empty placeholder; the name is rebound to the real table by the later from_dict cell.
proposalsfeaturevectorsDF = pd.DataFrame()

In [267]:
# Flatten the per-file lists into one dict keyed "Bid 0", "Bid 1", ...
# Each value is a single vendor's feature vector.
data_dict = {}
totalbids = 0  # running count of bids seen; ends equal to bidindex
bidindex = 0   # global bid counter used to build the dict keys
for proposal in allproposalsFeatureVectors:
    totalbids +=len(proposal)
    for bid in proposal:
        data_dict[f"Bid {bidindex}"] = bid
        bidindex += 1


In [268]:
# Default orientation: every "Bid N" key becomes a column, so rows are feature positions.
proposalsfeaturevectorsDF = pd.DataFrame.from_dict(data_dict)

In [269]:
# Preview the first ten feature positions (rows) across the bids (columns).
print(proposalsfeaturevectorsDF.head(10))

                     Bid 0                             Bid 1  \
0           B1CBA2100830-0                    B1CBA2100830-0   
1  BRASFIELD & GORRIE, LLC  GEORGIA BRIDGE AND CONCRETE, LLC   
2                   737842                            737842   
3     17514048918672564955              13160576302212482473   
4                        0                                 0   
5       2058705.8800000001                        1782721.05   
6                      0.0                               0.0   
7                      0.0                               0.0   
8                      0.0                               0.0   
9                      0.0                               0.0   

                 Bid 2                           Bid 3  \
0       B1CBA2100830-0                  B1CBA2100830-0   
1  MCCOY GRADING, INC.  PINE VALLEY CONCRETE CO., INC.   
2               737842                          737842   
3  7492328247865489751             7253993142375347671   
4    

In [270]:
# Shape is (feature-vector length, number of bids).
print(proposalsfeaturevectorsDF.shape)

(9896, 2219)


In [None]:
# TODO: Standard Scale across Line Item Roads
# Train/test split: don't shuffle! Make sure proposals with a given ID stay in the same group
# Research on Kaggle some of the Housing Market models for binary classification model/probability model
# Build that damn model!