In [299]:
import os 
import pandas as pd
import string
from typing import List
from datetime import datetime
import re
import numpy as np
import hashlib
import json
from sklearn.preprocessing import StandardScaler

In [300]:
def returnSortedCSVList(csv_directory: str) -> List:
    sortedfilelist = list()
    def sort_key(filename):
        underscoreindex = filename.index('_')
        return filename[underscoreindex+1:-4]
    for dirpath, dirnames, filenames in os.walk(csv_directory):
        dirnames.sort()
        sorted_files = sorted(filenames, key=sort_key)
        sorted_files_with_paths = [os.path.join(dirpath, f) for f in sorted_files]
        if sorted_files_with_paths:
            sortedfilelist += sorted_files_with_paths
    return sortedfilelist
        

In [301]:
def findFileNameTimeStamp(csv_file: str) -> str:
    short_index = csv_file.index('B')
    shortfilename = csv_file[short_index:]
    underscore_index = shortfilename.index('_')
    return int(shortfilename[underscore_index+1:-4])

In [302]:
def convertRAWcsvtoDF(csv_file: str) -> pd.DataFrame:
    rawcsvDF = pd.read_csv(csv_file)
    rawcsvDF["Closing Date"] = datetime.fromtimestamp(findFileNameTimeStamp(csv_file))
    return rawcsvDF

In [303]:
def keepColumnsforFeatureVector(csvDF: pd.DataFrame) -> pd.DataFrame:
    csvDF["Closing Date"] = pd.to_datetime(csvDF["Closing Date"]).dt.date
    slimmedDF = csvDF.loc[:,["Proposal", "Item", "Quantity", "Vendor Name", "Bidder ID", "Unit Price", "Extension", "Closing Date"]]
    return slimmedDF
   

In [304]:
def cleanDollarColumns(csvDF: pd.DataFrame) -> pd.DataFrame:
    csvDF["Unit Price"] = csvDF["Unit Price"].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)
    csvDF["Quantity"] = csvDF["Quantity"].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)
    csvDF["Extension"] = csvDF["Extension"].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)
    return csvDF

In [305]:
def BidderIDtoNumber(id_string: str)-> int:
    hash_object = hashlib.sha256(id_string.encode())
    hash_hex = hash_object.hexdigest()
    return int(hash_hex[:16], 16)
def convertBidderIDColumntoNumberID(csvDF: pd.DataFrame) -> pd.DataFrame:
    csvDF["Bidder ID"] = csvDF["Bidder ID"].astype(str).apply(BidderIDtoNumber)
    return csvDF
    

In [306]:
def proposalVendorNameGrouping(csvDF: pd.DataFrame):
    return csvDF.groupby('Vendor Name')

In [307]:
def VendorNameBidderIDdict(csvDF: pd.DataFrame, vendorNameBidderIDmapping: dict) -> dict:
    groupedVendors = proposalVendorNameGrouping(csvDF)
    for name, group in groupedVendors:
        vendorNameBidderIDmapping[name] = group["Bidder ID"].iloc[0]
    return vendorNameBidderIDmapping

In [308]:
def getProposalCode(csvDF: pd.DataFrame) -> str:
    return csvDF['Proposal'].iloc[0]

def getClosingDate(csvDF: pd.DataFrame) -> int:
    return round(csvDF["Closing Date"].iloc[0].toordinal())

def getBidderID(csvDF: pd.DataFrame) -> int:
    return csvDF["Bidder ID"].iloc[0]

def getContractorTotalBidAmount(csvDF: pd.DataFrame) -> float:
    return csvDF["Extension"].sum()

def getContractorName(csvDF: pd.DataFrame) -> str:
    return csvDF["Vendor Name"].iloc[0]

def getWonorNot(csvDF: pd.DataFrame) -> int:
    return csvDF["Won"].iloc[0]

def getNumberofContractorsBidding(groupofBidders) -> int:
    return len(groupofBidders)


In [344]:
def combineDuplicatedProposalItems(csvDF: pd.DataFrame) -> pd.DataFrame:

    def process_group(group):
        duplicatedItems = group[group["Item"].duplicated(keep=False)]
        
        if duplicatedItems.empty:
            return group
        print(duplicatedItems)
        grouped = duplicatedItems.groupby("Item").agg({
            "Quantity": "sum",
            "Unit Price": "mean",
            "Extension": "sum"
        }).reset_index()
        
        mergedRows = duplicatedItems.drop_duplicates(subset="Item").merge(
            grouped, on="Item", suffixes=('', '_new')
        )
        
        mergedRows["Quantity"] = mergedRows["Quantity_new"]
        mergedRows["Unit Price"] = mergedRows["Unit Price_new"]
        mergedRows["Extension"] = mergedRows["Extension_new"]
        mergedRows = mergedRows.drop(columns=["Quantity_new", "Unit Price_new", "Extension_new"])
        
        withoutDuplicatedDF = group[~group["Item"].duplicated(keep='first')].dropna(subset=["Item"])
        
        finalDF = pd.concat([withoutDuplicatedDF, mergedRows]).drop_duplicates(subset="Item", keep="last")
        
        return finalDF
         
    vendorNamegrouping = csvDF.groupby("Vendor Name")
    processed_groups = []
    for name, group in vendorNamegrouping:
        processed_group = process_group(group)
        processed_groups.append(processed_group)
    
    finalDF = pd.concat(processed_groups).reset_index(drop=True)

    vendor_totals = finalDF.groupby("Vendor Name")["Extension"].sum()

    winning_vendor = vendor_totals.idxmin()

    finalDF["Won"] = np.where(finalDF["Vendor Name"] == winning_vendor, 1, 0)
    
    return finalDF
    


In [310]:
def flattenToVector(mergedItemDF: pd.DataFrame) -> np.array:
    numpy2D = mergedItemDF.to_numpy()
    lineitemsflat = numpy2D.flatten()
    return lineitemsflat

In [311]:
def createFeatureVector(proposalcode, contractorname, closingdate, bidderID, wonORlost, 
                        totalBidAmount, numberofBidders, mergedItemDF: pd.DataFrame) -> np.array:
    lineitemsflat = flattenToVector(mergedItemDF)
    proposalcode_arr = np.array([proposalcode])
    closingdate_arr = np.array([closingdate])
    bidderID_arr = np.array([bidderID])
    totalBidAmount_arr = np.array([totalBidAmount])
    wonORlost_arr = np.array([wonORlost])
    contractorname_arr = np.array([contractorname])
    numberofBidders_arr = np.array([numberofBidders])
    proposalfeaturevector = np.concatenate([
        proposalcode_arr,
        contractorname_arr,
        closingdate_arr,
        bidderID_arr,
        wonORlost_arr,
        totalBidAmount_arr,
        numberofBidders_arr,
        lineitemsflat
    ])
    return proposalfeaturevector

In [312]:
def mapItemCodestoGDOTItemsandCreateFeatureVector(csvDF: pd.DataFrame) -> np.array:
    proposalfeaturevectors = list()
    vendorNamegrouping = proposalVendorNameGrouping(csvDF)
    GADOT_ITEMS = pd.read_csv("../PayItemIndex_2021.csv").drop(columns=["UNITS", "LS UNITS", "DESCRIPTION"])
    proposalcode = getProposalCode(bidderIDConvertDF)
    closingdate = getClosingDate(bidderIDConvertDF)
    for name, group in vendorNamegrouping:
        bidderID = getBidderID(group)
        contractorname = getContractorName(group)
        wonORlost = getWonorNot(group)
        numberofContractorsBidding = getNumberofContractorsBidding(vendorNamegrouping)
        if numberofContractorsBidding == 1: # don't want contracts with only one bidder
            continue
        mergedAllItems = pd.merge(GADOT_ITEMS, group, left_on="ITEM NO.", right_on="Item", how="left").drop(
            columns=["Proposal", "Vendor Name", "Bidder ID", "Closing Date", "Item", "ITEM NO.", "Won"]).reset_index(drop=True)
        mergedAllItems = mergedAllItems.fillna(0)
        totalBidAmount = getContractorTotalBidAmount(mergedAllItems)
        mergedAllItems = mergedAllItems.drop(columns=["Extension"])
        singlefeaturevector = createFeatureVector(proposalcode, contractorname, closingdate, bidderID, 
                                                  wonORlost, totalBidAmount, numberofContractorsBidding, mergedAllItems)
        break
        proposalfeaturevectors.append(singlefeaturevector)
        #print(f"Merged Items Length: {len(mergedAllItems)}")
        #print(f"{contractorname} Final Bid on {proposalcode}: {totalBidAmount}")
        #print(f"Single Feature Vector Shape: {singlefeaturevector.shape}")
        #print(mergedAllItems[mergedAllItems["Quantity"] > 0].head(40))
    
    return np.array(proposalfeaturevectors).T

    

In [345]:
sortedfiles = returnSortedCSVList('../ga_csv')
vendorNameIDmapping = dict()
allproposalsFeatureVectors = []
for file in sortedfiles:
    rawBidTab = convertRAWcsvtoDF(file)
    neededcolBidTab = keepColumnsforFeatureVector(rawBidTab)
    cleanedBidTab = cleanDollarColumns(neededcolBidTab)
    bidderIDConvertDF = convertBidderIDColumntoNumberID(cleanedBidTab)
    
    vendorNameIDmapping = VendorNameBidderIDdict(bidderIDConvertDF, vendorNameIDmapping)
    proposalcode = getProposalCode(bidderIDConvertDF)
    closingdate = getClosingDate(bidderIDConvertDF)
    
    
    duplicatesRemovedBidTab = combineDuplicatedProposalItems(bidderIDConvertDF)
    
    singleproposalfeaturevectors = mapItemCodestoGDOTItemsandCreateFeatureVector(duplicatesRemovedBidTab)
   # print(singleproposalfeaturevectors.T.tolist()[0] == singleproposalfeaturevectors[:, 0].tolist())
    allproposalsFeatureVectors.append(singleproposalfeaturevectors.T.tolist())
    
    
    



           Proposal      Item  Quantity              Vendor Name  \
209  B1CBA2100830-0  603-7000     160.0  BRASFIELD & GORRIE, LLC   
443  B1CBA2100830-0  603-7000     455.0  BRASFIELD & GORRIE, LLC   

                Bidder ID  Unit Price  Extension Closing Date  
209  17514048918672564955        2.15      344.0   2021-02-21  
443  17514048918672564955        2.16      982.8   2021-02-21  
           Proposal      Item  Quantity                       Vendor Name  \
208  B1CBA2100830-0  603-7000     160.0  GEORGIA BRIDGE AND CONCRETE, LLC   
442  B1CBA2100830-0  603-7000     455.0  GEORGIA BRIDGE AND CONCRETE, LLC   

                Bidder ID  Unit Price  Extension Closing Date  
208  13160576302212482473        10.0     1600.0   2021-02-21  
442  13160576302212482473        10.0     4550.0   2021-02-21  
           Proposal      Item  Quantity          Vendor Name  \
204  B1CBA2100830-0  603-7000     160.0  MCCOY GRADING, INC.   
438  B1CBA2100830-0  603-7000     455.0  MCCOY GRAD

In [314]:
proposalsfeaturevectorsDF = pd.DataFrame()

In [315]:
data_dict = {}
totalbids = 0
bidindex = 0
for proposal in allproposalsFeatureVectors:
    totalbids +=len(proposal)
    for bid in proposal:
        data_dict[f"Bid {bidindex}"] = bid
        bidindex += 1


In [316]:
proposalsfeaturevectorsDF = pd.DataFrame.from_dict(data_dict)

In [317]:
print(proposalsfeaturevectorsDF.head(10))

                     Bid 0                             Bid 1  \
0           B1CBA2100830-0                    B1CBA2100830-0   
1  BRASFIELD & GORRIE, LLC  GEORGIA BRIDGE AND CONCRETE, LLC   
2                   737842                            737842   
3     17514048918672564955              13160576302212482473   
4                        0                                 0   
5       2058705.8800000001                        1782721.05   
6                        6                                 6   
7                      0.0                               0.0   
8                      0.0                               0.0   
9                      0.0                               0.0   

                 Bid 2                           Bid 3  \
0       B1CBA2100830-0                  B1CBA2100830-0   
1  MCCOY GRADING, INC.  PINE VALLEY CONCRETE CO., INC.   
2               737842                          737842   
3  7492328247865489751             7253993142375347671   
4    

In [318]:
print(proposalsfeaturevectorsDF.shape)

(9897, 2172)


In [319]:
lineitemsDF = proposalsfeaturevectorsDF.iloc[7:]
lineitemsDF = lineitemsDF.astype(float)
positive_mask = (lineitemsDF > 0).any(axis=1)
lineitemsDF[positive_mask].T.describe()


Unnamed: 0,431,432,515,516,531,532,539,540,549,550,...,9373,9374,9375,9376,9377,9378,9379,9380,9405,9406
count,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,...,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0,2172.0
mean,0.002762,96.207753,1.012891,417302.0,0.862799,1590.198969,0.007366,5.16942,134.763812,6.745216,...,0.001381,14.31948,0.004144,29.159342,0.006906,69.381312,0.001381,162.919088,0.000921,914.4298
std,0.074295,2860.704527,0.191528,978298.3,2.762277,4026.294531,0.171538,123.524394,774.433377,24.044943,...,0.037148,388.269352,0.111443,787.299213,0.185738,1870.328868,0.037148,4407.399575,0.030338,30447.71
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,70450.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,197841.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,437555.4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,115900.0,4.0,25000000.0,24.0,25143.07,4.0,3665.01,20000.0,300.0,...,1.0,11470.0,3.0,22750.0,5.0,55000.0,1.0,128825.0,1.0,1138142.0


In [320]:
def standard_scale_rows_independently(df, start_row, end_row):
    # Create a copy of the dataframe to avoid modifying the original
    df_scaled = df.copy()
    
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Apply scaling to each row in the selected range
    for idx in range(start_row, end_row):
        # Select the current row
        row = df.iloc[idx].astype(float)
        
        # Identify numeric columns in the row
        numeric_cols = row.index[row.apply(lambda x: isinstance(x, (int, float)))]
    
        if len(numeric_cols) > 0:
            # Reshape for scaler and apply
            row_data = row[numeric_cols].values.reshape(1, -1)
            if np.all(row_data == row_data[0]):
                # If all values are 0, keep them 0. Otherwise, set to 1.
                binary_values = np.where(row_data == 0, 0, 1)
                df_scaled.iloc[idx, df_scaled.columns.get_indexer(numeric_cols)] = binary_values.flatten()
            else:
                scaled_values = scaler.fit_transform(row_data)
                df_scaled.iloc[idx, df_scaled.columns.get_indexer(numeric_cols)] = scaled_values.flatten()
    
    return df_scaled




In [321]:
scaledproposalsfeaturevectorsDF = standard_scale_rows_independently(proposalsfeaturevectorsDF, 7, len(proposalsfeaturevectorsDF))

In [322]:
def OneHotCounties() -> pd.DataFrame:
    with open('../proposalsCounties.json', 'r') as file:
        proposals_counties = json.load(file)
    with open('../AllGAcounties.txt', 'r') as file:
        all_counties = [line.strip() for line in file]

    one_hot_encoded = {}
    for proposal, counties in proposals_counties.items():
        counties = counties.split()
        encoded = {county.lower(): 0 for county in all_counties}
        for county in counties:
            if county.lower() in encoded:
                encoded[county.lower()] = 1
        one_hot_encoded[proposal] = encoded
    return pd.DataFrame.from_dict(one_hot_encoded, orient='index').T


def addOneHotCountiesDF(csvDF: pd.DataFrame) -> pd.DataFrame:
    onehotcountencoding = OneHotCounties()
    new_data = {}

    for csvDF_column in csvDF.columns:
        proposal_id = csvDF[csvDF_column].iloc[0]
        if proposal_id in onehotcountencoding:
            new_column = pd.concat([
                csvDF[csvDF_column].iloc[0:7],
                onehotcountencoding[proposal_id],
                csvDF[csvDF_column].iloc[7:]
            ])
            new_data[csvDF_column] = new_column

    return pd.DataFrame(new_data)
            

In [323]:
featureVectors = addOneHotCountiesDF(scaledproposalsfeaturevectorsDF).reset_index(drop=True)
print(featureVectors.shape)
with open('../AllGAcounties.txt', 'r') as file:
    all_counties = [line.strip() for line in file]
print(len(all_counties))

(10057, 2172)
160


In [324]:
print(featureVectors.head(10))

                     Bid 0                             Bid 1  \
0           B1CBA2100830-0                    B1CBA2100830-0   
1  BRASFIELD & GORRIE, LLC  GEORGIA BRIDGE AND CONCRETE, LLC   
2                   737842                            737842   
3     17514048918672564955              13160576302212482473   
4                        0                                 0   
5       2058705.8800000001                        1782721.05   
6                        6                                 6   
7                        0                                 0   
8                        0                                 0   
9                        0                                 0   

                 Bid 2                           Bid 3  \
0       B1CBA2100830-0                  B1CBA2100830-0   
1  MCCOY GRADING, INC.  PINE VALLEY CONCRETE CO., INC.   
2               737842                          737842   
3  7492328247865489751             7253993142375347671   
4    

In [325]:
# Train Test Split: Don't Shuffle!, make sure proposals with given id stay in same group
# Research on Kaggle some of the Housing Market models for binary classification model/probability model
# Build that damn model!

In [326]:
vendorNameIDmapping

{'BRASFIELD & GORRIE, LLC': 17514048918672564955,
 'GEORGIA BRIDGE AND CONCRETE, LLC': 13160576302212482473,
 'MCCOY GRADING, INC.': 7492328247865489751,
 'PINE VALLEY CONCRETE CO., INC.': 7253993142375347671,
 'TIDWELL CONSTRUCTION COMPANY': 12362214400396910139,
 'WILLIAMS CONTRACTING COMPANY, LLC': 8630780944956241809,
 'PEEK PAVEMENT MARKING, LLC': 12085196802233766010,
 'ROADSAFE TRAFFIC SYSTEMS, INC.': 8144487325791968738,
 'ROADSIDE SPECIALTIES, LLC': 18188031434716553106,
 'TRP CONSTRUCTION GROUP, LLC': 9890812050855712413,
 'E. R. SNELL CONTRACTOR, INC.': 11204624755977916136,
 'SOUTHEASTERN SITE DEVELOPMENT, INC.': 3298001649254912200,
 'SOUTHERN CONCRETE CONSTRUCTION CO., INC.': 3164296553112712866,
 'CMES, INC.': 3920863858128629906,
 'HIGGINS CONSTRUCTION CO. INC': 5798219614709672278,
 'TALLEY CONSTRUCTION COMPANY, INC.': 9375781168584125910,
 'BALDWIN PAVING COMPANY, INC.': 11494767873968677632,
 'C. W. MATTHEWS CONTRACTING CO., INC.': 11295075201831234105,
 'ASTRA GROUP

In [327]:
len(vendorNameIDmapping)

145

In [328]:
def get_one_hot_encoding(value):
    # Create a reverse dictionary for easy lookup
    reverse_dict = {v: k for k, v in vendorNameIDmapping.items()}
    
    # If the value is in the dictionary, get the corresponding contractor name
    contractor = reverse_dict.get(value, 'UNKNOWN')
    
    # Create a dictionary with all contractors set to 0
    one_hot_dict = {f'{cont}': 0 for cont in vendorNameIDmapping.keys()}
    
    # Set the correct contractor to 1
    one_hot_dict[f'{contractor}'] = 1
    
    # Create the DataFrame all at once
    one_hot = pd.Series(one_hot_dict)
    
    return one_hot

In [329]:
get_one_hot_encoding(12362214400396910139)

BRASFIELD & GORRIE, LLC                                 0
GEORGIA BRIDGE AND CONCRETE, LLC                        0
MCCOY GRADING, INC.                                     0
PINE VALLEY CONCRETE CO., INC.                          0
TIDWELL CONSTRUCTION COMPANY                            1
                                                       ..
SUPERIOR TRAFFIC CONTROL, LLC                           0
MCLEROY INC                                             0
STEWART BROS. INC.                                      0
E.R. SNELL CONTRACTOR INC/BALDWIN PAVING COMPANY INC    0
EAST COAST GRADING, INC.                                0
Length: 145, dtype: int64

In [330]:
def convertContractorHashtoOneHot(csvDF: pd.DataFrame) -> pd.DataFrame:
    updated_dict = {}
    for column in csvDF.columns:
        onehotencoding = get_one_hot_encoding(csvDF[column].iloc[3])
        
        # Combine the original column data with the one-hot encoding
        new_column = pd.concat([
            csvDF[column].iloc[0:3],
            csvDF[column].iloc[4:7],
            onehotencoding,
            csvDF[column].iloc[7:],
        ])
        
        new_column = new_column.reset_index(drop=True)
        
        updated_dict[column] = new_column

    updatedDF = pd.DataFrame(updated_dict)
    
    return updatedDF
        

In [331]:
featureVectorsFinal = convertContractorHashtoOneHot(featureVectors)
print(featureVectorsFinal.shape)

(10202, 2172)


In [332]:
print(featureVectorsFinal.head(40))

                      Bid 0                             Bid 1  \
0            B1CBA2100830-0                    B1CBA2100830-0   
1   BRASFIELD & GORRIE, LLC  GEORGIA BRIDGE AND CONCRETE, LLC   
2                    737842                            737842   
3                         0                                 0   
4        2058705.8800000001                        1782721.05   
5                         6                                 6   
6                         0                                 0   
7                         0                                 0   
8                         0                                 0   
9                         0                                 0   
10                        0                                 0   
11                        0                                 0   
12                        0                                 0   
13                        0                                 0   
14                       

In [336]:
def getlabels(featurevector: pd.DataFrame) -> np.array:
    return np.array(featurevector.iloc[3], dtype=np.float16)

In [337]:
def getExamples(featurevector: pd.DataFrame) -> np.array:
    withoutlabels = featurevector.drop(3) # remove lables
    onlyNumeric = withoutlabels.drop([0, 1]).reset_index(drop=True)
    return np.array(onlyNumeric, dtype=np.float64)

In [338]:
print(getlabels(featureVectorsFinal))

[0. 0. 1. ... 0. 0. 0.]


In [339]:
def returnsplitIndexLocation(featurevector: pd.DataFrame, percenttrain: float) -> int:
    location = int(featurevector.shape[1] * percenttrain)
    print(f"Original Location: {location}")
    while featurevector[f"Bid {location}"].iloc[0] == featurevector[f"Bid {location + 1}"].iloc[0]:
        location -= 1
    location+=1
    print(f"Location to maintain Proposal Blocks: {location}")
    print(f"Actual Training Set Percent: {location/featurevector.shape[1]:.3}")
    return location 
print(returnsplitIndexLocation(featureVectorsFinal, .78))

Original Location: 1694
Location to maintain Proposal Blocks: 1691
Actual Training Set Percent: 0.779
1691


In [340]:
for i in range(1680,1700):
    print(f"{i}: {featureVectorsFinal[f'Bid {i}'].iloc[0]}")
    

1680: B1CBA2302493-0
1681: B3CBA2302443-0
1682: B3CBA2302443-0
1683: B3CBA2302459-0
1684: B3CBA2302459-0
1685: B1CBA2302434-0
1686: B1CBA2302434-0
1687: B1CBA2302527-0
1688: B1CBA2302527-0
1689: B1CBA2302527-0
1690: B1CBA2302527-0
1691: B1CBA2302380-0
1692: B1CBA2302380-0
1693: B1CBA2302380-0
1694: B1CBA2302380-0
1695: B1CBA2302380-0
1696: B1CBA2302380-0
1697: B1CBA2302405-0
1698: B1CBA2302405-0
1699: B1CBA2302502-0


In [341]:
def createDataSets(featurevectors: pd.DataFrame, percenttrain: float) -> tuple:
    splitindexlocation = returnsplitIndexLocation(featurevectors, percenttrain)
    y_data = getlabels(featureVectorsFinal)
    X_data = getExamples(featureVectorsFinal)

    y_train = y_data[:splitindexlocation]
    y_test = y_data[splitindexlocation:]

    X_train = X_data[:, :splitindexlocation]
    X_test = X_data[:, splitindexlocation:]

    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    return X_train, y_train, X_test, y_test

In [342]:
X_train, y_train, X_test, y_test = createDataSets(featureVectorsFinal, percenttrain=.9)

Original Location: 1954
Location to maintain Proposal Blocks: 1955
Actual Training Set Percent: 0.9
y_train shape: (1955,)
y_test shape: (217,)
X_train shape: (10199, 1955)
X_test shape: (10199, 217)


In [343]:
np.save("../Data/X_train.npy", X_train, allow_pickle=True)
np.save("../Data/y_train.npy", y_train, allow_pickle=True)
np.save("../Data/X_test.npy", X_test, allow_pickle=True)
np.save("../Data/y_test.npy", y_test, allow_pickle=True)