In [1]:
import os 
import pandas as pd
import string
from typing import List
from datetime import datetime
import re
import numpy as np
import hashlib
import json
from sklearn.preprocessing import StandardScaler

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
def returnSortedCSVList(csv_directory: str) -> List:
    sortedfilelist = list()
    def sort_key(filename):
        underscoreindex = filename.index('_')
        return filename[underscoreindex+1:-4]
    for dirpath, dirnames, filenames in os.walk(csv_directory):
        dirnames.sort()
        sorted_files = sorted(filenames, key=sort_key)
        sorted_files_with_paths = [os.path.join(dirpath, f) for f in sorted_files]
        if sorted_files_with_paths:
            sortedfilelist += sorted_files_with_paths
    return sortedfilelist
        

In [3]:
def findFileNameTimeStamp(csv_file: str) -> str:
    short_index = csv_file.index('B')
    shortfilename = csv_file[short_index:]
    underscore_index = shortfilename.index('_')
    datestring = shortfilename[underscore_index+1:-4]
    digits = re.findall(r'\d+', datestring)
    finaldatedigits = ''.join(digits)
    return int(finaldatedigits)

In [4]:
def convertRAWcsvtoDF(csv_file: str) -> pd.DataFrame:
    rawcsvDF = pd.read_csv(csv_file)
    rawcsvDF["Closing Date"] = datetime.fromtimestamp(findFileNameTimeStamp(csv_file))
    return rawcsvDF

In [5]:
def keepColumnsforFeatureVector(csvDF: pd.DataFrame) -> pd.DataFrame:
    csvDF["Closing Date"] = pd.to_datetime(csvDF["Closing Date"]).dt.date
    slimmedDF = csvDF.loc[:,["Proposal", "Item", "Quantity", "Vendor Name", "Bidder ID", "Unit Price", "Extension", "Closing Date"]]
    return slimmedDF
   

In [6]:
def cleanDollarColumns(csvDF: pd.DataFrame) -> pd.DataFrame:
    csvDF["Unit Price"] = csvDF["Unit Price"].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)
    csvDF["Quantity"] = csvDF["Quantity"].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)
    csvDF["Extension"] = csvDF["Extension"].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)
    return csvDF

In [7]:
def BidderIDtoNumber(id_string: str)-> int:
    hash_object = hashlib.sha256(id_string.encode())
    hash_hex = hash_object.hexdigest()
    return int(hash_hex[:16], 16)
def convertBidderIDColumntoNumberID(csvDF: pd.DataFrame) -> pd.DataFrame:
    csvDF["Bidder ID"] = csvDF["Bidder ID"].astype(str).apply(BidderIDtoNumber)
    return csvDF
    

In [8]:
def proposalVendorNameGrouping(csvDF: pd.DataFrame):
    return csvDF.groupby('Vendor Name')

In [9]:
def VendorNameBidderIDdict(csvDF: pd.DataFrame, vendorNameBidderIDmapping: dict) -> dict:
    groupedVendors = proposalVendorNameGrouping(csvDF)
    for name, group in groupedVendors:
        vendorNameBidderIDmapping[name] = group["Bidder ID"].iloc[0]
    return vendorNameBidderIDmapping

In [10]:
def getProposalCode(csvDF: pd.DataFrame) -> str:
    return csvDF['Proposal'].iloc[0]

def getClosingDate(csvDF: pd.DataFrame) -> int:
    return round(csvDF["Closing Date"].iloc[0].toordinal())

def getBidderID(csvDF: pd.DataFrame) -> int:
    return csvDF["Bidder ID"].iloc[0]

def getContractorTotalBidAmount(csvDF: pd.DataFrame) -> float:
    return csvDF["Extension"].sum()

def getContractorName(csvDF: pd.DataFrame) -> str:
    return csvDF["Vendor Name"].iloc[0]

def getWonorNot(csvDF: pd.DataFrame) -> int:
    return csvDF["Won"].iloc[0]

def getNumberofContractorsBidding(groupofBidders) -> int:
    return len(groupofBidders)


In [11]:
def combineDuplicatedProposalItems(csvDF: pd.DataFrame) -> pd.DataFrame:

    def process_group(group):
        duplicatedItems = group[group["Item"].duplicated(keep=False)]
        
        if duplicatedItems.empty:
            return group
    
        grouped = duplicatedItems.groupby("Item").agg({
            "Quantity": "sum",
            "Unit Price": lambda x: np.average(x, weights=duplicatedItems.loc[x.index, "Quantity"]), # weighted average
            "Extension": "sum"
        }).reset_index()
        
        mergedRows = duplicatedItems.drop_duplicates(subset="Item").merge(
            grouped, on="Item", suffixes=('', '_new')
        )
        
        mergedRows["Quantity"] = mergedRows["Quantity_new"]
        mergedRows["Unit Price"] = mergedRows["Unit Price_new"]
        mergedRows["Extension"] = mergedRows["Extension_new"]
        mergedRows = mergedRows.drop(columns=["Quantity_new", "Unit Price_new", "Extension_new"])
        
        withoutDuplicatedDF = group[~group["Item"].duplicated(keep='first')].dropna(subset=["Item"])
        
        finalDF = pd.concat([withoutDuplicatedDF, mergedRows]).drop_duplicates(subset="Item", keep="last")
        
        return finalDF
         
    vendorNamegrouping = csvDF.groupby("Vendor Name")
    processed_groups = []
    for name, group in vendorNamegrouping:
        processed_group = process_group(group)
        processed_groups.append(processed_group)
    
    finalDF = pd.concat(processed_groups).reset_index(drop=True)
    vendor_totals = finalDF.groupby("Vendor Name")["Extension"].sum()

    winning_vendor = vendor_totals.idxmin()

    finalDF["Won"] = np.where(finalDF["Vendor Name"] == winning_vendor, 1, 0)
    
    return finalDF
    


In [12]:
def flattenToVector(mergedItemDF: pd.DataFrame) -> np.array:
    numpy2D = mergedItemDF.to_numpy()
    lineitemsflat = numpy2D.flatten()
    return lineitemsflat

In [13]:
def createFeatureVector(proposalcode, contractorname, closingdate, bidderID, wonORlost, 
                        totalBidAmount, numberofBidders, mergedItemDF: pd.DataFrame) -> np.array:
    lineitemsflat = flattenToVector(mergedItemDF)
    proposalcode_arr = np.array([proposalcode])
    closingdate_arr = np.array([closingdate])
    bidderID_arr = np.array([bidderID])
    totalBidAmount_arr = np.array([totalBidAmount])
    wonORlost_arr = np.array([wonORlost])
    contractorname_arr = np.array([contractorname])
    numberofBidders_arr = np.array([numberofBidders])
    proposalfeaturevector = np.concatenate([
        proposalcode_arr,
        contractorname_arr,
        closingdate_arr,
        bidderID_arr,
        wonORlost_arr,
        totalBidAmount_arr,
        numberofBidders_arr,
        lineitemsflat
    ])
    return proposalfeaturevector

In [14]:
def mapItemCodestoGDOTItemsandCreateFeatureVector(csvDF: pd.DataFrame) -> np.array:
    proposalfeaturevectors = list()
    vendorNamegrouping = proposalVendorNameGrouping(csvDF)
    GADOT_ITEMS = pd.read_csv("../PayItemIndex_2021.csv").drop(columns=["UNITS", "LS UNITS", "DESCRIPTION"])
    proposalcode = getProposalCode(csvDF)
    closingdate = getClosingDate(csvDF)
    for name, group in vendorNamegrouping:
        bidderID = getBidderID(group)
        contractorname = getContractorName(group)
        wonORlost = getWonorNot(group)
        numberofContractorsBidding = getNumberofContractorsBidding(vendorNamegrouping)
        if numberofContractorsBidding == 1: # don't want contracts with only one bidder
            continue
        mergedAllItems = pd.merge(GADOT_ITEMS, group, left_on="ITEM NO.", right_on="Item", how="left").drop(
            columns=["Proposal", "Vendor Name", "Bidder ID", "Closing Date", "Item", "ITEM NO.", "Won"]).reset_index(drop=True)
        mergedAllItems = mergedAllItems.fillna(0)
        totalBidAmount = getContractorTotalBidAmount(mergedAllItems)
        mergedAllItems = mergedAllItems.drop(columns=["Extension"])
        singlefeaturevector = createFeatureVector(proposalcode, contractorname, closingdate, bidderID, 
                                                  wonORlost, totalBidAmount, numberofContractorsBidding, mergedAllItems)
        proposalfeaturevectors.append(singlefeaturevector)
        #print(f"Merged Items Length: {len(mergedAllItems)}")
        #print(f"{contractorname} Final Bid on {proposalcode}: {totalBidAmount}")
        #print(f"Single Feature Vector Shape: {singlefeaturevector.shape}")
        #print(mergedAllItems[mergedAllItems["Quantity"] > 0].head(40))
    
    return np.array(proposalfeaturevectors).T

    

In [15]:
sortedfiles = returnSortedCSVList('../ga_csv')
vendorNameIDmapping = dict()
allproposalsFeatureVectors = []
for file in sortedfiles:
    rawBidTab = convertRAWcsvtoDF(file)
    neededcolBidTab = keepColumnsforFeatureVector(rawBidTab)
    cleanedBidTab = cleanDollarColumns(neededcolBidTab)
    bidderIDConvertDF = convertBidderIDColumntoNumberID(cleanedBidTab)
    
    vendorNameIDmapping = VendorNameBidderIDdict(bidderIDConvertDF, vendorNameIDmapping)
    proposalcode = getProposalCode(bidderIDConvertDF)
    closingdate = getClosingDate(bidderIDConvertDF)
    
    
    duplicatesRemovedBidTab = combineDuplicatedProposalItems(bidderIDConvertDF)
    
    singleproposalfeaturevectors = mapItemCodestoGDOTItemsandCreateFeatureVector(duplicatesRemovedBidTab)
   # print(singleproposalfeaturevectors.T.tolist()[0] == singleproposalfeaturevectors[:, 0].tolist())
    allproposalsFeatureVectors.append(singleproposalfeaturevectors.T.tolist())
    
    
    



In [16]:
proposalsfeaturevectorsDF = pd.DataFrame()

In [17]:
data_dict = {}
totalbids = 0
bidindex = 0
for proposal in allproposalsFeatureVectors:
    totalbids +=len(proposal)
    for bid in proposal:
        data_dict[f"Bid {bidindex}"] = bid
        bidindex += 1


In [18]:
proposalsfeaturevectorsDF = pd.DataFrame.from_dict(data_dict)

In [19]:
print(proposalsfeaturevectorsDF.head(10))

                  Bid 0                                 Bid 1  \
0        B1CBA1800717-0                        B1CBA1800717-0   
1  C AND H PAVING, INC.  C. W. MATTHEWS CONTRACTING CO., INC.   
2                736743                                736743   
3   6153598478460343164                  11295075201831234105   
4                     0                                     0   
5    10900942.299999999                             9736175.0   
6                     4                                     4   
7                   0.0                                   0.0   
8                   0.0                                   0.0   
9                   0.0                                   0.0   

                          Bid 2                         Bid 3  \
0                B1CBA1800717-0                B1CBA1800717-0   
1  E. R. SNELL CONTRACTOR, INC.  PITTMAN CONSTRUCTION COMPANY   
2                        736743                        736743   
3          1120462475597

In [20]:
print(proposalsfeaturevectorsDF.shape)

(9897, 4602)


In [21]:
lineitemsDF = proposalsfeaturevectorsDF.iloc[7:]
lineitemsDF = lineitemsDF.astype(float)
positive_mask = (lineitemsDF > 0).any(axis=1)
lineitemsDF[positive_mask].T.describe()


Unnamed: 0,431,432,475,476,485,486,495,496,515,516,...,9375,9376,9377,9378,9379,9380,9389,9390,9405,9406
count,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,...,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0,4602.0
mean,0.001304,45.407049,0.000869,6.420209,0.001086,44.793133,0.003911,1.62967,1.016297,358292.7,...,0.001956,13.762297,0.003259,32.745808,0.000652,76.892712,282.738375,0.002141,0.000435,431.5823
std,0.051053,1965.651417,0.029472,370.668202,0.032948,1578.891809,0.108265,54.574872,0.215554,835022.1,...,0.07658,541.004545,0.127633,1285.226204,0.025527,3028.606909,6428.238909,0.050982,0.020845,20920.01
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,56111.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,154836.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,350000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,115900.0,1.0,25000.0,1.0,75000.0,3.0,3000.0,4.0,25000000.0,...,3.0,22750.0,5.0,55000.0,1.0,128825.0,163150.0,2.0,1.0,1138142.0


In [22]:
def standard_scale_rows_independently(df, start_row, end_row):
    # Create a copy of the dataframe to avoid modifying the original
    df_scaled = df.copy()
    
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Apply scaling to each row in the selected range
    for idx in range(start_row, end_row):
        # Select the current row
        row = df.iloc[idx].astype(float)
        
        # Identify numeric columns in the row
        numeric_cols = row.index[row.apply(lambda x: isinstance(x, (int, float)))]
    
        if len(numeric_cols) > 0:
            # Reshape for scaler and apply
            row_data = row[numeric_cols].values.reshape(1, -1)
            if np.all(row_data == row_data[0]):
                # If all values are 0, keep them 0. Otherwise, set to 1.
                binary_values = np.where(row_data == 0, 0, 1)
                df_scaled.iloc[idx, df_scaled.columns.get_indexer(numeric_cols)] = binary_values.flatten()
            else:
                scaled_values = scaler.fit_transform(row_data)
                df_scaled.iloc[idx, df_scaled.columns.get_indexer(numeric_cols)] = scaled_values.flatten()
    
    return df_scaled




In [23]:
def OneHotCounties() -> pd.DataFrame:
    with open('../proposalsCounties.json', 'r') as file:
        proposals_counties = json.load(file)
    with open('../AllGAcounties.txt', 'r') as file:
        all_counties = [line.strip() for line in file]

    one_hot_encoded = {}
    for proposal, counties in proposals_counties.items():
        counties = counties.split()
        encoded = {county.lower(): 0 for county in all_counties}
        for county in counties:
            if county.lower() in encoded:
                encoded[county.lower()] = 1
        one_hot_encoded[proposal] = encoded
    return pd.DataFrame.from_dict(one_hot_encoded, orient='index').T


def addOneHotCountiesDF(csvDF: pd.DataFrame) -> pd.DataFrame:
    onehotcountencoding = OneHotCounties()
    new_data = {}

    for csvDF_column in csvDF.columns:
        proposal_id = csvDF[csvDF_column].iloc[0]
        if proposal_id in onehotcountencoding:
            new_column = pd.concat([
                csvDF[csvDF_column].iloc[0:7],
                onehotcountencoding[proposal_id],
                csvDF[csvDF_column].iloc[7:]
            ])
            new_data[csvDF_column] = new_column

    return pd.DataFrame(new_data)
            

In [24]:
# Removed Standard scaler for now
featureVectors = addOneHotCountiesDF(proposalsfeaturevectorsDF).reset_index(drop=True)
print(featureVectors.shape)
with open('../AllGAcounties.txt', 'r') as file:
    all_counties = [line.strip() for line in file]
print(len(all_counties))

(10057, 4602)
160


In [25]:
# Train Test Split: Don't Shuffle!, make sure proposals with given id stay in same group
# Research on Kaggle some of the Housing Market models for binary classification model/probability model
# Build that damn model!

In [26]:
vendorNameIDmapping

{'C AND H PAVING, INC.': 6153598478460343164,
 'C. W. MATTHEWS CONTRACTING CO., INC.': 11295075201831234105,
 'E. R. SNELL CONTRACTOR, INC.': 11204624755977916136,
 'PITTMAN CONSTRUCTION COMPANY': 18247970538673719247,
 'BALDWIN PAVING COMPANY, INC.': 11494767873968677632,
 'ALLIED PAVING CONTRACTORS, INC.': 2122819388884808436,
 'STEWART BROS. INC.': 2299920100819364586,
 'OXFORD CONSTRUCTION COMPANY': 231354610578019013,
 'REEVES CONSTRUCTION COMPANY': 6407162824770513453,
 'ROBINSON PAVING COMPANY': 3528655465465674686,
 'K.V.K. CONTRACTING, INC': 12935663050288602446,
 'M & J CONSTRUCTION COMPANY OF PINELLAS COUNTY, INC.': 4180860747474834015,
 'S & D INDUSTRIAL PAINTING, INC.': 12902524223471334422,
 'SEMINOLE EQUIPMENT, INC.': 8789726742082048019,
 'SOUTHEAST BRIDGE FL CORP': 9990605557316844068,
 'SOUTHERN ROAD & BRIDGE, LLC': 3990835782714541177,
 'COMANCHE CONSTRUCTION OF GEORGIA, LLC': 12895560795135555984,
 'GEORGIA BRIDGE AND CONCRETE, LLC': 13160576302212482473,
 'JHC CORP

In [27]:
len(vendorNameIDmapping)

196

In [28]:
def get_one_hot_encoding(value):
    # Create a reverse dictionary for easy lookup
    reverse_dict = {v: k for k, v in vendorNameIDmapping.items()}
    
    # If the value is in the dictionary, get the corresponding contractor name
    contractor = reverse_dict.get(value, 'UNKNOWN')
    
    # Create a dictionary with all contractors set to 0
    one_hot_dict = {f'{cont}': 0 for cont in vendorNameIDmapping.keys()}
    
    # Set the correct contractor to 1
    one_hot_dict[f'{contractor}'] = 1
    
    # Create the DataFrame all at once
    one_hot = pd.Series(one_hot_dict)
    
    return one_hot

In [29]:
get_one_hot_encoding(12362214400396910139)

C AND H PAVING, INC.                                    0
C. W. MATTHEWS CONTRACTING CO., INC.                    0
E. R. SNELL CONTRACTOR, INC.                            0
PITTMAN CONSTRUCTION COMPANY                            0
BALDWIN PAVING COMPANY, INC.                            0
                                                       ..
NGT GROUP LLC                                           0
SUPERIOR TRAFFIC CONTROL, LLC                           0
MCLEROY INC                                             0
E.R. SNELL CONTRACTOR INC/BALDWIN PAVING COMPANY INC    0
EAST COAST GRADING, INC.                                0
Length: 196, dtype: int64

In [30]:
def convertContractorHashtoOneHot(csvDF: pd.DataFrame) -> pd.DataFrame:
    updated_dict = {}
    for column in csvDF.columns:
        print(column)
        onehotencoding = get_one_hot_encoding(csvDF[column].iloc[3])
        
        # Combine the original column data with the one-hot encoding
        new_column = pd.concat([
            csvDF[column].iloc[0:3],
            csvDF[column].iloc[4:7],
            onehotencoding,
            csvDF[column].iloc[7:],
        ])
        
        new_column = new_column.reset_index(drop=True)
        
        updated_dict[column] = new_column

    updatedDF = pd.DataFrame(updated_dict)
    
    return updatedDF
        

In [31]:
featureVectorsFinal = convertContractorHashtoOneHot(featureVectors)

Bid 0
Bid 1
Bid 2
Bid 3
Bid 4
Bid 5
Bid 6
Bid 7
Bid 8
Bid 9
Bid 10
Bid 11
Bid 12
Bid 13
Bid 14
Bid 15
Bid 16
Bid 17
Bid 18
Bid 19
Bid 20
Bid 21
Bid 22
Bid 23
Bid 24
Bid 25
Bid 26
Bid 27
Bid 28
Bid 29
Bid 30
Bid 31
Bid 32
Bid 33
Bid 34
Bid 35
Bid 36
Bid 37
Bid 38
Bid 39
Bid 40
Bid 41
Bid 42
Bid 43
Bid 44
Bid 45
Bid 46
Bid 47
Bid 48
Bid 49
Bid 50
Bid 51
Bid 52
Bid 53
Bid 54
Bid 55
Bid 56
Bid 57
Bid 58
Bid 59
Bid 60
Bid 61
Bid 62
Bid 63
Bid 64
Bid 65
Bid 66
Bid 67
Bid 68
Bid 69
Bid 70
Bid 71
Bid 72
Bid 73
Bid 74
Bid 75
Bid 76
Bid 77
Bid 78
Bid 79
Bid 80
Bid 81
Bid 82
Bid 83
Bid 84
Bid 85
Bid 86
Bid 87
Bid 88
Bid 89
Bid 90
Bid 91
Bid 92
Bid 93
Bid 94
Bid 95
Bid 96
Bid 97
Bid 98
Bid 99
Bid 100
Bid 101
Bid 102
Bid 103
Bid 104
Bid 105
Bid 106
Bid 107
Bid 108
Bid 109
Bid 110
Bid 111
Bid 112
Bid 113
Bid 114
Bid 115
Bid 116
Bid 117
Bid 118
Bid 119
Bid 120
Bid 121
Bid 122
Bid 123
Bid 124
Bid 125
Bid 126
Bid 127
Bid 128
Bid 129
Bid 130
Bid 131
Bid 132
Bid 133
Bid 134
Bid 135
Bid 136
Bid 137
Bid 13

In [32]:
def getlabels(featurevector: pd.DataFrame) -> np.array:
    return np.array(featurevector.iloc[3], dtype=np.float16)

In [33]:
def getExamples(featurevector: pd.DataFrame) -> np.array:
    withoutlabels = featurevector.drop(3) # remove lables
    onlyNumeric = withoutlabels.drop([0, 1]).reset_index(drop=True)
    return np.array(onlyNumeric, dtype=np.float64)

In [34]:
print(getlabels(featureVectorsFinal))

[0. 0. 0. ... 0. 0. 0.]


In [35]:
featureVectors.columns

Index(['Bid 0', 'Bid 1', 'Bid 2', 'Bid 3', 'Bid 4', 'Bid 5', 'Bid 6', 'Bid 7',
       'Bid 8', 'Bid 9',
       ...
       'Bid 4592', 'Bid 4593', 'Bid 4594', 'Bid 4595', 'Bid 4596', 'Bid 4597',
       'Bid 4598', 'Bid 4599', 'Bid 4600', 'Bid 4601'],
      dtype='object', length=4602)

In [36]:
def returnsplitIndexLocation(featurevector: pd.DataFrame, percenttrain: float) -> int:
    location = int(featurevector.shape[1] * percenttrain)
    print(f"Original Location: {location}")
    while featurevector[f"Bid {location}"].iloc[0] == featurevector[f"Bid {location + 1}"].iloc[0]:
        location -= 1
    location+=1
    print(f"Location to maintain Proposal Blocks: {location}")
    print(f"Actual Training Set Percent: {location/featurevector.shape[1]:.3}")
    return location 
print(returnsplitIndexLocation(featureVectorsFinal, .78))

Original Location: 3589
Location to maintain Proposal Blocks: 3589
Actual Training Set Percent: 0.78
3589


In [37]:
for i in range(1680,1700):
    print(f"{i}: {featureVectorsFinal[f'Bid {i}'].iloc[0]}")
    

1680: B1CBA2001487-0
1681: B1CBA2001487-0
1682: B1CBA2001487-0
1683: B3TIA2001476-0
1684: B3TIA2001476-0
1685: B1CBA2001465-0
1686: B1CBA2001465-0
1687: B1CBA2001465-0
1688: B1CBA2001638-0
1689: B1CBA2001638-0
1690: B1CBA2001638-0
1691: B1CBA2001619-0
1692: B1CBA2001619-0
1693: B1CBA2001619-0
1694: B1CBA2001627-0
1695: B1CBA2001627-0
1696: B1CBA2001633-0
1697: B1CBA2001633-0
1698: B1CBA2001612-0
1699: B1CBA2001612-0


In [38]:
def createDataSets(featurevectors: pd.DataFrame, percenttrain: float) -> tuple:
    splitindexlocation = returnsplitIndexLocation(featurevectors, percenttrain)
    y_data = getlabels(featureVectorsFinal)
    X_data = getExamples(featureVectorsFinal)

    y_train = y_data[:splitindexlocation]
    y_test = y_data[splitindexlocation:]

    X_train = X_data[:, :splitindexlocation]
    X_test = X_data[:, splitindexlocation:]

    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    return X_train, y_train, X_test, y_test

In [39]:
X_train, y_train, X_test, y_test = createDataSets(featureVectorsFinal, percenttrain=.9)

Original Location: 4141
Location to maintain Proposal Blocks: 4141
Actual Training Set Percent: 0.9
y_train shape: (4141,)
y_test shape: (461,)
X_train shape: (10250, 4141)
X_test shape: (10250, 461)


In [40]:
np.save("../Data/X_train.npy", X_train, allow_pickle=True)
np.save("../Data/y_train.npy", y_train, allow_pickle=True)
np.save("../Data/X_test.npy", X_test, allow_pickle=True)
np.save("../Data/y_test.npy", y_test, allow_pickle=True)