# Building Fast Queries on a CSV

The goal of this project is to create an inventory class and use it to test various programming methods, showing how each one impacts compute time so that lookups can be made as efficient as possible.


In [1]:
# importing the csv module and data and splitting into headers vs rows

import csv
# newline="" is what the csv module documentation asks for: it stops the text
# layer from rewriting line endings that sit inside quoted fields.
with open("laptops.csv", newline="") as file:
    read_file = csv.reader(file)
    data = list(read_file)  # materialise every line, header included
    header = data[0]        # the first line holds the column names
    rows = data[1:]         # everything after it is laptop data

In [2]:
print(header) # prints the column headers

['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']


In [3]:
# printing the first 5 rows

# Preview the first five data rows; the custom line ending leaves two blank
# lines after each row so the long lists are easy to tell apart.
for row in rows[:5]:
    print(row, end="\n\n\n")

['6571244', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 2.3GHz', '8GB', '128GB SSD', 'Intel Iris Plus Graphics 640', 'macOS', '1.37kg', '1339']


['7287764', 'Apple', 'Macbook Air', 'Ultrabook', '13.3', '1440x900', 'Intel Core i5 1.8GHz', '8GB', '128GB Flash Storage', 'Intel HD Graphics 6000', 'macOS', '1.34kg', '898']


['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', '575']


['9722156', 'Apple', 'MacBook Pro', 'Ultrabook', '15.4', 'IPS Panel Retina Display 2880x1800', 'Intel Core i7 2.7GHz', '16GB', '512GB SSD', 'AMD Radeon Pro 455', 'macOS', '1.83kg', '2537']


['8550527', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 3.1GHz', '8GB', '256GB SSD', 'Intel Iris Plus Graphics 650', 'macOS', '1.37kg', '1803']




In [4]:
# create an inventory class to read files

class Inventory:
    """In-memory copy of a laptop inventory CSV file.

    Attributes:
        header: list of column names, taken from the first line of the file.
        rows: list of the remaining lines; each row is a list of strings
            except for its last field (the price), which is an ``int``.
    """

    def __init__(self, csv_filename):
        """Load ``csv_filename`` and convert every row's last field to ``int``.

        Args:
            csv_filename: path of the CSV file to read.

        Raises:
            IndexError: if the file has no lines at all (no header).
            ValueError: if a row's last field is not an integer literal.
        """
        # newline="" follows the csv module documentation: without it the
        # text layer rewrites line endings embedded in quoted fields.
        with open(csv_filename, newline="") as file:
            data = list(csv.reader(file))
        self.header = data[0]
        self.rows = data[1:]

        # Prices arrive as text; store them as integers.
        for row in self.rows:
            row[-1] = int(row[-1])

In [5]:
# Build an Inventory object from the laptops CSV file.
inventory = Inventory("laptops.csv")

In [6]:
# Show the column names, then the number of data rows, one per line.
print(inventory.header, len(inventory.rows), sep="\n")

['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']
1303


In [7]:
# adds get laptop row from id method to inventory class

class Inventory:
    """In-memory copy of a laptop inventory CSV file with lookup by ID.

    Attributes:
        header: list of column names, taken from the first line of the file.
        rows: list of the remaining lines; each row is a list of strings
            except for its last field (the price), which is an ``int``.
    """

    def __init__(self, csv_filename):
        """Load ``csv_filename`` and convert every row's last field to ``int``.

        Args:
            csv_filename: path of the CSV file to read.

        Raises:
            IndexError: if the file has no lines at all (no header).
            ValueError: if a row's last field is not an integer literal.
        """
        # newline="" follows the csv module documentation: without it the
        # text layer rewrites line endings embedded in quoted fields.
        with open(csv_filename, newline="") as file:
            data = list(csv.reader(file))
        self.header = data[0]
        self.rows = data[1:]

        # Prices arrive as text; store them as integers.
        for row in self.rows:
            row[-1] = int(row[-1])

    def get_laptop_from_id(self, laptop_id):
        """Return the first row whose first field equals ``laptop_id``.

        The rows are scanned in file order, so the cost grows with the size
        of the inventory. IDs are stored as strings, so pass a string.

        Returns:
            The matching row (a list), or ``None`` when no row matches.
        """
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None

In [8]:
inventory = Inventory('laptops.csv')

# Look up two IDs with the linear scan and print whatever comes back
# (the method returns None when an ID is not in the inventory).
for laptop_id in ('3362737', '3362736'):
    print(inventory.get_laptop_from_id(laptop_id))

['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575]
None


In [9]:
# adds get laptop faster method to the inventory class

class Inventory:
    """In-memory copy of a laptop inventory CSV file with two ID lookups.

    Attributes:
        header: list of column names, taken from the first line of the file.
        rows: list of the remaining lines; each row is a list of strings
            except for its last field (the price), which is an ``int``.
        id_to_row: dict mapping a row's first field (its ID) to the row.
    """

    def __init__(self, csv_filename):
        """Load ``csv_filename``, convert prices to ``int`` and index by ID.

        Args:
            csv_filename: path of the CSV file to read.

        Raises:
            IndexError: if the file has no lines at all (no header).
            ValueError: if a row's last field is not an integer literal.
        """
        # newline="" follows the csv module documentation: without it the
        # text layer rewrites line endings embedded in quoted fields.
        with open(csv_filename, newline="") as file:
            data = list(csv.reader(file))
        self.header = data[0]
        self.rows = data[1:]

        self.id_to_row = {}
        for row in self.rows:
            row[-1] = int(row[-1])
            # setdefault keeps the FIRST row seen for an ID, which is the row
            # the linear scan returns, so both lookups agree on duplicates.
            self.id_to_row.setdefault(row[0], row)

    def get_laptop_from_id(self, laptop_id):
        """Return the first row whose ID equals ``laptop_id``, else ``None``.

        Scans the rows in file order; kept as the slow baseline for the
        dictionary-based lookup.
        """
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None

    def get_laptop_from_id_fast(self, laptop_id):
        """Return the row for ``laptop_id`` via the ID index, else ``None``.

        Gives the same answer as ``get_laptop_from_id`` with one dictionary
        lookup instead of a scan over every row.
        """
        return self.id_to_row.get(laptop_id)

In [10]:
inventory = Inventory('laptops.csv')

# Repeat the two lookups with the dictionary-backed method and print whatever
# comes back (the method returns None when an ID is not in the inventory).
for laptop_id in ('3362737', '3362736'):
    print(inventory.get_laptop_from_id_fast(laptop_id))

['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575]
None


In [11]:
# tests the time difference in lookups between a dict and looping through each row

import time
import random

# Draw 10,000 random seven-digit IDs as strings, the type the lookups compare
# against. The range covers about nine million values, so most draws are
# expected to miss and force the linear scan through the whole list.
ids = [str(random.randint(1000000, 9999999)) for _ in range(10000)]

inventory = Inventory('laptops.csv')

# time.perf_counter() is the clock intended for measuring short durations.
# time.time() can be coarse enough that the dictionary total comes out as 0.0,
# which would make the ratio computed afterwards divide by zero.
total_time_no_dict = 0

# The loop variable is not called `id`, so the builtin id() stays usable.
for laptop_id in ids:
    start = time.perf_counter()
    inventory.get_laptop_from_id(laptop_id)
    end = time.perf_counter()
    total_time_no_dict += end - start

total_time_dict = 0

for laptop_id in ids:
    start = time.perf_counter()
    inventory.get_laptop_from_id_fast(laptop_id)
    end = time.perf_counter()
    total_time_dict += end - start

In [12]:
# Dictionary total first, then the linear-scan total, then how many times
# longer the linear scan took.
for elapsed in (total_time_dict, total_time_no_dict):
    print(elapsed)
print(total_time_no_dict / total_time_dict)

0.004812717437744141
1.125680923461914
233.89715644506094


Per the analysis above, using a dictionary is roughly 234 times faster than looping through each row when looking up 10,000 ids (the exact ratio varies from run to run).

In [13]:
# adds check promotion dollars method

class Inventory:
    """In-memory copy of a laptop inventory CSV file.

    Attributes:
        header: list of column names, taken from the first line of the file.
        rows: list of the remaining lines; each row is a list of strings
            except for its last field (the price), which is an ``int``.
        id_to_row: dict mapping a row's first field (its ID) to the row.
    """

    def __init__(self, csv_filename):
        """Load ``csv_filename``, convert prices to ``int`` and index by ID.

        Args:
            csv_filename: path of the CSV file to read.

        Raises:
            IndexError: if the file has no lines at all (no header).
            ValueError: if a row's last field is not an integer literal.
        """
        # newline="" follows the csv module documentation: without it the
        # text layer rewrites line endings embedded in quoted fields.
        with open(csv_filename, newline="") as file:
            data = list(csv.reader(file))
        self.header = data[0]
        self.rows = data[1:]

        self.id_to_row = {}
        for row in self.rows:
            row[-1] = int(row[-1])
            # setdefault keeps the FIRST row seen for an ID, which is the row
            # the linear scan returns, so both lookups agree on duplicates.
            self.id_to_row.setdefault(row[0], row)

    def get_laptop_from_id(self, laptop_id):
        """Return the first row whose ID equals ``laptop_id``, else ``None``.

        Scans the rows in file order; kept as the slow baseline for the
        dictionary-based lookup.
        """
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None

    def get_laptop_from_id_fast(self, laptop_id):
        """Return the row for ``laptop_id`` via the ID index, else ``None``."""
        return self.id_to_row.get(laptop_id)

    def check_promotion_dollars(self, dollars):
        """Tell whether ``dollars`` can be spent exactly on one or two laptops.

        Returns ``True`` when some laptop costs exactly ``dollars`` or when two
        prices add up to ``dollars``. A row may be paired with itself, so twice
        a single price also counts. Returns ``False`` otherwise.

        The nested loop is deliberately the straightforward quadratic version.
        """
        for row in self.rows:
            if row[-1] == dollars:
                return True
        for row1 in self.rows:
            for row2 in self.rows:
                # Add the two PRICES. Adding the rows themselves would
                # concatenate two lists, which never equals an integer.
                if row1[-1] + row2[-1] == dollars:
                    return True
        return False

In [14]:
inventory = Inventory("laptops.csv")

# Try two promotion amounts and print the True/False answer for each.
for dollars in (1000, 442):
    print(inventory.check_promotion_dollars(dollars))

True
False


In [15]:
# changing promotion check to use a set

class Inventory:
    """In-memory copy of a laptop inventory CSV file.

    Attributes:
        header: list of column names, taken from the first line of the file.
        rows: list of the remaining lines; each row is a list of strings
            except for its last field (the price), which is an ``int``.
        id_to_row: dict mapping a row's first field (its ID) to the row.
        prices: set of every distinct price found in ``rows``.
    """

    def __init__(self, csv_filename):
        """Load ``csv_filename`` and build the ID index and the price set.

        Args:
            csv_filename: path of the CSV file to read.

        Raises:
            IndexError: if the file has no lines at all (no header).
            ValueError: if a row's last field is not an integer literal.
        """
        # newline="" follows the csv module documentation: without it the
        # text layer rewrites line endings embedded in quoted fields.
        with open(csv_filename, newline="") as file:
            data = list(csv.reader(file))
        self.header = data[0]
        self.rows = data[1:]

        self.id_to_row = {}
        self.prices = set()
        for row in self.rows:
            row[-1] = int(row[-1])
            # setdefault keeps the FIRST row seen for an ID, which is the row
            # the linear scan returns, so both lookups agree on duplicates.
            self.id_to_row.setdefault(row[0], row)
            self.prices.add(row[-1])

    def get_laptop_from_id(self, laptop_id):
        """Return the first row whose ID equals ``laptop_id``, else ``None``.

        Scans the rows in file order; kept as the slow baseline for the
        dictionary-based lookup.
        """
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None

    def get_laptop_from_id_fast(self, laptop_id):
        """Return the row for ``laptop_id`` via the ID index, else ``None``."""
        return self.id_to_row.get(laptop_id)

    def check_promotion_dollars(self, dollars):
        """Tell whether ``dollars`` can be spent exactly on one or two laptops.

        Returns ``True`` when some laptop costs exactly ``dollars`` or when two
        prices add up to ``dollars`` (a row may be paired with itself).
        This is the quadratic baseline for ``check_promotion_dollars_fast``.
        """
        for row in self.rows:
            if row[-1] == dollars:
                return True
        for row1 in self.rows:
            for row2 in self.rows:
                if row1[-1] + row2[-1] == dollars:
                    return True
        return False

    def check_promotion_dollars_fast(self, dollars):
        """Answer the same question as ``check_promotion_dollars`` using the price set.

        For each distinct price, a set membership test checks whether the
        complementary amount ``dollars - price`` is also a known price.
        """
        if dollars in self.prices:
            return True
        return any(dollars - price in self.prices for price in self.prices)

In [16]:
# checking test cases as seen with the double for loop

inventory = Inventory("laptops.csv")

# Same two amounts that were tried with the nested-loop version.
for dollars in (1000, 442):
    print(inventory.check_promotion_dollars_fast(dollars))

True
False


In [17]:
# measuring the compute difference in promotion dollars methods

# One hundred random promotion amounts between 100 and 5000, inclusive.
prices = [random.randint(100, 5000) for _ in range(100)]

inventory = Inventory('laptops.csv')

# time.perf_counter() is the clock intended for measuring short durations.
# time.time() can be coarse enough that the set-based total comes out as 0.0,
# which would make the ratio computed afterwards divide by zero.
total_time_no_set = 0

for price in prices:
    start = time.perf_counter()
    inventory.check_promotion_dollars(price)
    end = time.perf_counter()
    total_time_no_set += end - start

total_time_set = 0

for price in prices:
    start = time.perf_counter()
    inventory.check_promotion_dollars_fast(price)
    end = time.perf_counter()
    total_time_set += end - start

In [18]:
# printing compute results 

# Nested-loop total first, then the set-based total, then how many times
# longer the nested loops took.
for elapsed in (total_time_no_set, total_time_set):
    print(elapsed)
print(total_time_no_set / total_time_set)

1.1598763465881348
0.0005218982696533203
2222.4184559159435


In [61]:
# creating a method to find the first laptop more expensive than a target price

def row_price(row):
    """Sort key: return the last field of ``row``, where the price is stored."""
    price = row[-1]
    return price

class Inventory:
    """In-memory copy of a laptop inventory CSV file.

    Attributes:
        header: list of column names, taken from the first line of the file.
        rows: list of the remaining lines; each row is a list of strings
            except for its last field (the price), which is an ``int``.
        id_to_row: dict mapping a row's first field (its ID) to the row.
        prices: set of every distinct price found in ``rows``.
        rows_by_price: the rows sorted by ascending price.
    """

    def __init__(self, csv_filename):
        """Load ``csv_filename`` and build the index, price set and sorted rows.

        Args:
            csv_filename: path of the CSV file to read.

        Raises:
            IndexError: if the file has no lines at all (no header).
            ValueError: if a row's last field is not an integer literal.
        """
        # newline="" follows the csv module documentation: without it the
        # text layer rewrites line endings embedded in quoted fields.
        with open(csv_filename, newline="") as file:
            data = list(csv.reader(file))
        self.header = data[0]
        self.rows = data[1:]

        self.id_to_row = {}
        self.prices = set()
        for row in self.rows:
            row[-1] = int(row[-1])
            # setdefault keeps the FIRST row seen for an ID, which is the row
            # the linear scan returns, so both lookups agree on duplicates.
            self.id_to_row.setdefault(row[0], row)
            self.prices.add(row[-1])
        # sorted() is stable, so rows with equal prices keep their file order.
        self.rows_by_price = sorted(self.rows, key=row_price)

    def get_laptop_from_id(self, laptop_id):
        """Return the first row whose ID equals ``laptop_id``, else ``None``.

        Scans the rows in file order; kept as the slow baseline for the
        dictionary-based lookup.
        """
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None

    def get_laptop_from_id_fast(self, laptop_id):
        """Return the row for ``laptop_id`` via the ID index, else ``None``."""
        return self.id_to_row.get(laptop_id)

    def check_promotion_dollars(self, dollars):
        """Tell whether ``dollars`` can be spent exactly on one or two laptops.

        Returns ``True`` when some laptop costs exactly ``dollars`` or when two
        prices add up to ``dollars`` (a row may be paired with itself).
        This is the quadratic baseline for ``check_promotion_dollars_fast``.
        """
        for row in self.rows:
            if row[-1] == dollars:
                return True
        for row1 in self.rows:
            for row2 in self.rows:
                if row1[-1] + row2[-1] == dollars:
                    return True
        return False

    def check_promotion_dollars_fast(self, dollars):
        """Answer the same question as ``check_promotion_dollars`` using the price set."""
        if dollars in self.prices:
            return True
        return any(dollars - price in self.prices for price in self.prices)

    def find_first_laptop_more_expensive(self, target_price):
        """Find the cheapest laptop that costs strictly more than ``target_price``.

        Runs a binary search over ``rows_by_price``.

        Returns:
            The index into ``rows_by_price`` of the first row whose price is
            greater than ``target_price``, or ``-1`` when there is no such row
            (including when the inventory has no rows).
        """
        # Without this guard the final check would index an empty list.
        if not self.rows_by_price:
            return -1

        range_start = 0
        range_end = len(self.rows_by_price) - 1

        # Shrink the range until one candidate index is left.
        while range_start < range_end:
            range_middle = (range_end + range_start) // 2
            price = self.rows_by_price[range_middle][-1]
            if price > target_price:
                # The middle row qualifies, so the answer is here or earlier.
                range_end = range_middle
            else:
                # The middle row is too cheap, so the answer lies after it.
                range_start = range_middle + 1

        # The remaining candidate may still be too cheap when no row qualifies.
        if self.rows_by_price[range_start][-1] <= target_price:
            return -1
        return range_start

In [62]:
inventory = Inventory('laptops.csv')

# Print the search result for two target prices; the method returns -1 when
# no laptop costs more than the target.
for target_price in (1000, 10000):
    print(inventory.find_first_laptop_more_expensive(target_price))

683
-1
