## Let's make this code faster

In [20]:
import pandas as pd

In [21]:
fines = pd.read_csv('../../datasets/fines.csv')

##### loop: write a function that iterates through the dataframe using for i in range(0, len(df)), iloc and append() to a list, assign the result of the function to a new column in the dataframe

In [22]:
%%timeit
def calculate_loop(fines):
    results = []
    for i in range(0, len(fines)):
        row = fines.iloc[0]
       
        try:
            calculation = (row['Fines'] / row['Refund']) * row['Year']
        except ZeroDivisionError:
            calculation = pd.nan
        results.append(calculation)
    return results
fines['Calculated_loop'] = calculate_loop(fines)

2.93 ms ± 30.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


##### Do it using iterrows()

In [23]:
%%timeit
def calculate_iterrows(fines):
    results = []
    for _, row in fines.iterrows():
        results.append(row['Fines'] / row['Refund'] * row['Year'])
    return results

fines['Calculated_iterrows'] = calculate_iterrows(fines)


2.88 ms ± 243 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


##### Do it using apply() and lambda function

In [24]:
%%timeit
fines['Calculated_apply'] = fines.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)
#print(fines)

876 μs ± 19.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


##### Do it using Series objects from the dataframe

In [25]:
%%timeit
fines['Calculated_vectorized'] = fines['Fines'] / fines['Refund'] * fines['Year']
#print(fines)

63 μs ± 2.64 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


##### Do it as in the previous subtask but with the method .values


In [26]:
%%timeit
fines['Calculated_values'] = fines['Fines'].values / fines['Refund'].values * fines['Year'].values
#print(fines)

31.1 μs ± 131 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


#### Indexing: measure the time using the magic command %%timeit in the cell

##### Get a row for a specific CarNumber, for example, ’O136HO197RUS’

In [27]:
%%timeit
car = fines[fines['CarNumber'] == 'O136HO197RUS']

89.4 μs ± 1.89 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


##### Set the index in your dataframe with CarNumber

In [28]:
%%timeit 
fines.set_index('CarNumber')


115 μs ± 3.87 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [29]:
fines = fines.set_index('CarNumber')
#print(fines)

##### Again, get a row for the same CarNumber

In [30]:
%%timeit
car = fines.loc['O136HO197RUS']
#print(car)

16.8 μs ± 321 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


#### Downcasting

##### Run df.info(memory_usage=’deep’), pay attention to the Dtype and the memory usage

In [31]:
fines.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 205 entries, T6329O50RUS to E555EE555RUS
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Make                   205 non-null    object 
 1   Model                  202 non-null    object 
 2   Fines                  205 non-null    float64
 3   Refund                 205 non-null    int64  
 4   Year                   205 non-null    int64  
 5   Calculated_loop        205 non-null    float64
 6   Calculated_iterrows    205 non-null    float64
 7   Calculated_apply       205 non-null    float64
 8   Calculated_vectorized  205 non-null    float64
 9   Calculated_values      205 non-null    float64
dtypes: float64(6), int64(2), object(2)
memory usage: 54.6 KB


##### Make a copy() of your initial dataframe into another dataframe optimized

In [32]:
optimized = fines.copy()

##### Downcast from float64 to float32 for all the columns

In [33]:
print(optimized.dtypes)
for col in optimized.select_dtypes(include='float'):
    optimized[col] = pd.to_numeric(optimized[col], downcast='float')
print(optimized.dtypes) 

Make                      object
Model                     object
Fines                    float64
Refund                     int64
Year                       int64
Calculated_loop          float64
Calculated_iterrows      float64
Calculated_apply         float64
Calculated_vectorized    float64
Calculated_values        float64
dtype: object
Make                      object
Model                     object
Fines                    float32
Refund                     int64
Year                       int64
Calculated_loop          float32
Calculated_iterrows      float64
Calculated_apply         float64
Calculated_vectorized    float64
Calculated_values        float64
dtype: object


##### Downcast from int64 to the smallest numerical dtype possible

In [34]:
print(optimized.dtypes)
optimized[['Refund', 'Year']] = optimized[['Refund', 'Year']].apply(
    pd.to_numeric, downcast='integer'
)

for col in optimized.select_dtypes(include='int'):
    optimized[col] = pd.to_numeric(optimized[col], downcast='int')

print(optimized.dtypes) 

Make                      object
Model                     object
Fines                    float32
Refund                     int64
Year                       int64
Calculated_loop          float32
Calculated_iterrows      float64
Calculated_apply         float64
Calculated_vectorized    float64
Calculated_values        float64
dtype: object
Make                      object
Model                     object
Fines                    float32
Refund                     int16
Year                       int16
Calculated_loop          float32
Calculated_iterrows      float64
Calculated_apply         float64
Calculated_vectorized    float64
Calculated_values        float64
dtype: object


##### Run info(memory_usage='deep') for your new dataframe, pay attention to the Dtype and the memory usage

In [35]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 205 entries, T6329O50RUS to E555EE555RUS
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Make                   205 non-null    object 
 1   Model                  202 non-null    object 
 2   Fines                  205 non-null    float32
 3   Refund                 205 non-null    int16  
 4   Year                   205 non-null    int16  
 5   Calculated_loop        205 non-null    float32
 6   Calculated_iterrows    205 non-null    float64
 7   Calculated_apply       205 non-null    float64
 8   Calculated_vectorized  205 non-null    float64
 9   Calculated_values      205 non-null    float64
dtypes: float32(2), float64(4), int16(2), object(2)
memory usage: 50.6 KB


##### Categories

In [36]:
for col in optimized.select_dtypes(include='object'):
    optimized[col] = optimized[col].astype('category')
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 205 entries, T6329O50RUS to E555EE555RUS
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   Make                   205 non-null    category
 1   Model                  202 non-null    category
 2   Fines                  205 non-null    float32 
 3   Refund                 205 non-null    int16   
 4   Year                   205 non-null    int16   
 5   Calculated_loop        205 non-null    float32 
 6   Calculated_iterrows    205 non-null    float64 
 7   Calculated_apply       205 non-null    float64 
 8   Calculated_vectorized  205 non-null    float64 
 9   Calculated_values      205 non-null    float64 
dtypes: category(2), float32(2), float64(4), int16(2)
memory usage: 30.7 KB


#### Memory clean

In [37]:
import gc 
del fines  
gc.collect()  
try:
    print(fines.head())
except NameError:
    print("'Fines' is already cleared")

'Fines' is already cleared


In [38]:
%reset_selective -f ^(?!optimized$).*
try:
    print(fines.head())
except NameError:
    print("'Fines' is already cleared")

'Fines' is already cleared
