In [78]:
import pandas as pd
import gc

## read the fines.csv

In [79]:
df = pd.read_csv(
    "../data/fines.csv",
)

## iterations


In [80]:
%%timeit
def loops(df):
    results = []
    for i in range(0, len(df)):
        row = df.iloc[i]
        results.append(int(row['Fines']/row['Refund']*row['Years']))
    return results
df['Sum1'] = loops(df)

28.7 ms ± 2.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [81]:
%%timeit
def iterrow(df):
    results = []
    for index, row in df.iterrows():
        results.append(int(row['Fines']/row['Refund']*row['Years']))
    return results
df['Sum2'] = iterrow(df)

14.4 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [82]:
%%timeit
df['Sum3'] = df.apply(lambda row: int(row['Fines']/row['Refund']*row['Years']), axis = 1)

4.42 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [83]:
%%timeit
fines = df['Fines']
refund = df['Refund']
years = df['Years']
df["Sum4"] = ''
for i in range(len(fines)):
    df['Sum4'][i] = fines[i] / refund[i] * years[i]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


66.4 ms ± 1.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [84]:
%%timeit
fines = df['Fines'].values
refund = df['Refund'].values
years = df['Years'].values
df["Sum4"] = ''
for i in range(len(fines)):
    df['Sum4'][i] = fines[i] / refund[i] * years[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


63 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## indexing

In [85]:
%%timeit
df.loc[df['CarNumber'] == "O136HO197RUS"]

108 µs ± 3.23 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [86]:
df = df.set_index('CarNumber')

In [87]:
%%timeit
df.loc['O136HO197RUS']

44.5 µs ± 640 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## downcasting

In [88]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to POST36US
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  930 non-null    int64  
 1   Fines   930 non-null    float64
 2   Make    930 non-null    object 
 3   Model   930 non-null    object 
 4   Years   930 non-null    int64  
 5   Sum1    930 non-null    int64  
 6   Sum2    930 non-null    int64  
 7   Sum3    930 non-null    int64  
 8   Sum4    930 non-null    object 
dtypes: float64(1), int64(5), object(3)
memory usage: 287.2 KB


In [89]:
df_optimized = df.copy()
fcols = df_optimized.select_dtypes('float').columns
icols = df_optimized.select_dtypes('integer').columns
df_optimized[fcols] = df_optimized[fcols].apply(pd.to_numeric, downcast='float')
df_optimized[icols] = df_optimized[icols].apply(pd.to_numeric, downcast='integer')
df_optimized.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to POST36US
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  930 non-null    int8   
 1   Fines   930 non-null    float32
 2   Make    930 non-null    object 
 3   Model   930 non-null    object 
 4   Years   930 non-null    int16  
 5   Sum1    930 non-null    int32  
 6   Sum2    930 non-null    int32  
 7   Sum3    930 non-null    int32  
 8   Sum4    930 non-null    object 
dtypes: float32(1), int16(1), int32(3), int8(1), object(3)
memory usage: 260.8 KB


## categories

In [90]:
for col in ['Make', 'Model']:
    df_optimized[col] = df_optimized[col].astype('category')
df_optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to POST36US
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Refund  930 non-null    int8    
 1   Fines   930 non-null    float32 
 2   Make    930 non-null    category
 3   Model   930 non-null    category
 4   Years   930 non-null    int16   
 5   Sum1    930 non-null    int32   
 6   Sum2    930 non-null    int32   
 7   Sum3    930 non-null    int32   
 8   Sum4    930 non-null    object  
dtypes: category(2), float32(1), int16(1), int32(3), int8(1), object(1)
memory usage: 152.6 KB


## memory clean

In [91]:
del df
gc.collect()

1754

In [92]:
%reset_selective -f df