In [1]:
import pandas as pd
import gc

In [5]:
# Load the fines dataset; the path is relative to the kernel's working directory.
df=pd.read_csv('../ex04/fines.csv')

Compute `Fines / Refund * Year` for every row, timing each approach.

In [9]:
%%timeit

def loop(df):
    """Compute Fines / Refund * Year for each row using positional lookups.

    Every row is fetched individually with ``df.iloc`` and the formula is
    evaluated on that single row.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.

    Returns:
        A list with one computed value per row, in row order.
    """
    def _fine_ratio(record):
        # Formula evaluated on a single row.
        return record['Fines'] / record['Refund'] * record['Year']

    return [_fine_ratio(df.iloc[position]) for position in range(len(df))]

# Run the positional row-by-row loop and store its results in the 'Answer' column.
df['Answer'] = loop(df)


20.7 ms ± 917 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
%%timeit

def iter(df):
    """Compute Fines / Refund * Year for each row via ``DataFrame.iterrows``.

    Note: this function's name shadows the built-in ``iter`` in the scope
    where it is defined.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.

    Returns:
        A list with one computed value per row, in row order.
    """
    return [
        record['Fines'] / record['Refund'] * record['Year']
        for _label, record in df.iterrows()
    ]

# Run the iterrows-based version and store its results in the 'Answer' column.
df['Answer'] = iter(df)


17.8 ms ± 202 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit

def method_apply(df):
    """Fill the 'Answer' column with Fines / Refund * Year using ``DataFrame.apply``.

    The formula is evaluated once per row (``axis=1``) and the result is
    written into ``df`` in place.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns; it gains
            (or has overwritten) an 'Answer' column.

    Returns:
        None.
    """
    def _fine_ratio(row):
        # Formula evaluated on a single row.
        return row['Fines'] / row['Refund'] * row['Year']

    df['Answer'] = df.apply(_fine_ratio, axis=1)

# Run the apply-based version; it writes the 'Answer' column into df itself.
method_apply(df)

5.27 ms ± 53.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
%%timeit
# Vectorized arithmetic on whole pandas Series: no Python-level loop over rows.
df['Answer'] = df['Fines'] / df['Refund'] * df['Year']

113 μs ± 1.09 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [32]:
%%timeit
# Same formula on the raw arrays exposed by .values instead of on the Series objects.
df['Answer'] = df['Fines'].values / df['Refund'].values * df['Year'].values

51.5 μs ± 508 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


**Indexing**

In [34]:
%%timeit
# Look up one car with a boolean mask built by comparing the whole 'CarNumber' column.
df.loc[df['CarNumber']=="O136HO197RUS"]

146 μs ± 1.9 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [37]:
df=df.set_index('CarNumber')

In [39]:
%%timeit
df.loc["O136HO197RUS"]

33.2 μs ± 248 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


**Downcasting**

In [10]:
# Baseline memory footprint; memory_usage='deep' also measures the contents of object columns.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    int64  
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int64  
 6   Answer     930 non-null    float64
dtypes: float64(2), int64(2), object(3)
memory usage: 203.9 KB


In [35]:
# Work on an independent deep copy so the dtype changes below never touch `df`.
optimized_df = df.copy(deep=True)

In [36]:
# Store every float64 column as float32 instead.
float64_columns = optimized_df.select_dtypes(include=['float64']).columns
for column in float64_columns:
    optimized_df[column] = optimized_df[column].astype('float32')

In [37]:
# Let pandas pick the smallest integer dtype that can hold each int64 column.
int64_columns = optimized_df.select_dtypes(include=['int64']).columns
for column in int64_columns:
    optimized_df[column] = pd.to_numeric(optimized_df[column], downcast='integer')

In [38]:
# Memory footprint after downcasting the numeric columns.
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    int8   
 2   Fines      930 non-null    float32
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int16  
 6   Answer     930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(3)
memory usage: 184.8 KB


**Categories**

In [39]:
# Re-encode every object column as a pandas categorical.
object_columns = optimized_df.select_dtypes(include=['object']).columns
for column in object_columns:
    optimized_df[column] = optimized_df[column].astype('category')

In [40]:
# Memory footprint after converting the object columns to categoricals.
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    int8    
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
 6   Answer     930 non-null    float32 
dtypes: category(3), float32(2), int16(1), int8(1)
memory usage: 68.2 KB


In [7]:
# %reset_selective treats its argument as a regular expression, so every user
# name containing "df" is removed (optimized_df included); without -f it asks
# for confirmation first.
%reset_selective df
# Run a garbage-collection pass after dropping the frames; the value shown below
# the cell is gc.collect()'s return value.
gc.collect()

7