#### Some tips and ideas to make python code run faster

Use built-in functions and libraries, they are tested and optimzied  
min, max, all, map
math.factorial, math.comb, math.fsum




https://medium.com/codex/say-goodbye-to-loops-in-python-and-welcome-vectorization-e4df66615a52   
https://www.kdnuggets.com/2021/06/make-python-code-run-incredibly-fast.html   
https://yanick-andrade.medium.com/enhancing-performance-in-python-a29bd06b7ad0  

In [12]:
# Use built-in functions and libraries, they are tested and optimzied
import string 
def upper_basic(n):
    newList = []
    for w in string.ascii_lowercase*n:
        newList.append(w.upper())        

In [13]:
%timeit upper_basic(1000)

2.94 ms ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
def upper_o2(n):
    newList = map(str.upper, string.ascii_lowercase*n)

In [15]:
%timeit upper_o2(1000)

1.21 µs ± 9.47 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [16]:
# performance difference: 2.95 ms vs 1.21 us

#### prefer comprehension list over loop

In [31]:
def list_loop(n):
    new_list = []
    for i in range(1, n):
        if i % 2 == 0:
            new_list.append(i**2)
            

In [32]:
%timeit list_loop(1001)

202 µs ± 711 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [33]:
def list_comp(n):
    new_list = [i**2 for i in range(1, n) if i%2 == 0]
    

In [34]:
%timeit list_comp(1001)

187 µs ± 1.81 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


#### use prop import

In [43]:
import math 
def import_module(n):
    for i in range(n):
        value = math.sqrt(n)

In [44]:
%timeit import_module(1001)

121 µs ± 3.04 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
del math

In [46]:
from math import sqrt 
def import_module_name(n):
    for i in range(n):
        value = sqrt(n)

In [47]:
%timeit import_module_name(1001)

86.2 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# preformance difference: 121 us vs 86 us

#### string concateation
+= operator creates new string
"".join is different and faster

In [49]:
def string_cat_1(n):
    s = ""
    for i in range(1, 101):
        s += string.ascii_letters
        

In [50]:
def string_cat_2(n):
    s = ""
    for i in range(1, 101):
        s.join(string.ascii_letters)

In [52]:
%timeit string_cat_1(101)


17.4 µs ± 327 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [53]:
%timeit string_cat_2(101)

136 µs ± 935 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


#### Use vectorization  
apply operations to all elements of an array in one go  
"for" loop manipulates one row at a time


In [73]:
def find_sum(n):
    total = 0
    for i in range(n):
        total += i

In [74]:
%timeit find_sum(1_000_000)

67.8 ms ± 472 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [75]:
import numpy as np
def find_sum_vector(n):
    total = 0
    total = np.sum(np.arange(n))

In [76]:
%timeit find_sum_vector(1_000_000)

1.84 ms ± 25 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [77]:
(67.8-1.84)/67.8

0.9728613569321534

#### create dataframe  
create a 5 million row x 4 columns filled with random values


In [94]:
import numpy as np 
import pandas as pd
df = pd.DataFrame(np.random.randint(0, 50, size=(5_000, 4)), columns=('a','b','c','d'))
df.shape

(5000, 4)

In [95]:
df.head()

Unnamed: 0,a,b,c,d
0,39,40,6,29
1,8,23,33,48
2,15,23,19,40
3,49,6,47,11
4,8,3,48,32


In [96]:
def create_new_col(df):
    for idx, row in df.iterrows():
        df.at[idx, 'ratio'] = 100 *  (row['d'] / (1+(row['c'])))

In [97]:
%timeit create_new_col(df)

439 ms ± 2.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [100]:
def create_new_vec(df):
    df['ratio'] = 100 *  (df['d'] / (1+(df['c'])))

In [101]:
%timeit create_new_vec(df)

501 µs ± 3.79 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [102]:
# preformance difference: 439 ms vs 501 ms

In [104]:
(439*1000 - 501) / 439

998.8587699316629

#### Dataframe if - else statement, create new column based on conditions

In [111]:
# use basic if / else statements
def df_if_new(df):
    for idx, row in df.iterrows():
        if row.a == 0:
            df.at[idx, 'e'] = row.d
        elif (row.a <= 25) & (row.a > 0):
            df.at[idx, 'e'] = (row.b) - (row.c)
        else:
            df.at[idx, 'e'] = row.b + row.c

In [110]:
%timeit df_if_new(df)

598 ms ± 5.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [113]:
# use vectorization
def df_if_vec(df):
    df['e'] = df['b'] + df['c']
    df.loc[df['a'] < 25, 'e'] = df['b'] - df['c']
    df.loc[df['a'] == 0, 'e'] = df['d']
    

In [114]:
%timeit df_if_vec(df)

2.02 ms ± 53.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [115]:
(598 - 2.02)/598

0.9966220735785953

#### Deep learning multi-linear regression calculations  
$$ y = m{_1}x{_1} + m{_2}x{_2} +m{_3}x{_3} +m{_4}x{_4} +m{_5}x{_1} + c $$

Use loop for million of rows of calculations is slow  
Vectorization is the  optimal solution

In [130]:
# create random data 
import numpy as np
m = np.random.rand(1,5)
n = np.random.rand(100000,5)
m.shape, n.shape


((1, 5), (100000, 5))

In [172]:
# use loop for calculations
import numpy as np

def loop_reg_sum(col, row):
    m = np.random.rand(1,col)
    n = np.random.rand(row,col)
    result = []
    for i in range(row):
        total = 0 
        for j in range(col):
            total += n[j][j]*m[0][j]
#         print(i, total)
        result.append(total)


In [173]:
%timeit loop_reg_sum(5, 100_000)

396 ms ± 651 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [175]:
# use vectorization 
def vec_reg_sum(col, row):
    m = np.random.rand(1,col)
    n = np.random.rand(row,col)    
    result = np.dot(n, m.T)
    

In [176]:
%timeit vec_reg_sum(5, 100_000)

7.34 ms ± 83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


/Users/user/Documents/repo/side-projects/pandas-basics
