### [df.assign](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.assign.html)

In [17]:
import pandas as pd
from string import Template
import functools
import operator
import time

In [2]:
value = list(range(0, 26))
material = [*("abcdefghijklmnopqrstuvwxyz")]

df_ori = pd.DataFrame(dict(type = value, material = material))
df_ori.head()

Unnamed: 0,type,material
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


### Create a new column that add _ori to every item of colum_name material

In [3]:
df = df_ori

df = df.assign(material_processed = lambda x: x['material'] + "_ori", material_final = lambda x : x['material_processed'] + "_final")

df.head()

Unnamed: 0,type,material,material_processed,material_final
0,0,a,a_ori,a_ori_final
1,1,b,b_ori,b_ori_final
2,2,c,c_ori,c_ori_final
3,3,d,d_ori,d_ori_final
4,4,e,e_ori,e_ori_final


### Error when direct assignment without lambda

In [4]:
df = df_ori

df = df.assign(material_processed = df['material'] + "_ori", material_final = df['material_processed'] + "_final")

df.head()

KeyError: 'material_processed'

### But this works

In [5]:
df = df_ori

df = df.assign(material_processed = df['material'] + "_ori", material_final = lambda x: x['material_processed'] + "_final")

df.head()

Unnamed: 0,type,material,material_processed,material_final
0,0,a,a_ori,a_ori_final
1,1,b,b_ori,b_ori_final
2,2,c,c_ori,c_ori_final
3,3,d,d_ori,d_ori_final
4,4,e,e_ori,e_ori_final


### which one is faster? 
- use list and assign a new column to existing dataframe
- use df.assign
- suspect with iteration through row will be slower

Answer: Both same

In [47]:
message = Template("Average time consumed: ${time} seconds")

second_short = lambda x: "%.10f" % x

In [48]:

time_consumed = []

for _ in list(range(10000)):

    df_by_assign = df_ori
    
    start_assign = time.time()

    df_by_assign = df_by_assign.assign(material_processed = lambda x: x['material'] + "_ori", material_final = lambda x : x['material_processed'] + "_final")

    end_assign = time.time()
    
    time_per_run = end_assign - start_assign
    
    time_consumed.append(time_per_run)
    

average_time_assign = functools.reduce(operator.add, time_consumed) / len(time_consumed)


print(message.substitute(dict(time = second_short(average_time_assign))))


Average time consumed: 0.0005781980 seconds


In [49]:
time_consumed_list = []

for _ in list(range(10000)):

    df_by_list = df_ori

    start_list = time.time()

    material_processed = list(map(lambda x: x + "_ori", df_by_list.material.to_list()))

    material_final = list(map(lambda x: x + "_final", material_processed))

    df_by_list["material_processed"] = material_processed
    df_by_list["material_final"] = material_final

    end_list = time.time()
    
    time_per_run = end_list - start_list
    
    time_consumed_list.append(time_per_run)
    

average_time_list = functools.reduce(operator.add, time_consumed) / len(time_consumed)


print(message.substitute(dict(time = second_short(average_time_list))))

Average time consumed: 0.0005781980 seconds
