In [12]:
import pandas as pd
import numpy as np

# Mental model: a DataFrame is a row index plus a list of named columns.
# Written as a plain dict, the frame built below would look roughly like this:
#
# df = {'index': [0, 1, 2],
#       'columns': [{'name': 'name', 'data': ['Egor', 'Arthur', 'Slava']},
#                   {'name': 'age', 'data': [25, 23, 27]}
#                  ]
#      }

# Each keyword becomes one column, in the order given.
df = pd.DataFrame(dict(
    name=['Egor', 'Arthur', 'Slava'],
    age=[25, 23, 27],
))
df

Unnamed: 0,name,age
0,Egor,25
1,Arthur,23
2,Slava,27


In [3]:
# Label-based lookup of the row with index label 1.
# NOTE: a cell renders only its final expression, so this result is computed but not shown.
df.loc[1]
# Select the 'name' column by label; this is the value the cell displays.
df['name']

0      Egor
1    Arthur
2     Slava
Name: name, dtype: object

In [4]:
# Upper-case every value of the 'name' column via the vectorized .str accessor
# (returns a new Series; df itself is not modified).
df['name'].str.upper()

0      EGOR
1    ARTHUR
2     SLAVA
Name: name, dtype: object

In [11]:
# Pin the legacy global NumPy seed so the sampled values are reproducible.
np.random.seed(40)

# 5 rows x 3 columns of standard-normal draws, one labelled column per letter.
np_df = pd.DataFrame(
    data=np.random.randn(5, 3),
    columns=list('abc'),
)
np_df

Unnamed: 0,a,b,c
0,-0.607548,-0.126136,-0.684606
1,0.928715,-1.844401,-0.467002
2,2.29249,0.48881,0.710267
3,1.055534,0.054073,0.257953
4,0.588282,0.885244,-1.017007


In [15]:
# Both axes of the frame as a list: the row index first, then the column labels.
df.axes

[RangeIndex(start=0, stop=3, step=1), Index(['name', 'age'], dtype='object')]

In [21]:
import time

# Compare the built-in column-wise reduction with the generic apply() route.
# time.perf_counter() is used instead of time.time(): it is monotonic and uses the
# highest-resolution clock available, which matters for sub-millisecond intervals.
# This is a single-shot measurement and therefore noisy; use %timeit for stable numbers.
s1 = time.perf_counter()
r1 = df.sum(axis='index')
e1 = time.perf_counter()

s2 = time.perf_counter()
r2 = df.apply(np.sum, axis=0)
e2 = time.perf_counter()

# df.sum(axis=0)
# df.sum(axis=1)
# Elapsed seconds are scaled to milliseconds for the report.
print('r1: {0:.3f}ms\nr2: {1:.3f}ms'.format((e1 - s1)*10**3, (e2 - s2)*10**3))

r1: 0.263ms
r2: 0.375ms


In [29]:
import pandas as pd
import numpy as np
import time

# Short column code -> descriptive name.
# transform_df uses this mapping (with spaces turned into underscores) to rename columns.
col_names = {
    'Seq.': 'Sequence',
    'Bg': 'Background',
    'PL': 'Party leadership',
    'CAb': 'Communication ability',
    'RC': 'Relations with Congress',
    'CAp': 'Court appointments',
    'HE': 'Handling of economy',
    'L': 'Luck',
    'AC': 'Ability to compromise',
    'WR': 'Willing to take risks',
    'EAp': 'Executive appointments',
    'OA': 'Overall ability',
    'Im': 'Imagination',
    'DA': 'Domestic accomplishments',
    'Int': 'Integrity',
    'EAb': 'Executive ability',
    'FPA': 'Foreign policy accomplishments',
    'LA': 'Leadership ability',
    'IQ': 'Intelligence',
    'AM': 'Avoid crucial mistakes',
    'EV': "Experts' view",
    'O': 'Overall',
}


# Load the source table; index_col=0 makes the first CSV column the row index.
# NOTE: this rebinds `df`, which earlier cells used for the toy name/age frame.
df = pd.read_csv('./data/siena2018-pres.csv', index_col=0)
# Baseline footprint in bytes; deep=True also counts the contents of object (string) columns.
ms = df.memory_usage(deep=True).sum()

# Exploration calls that motivated the dtype choices made in transform_df:
# df.head()
# df.columns
# df.dtypes
# df.nunique()     # 'Party' is a candidate for the category dtype
# df.max()         # int64 columns whose values fit 0-255 can be stored as uint8

def transform_df(df: pd.DataFrame, names: dict = None) -> pd.DataFrame:
    """Return a memory-optimised, renamed copy of ``df``.

    Dtype conversion:
      * ``'Party'`` becomes a ``category`` column;
      * ``'Seq.'`` and ``'President'`` keep their dtype;
      * every other column is cast to ``uint8``.

    Columns are then renamed from their short codes to descriptive names,
    with spaces in the descriptive names replaced by underscores.

    Args:
        df: Source frame; it is not modified.
        names: Optional ``{short_code: descriptive name}`` mapping used for the
            rename. When omitted, the module-level ``col_names`` mapping is
            used, which matches the previous behaviour.

    Returns:
        A new DataFrame with the converted dtypes and renamed columns.

    Raises:
        ValueError: If a numeric column that would be cast to ``uint8`` holds
            a value outside 0-255 (or a missing value). A plain ``astype``
            would silently wrap such values around instead of failing.
    """
    # Resolve the mapping at call time so an explicit argument removes the
    # dependency on notebook-global state.
    mapping = col_names if names is None else names

    new_df_types = {}
    # items() yields (label, Series) pairs, so the values are available for the range check.
    for col, values in df.items():
        if col == 'Party':
            new_df_types[col] = 'category'
        elif col not in ('Seq.', 'President'):
            # Guard against silent modulo-256 wrap-around during the cast.
            if pd.api.types.is_numeric_dtype(values) and not values.between(0, 255).all():
                raise ValueError(
                    f'column {col!r} has values outside the uint8 range 0-255'
                )
            new_df_types[col] = 'uint8'

    renamed = {code: label.replace(' ', '_') for code, label in mapping.items()}
    return df.astype(new_df_types).rename(columns=renamed)

# Apply the dtype conversion and rename, then measure the deep footprint again.
t_df = transform_df(df)
t_ms = t_df.memory_usage(deep=True).sum()
# NOTE: the value labelled "delta" is the ratio source / transformed (how many times
# smaller the frame became), not an absolute difference in bytes.
print(f'source df memory usage: {ms}\ntransformed df memory usage: {t_ms}\ndelta: {ms / t_ms:.1f}')


source df memory usage: 16485
transformed df memory usage: 7665
delta: 2.2
