# Data:
1. Quantitative (= numeric variable):
   - discrete
   - continuous
2. Qualitative (= categorical variable):
   - nominal
   - ordinal
  
https://www.abs.gov.au/statistics/understanding-statistics/statistical-terms-and-concepts/quantitative-and-qualitative-data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Draw 500,000 integers uniformly from 1..99 (upper bound 100 is exclusive).
# A fixed seed makes every downstream output reproducible on Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)
data = rng.integers(1, 100, 500000)

In [3]:
np.min(data)

1

In [4]:
np.max(data)

99

In [5]:
np.size(data)

500000

In [6]:
np.shape(data)

(500000,)

In [7]:
# Arrange the flat sample into rows of 5 values; -1 lets NumPy derive the row
# count from the array size instead of hard-coding 100000.
data_reshaped = data.reshape(-1, 5)

In [8]:
data_reshaped

array([[34, 16, 71, 89, 37],
       [66, 74, 17, 12, 46],
       [ 5, 11, 99, 27, 97],
       ...,
       [48, 67, 42, 99, 47],
       [45, 69, 72, 71, 41],
       [20, 66, 53, 28, 39]])

In [9]:
# axis=0: running total down each column (each row adds to the rows above it).
data_reshaped.cumsum(axis = 0)

array([[     34,      16,      71,      89,      37],
       [    100,      90,      88,     101,      83],
       [    105,     101,     187,     128,     180],
       ...,
       [4984391, 5018525, 5004724, 4992048, 4992485],
       [4984436, 5018594, 5004796, 4992119, 4992526],
       [4984456, 5018660, 5004849, 4992147, 4992565]])

In [10]:
# axis=1: running total across each row (left to right over the 5 columns).
data_reshaped.cumsum(axis = 1)

array([[ 34,  50, 121, 210, 247],
       [ 66, 140, 157, 169, 215],
       [  5,  16, 115, 142, 239],
       ...,
       [ 48, 115, 157, 256, 303],
       [ 45, 114, 186, 257, 298],
       [ 20,  86, 139, 167, 206]])

In [11]:
# axis=0: one total per column (equals the last row of cumsum(axis=0)).
data_reshaped.sum(axis = 0)

array([4984456, 5018660, 5004849, 4992147, 4992565])

In [12]:
# axis=1: one total per row (equals the last column of cumsum(axis=1)).
data_reshaped.sum(axis = 1)

array([247, 215, 239, ..., 303, 298, 206])

In [13]:
# Wrap the 2-D array in a DataFrame; rows and columns get default integer labels.
df = pd.DataFrame(data_reshaped)

In [14]:
df

Unnamed: 0,0,1,2,3,4
0,34,16,71,89,37
1,66,74,17,12,46
2,5,11,99,27,97
3,71,84,67,41,80
4,71,30,82,90,22
...,...,...,...,...,...
99995,7,33,18,39,1
99996,20,79,8,87,81
99997,48,67,42,99,47
99998,45,69,72,71,41


In [15]:
df.cumsum(axis = 0)

Unnamed: 0,0,1,2,3,4
0,34,16,71,89,37
1,100,90,88,101,83
2,105,101,187,128,180
3,176,185,254,169,260
4,247,215,336,259,282
...,...,...,...,...,...
99995,4984323,5018379,5004674,4991862,4992357
99996,4984343,5018458,5004682,4991949,4992438
99997,4984391,5018525,5004724,4992048,4992485
99998,4984436,5018594,5004796,4992119,4992526


In [16]:
df.cumsum(axis = 1)

Unnamed: 0,0,1,2,3,4
0,34,50,121,210,247
1,66,140,157,169,215
2,5,16,115,142,239
3,71,155,222,263,343
4,71,101,183,273,295
...,...,...,...,...,...
99995,7,40,58,97,98
99996,20,99,107,194,275
99997,48,115,157,256,303
99998,45,114,186,257,298


In [17]:
df.sum(axis = 0)

0    4984456
1    5018660
2    5004849
3    4992147
4    4992565
dtype: int64

In [18]:
df.sum(axis = 1)

0        247
1        215
2        239
3        343
4        295
        ... 
99995     98
99996    275
99997    303
99998    298
99999    206
Length: 100000, dtype: int64

In [19]:
# Row-wise total of the value columns. Any existing 'Total' column is excluded
# first, so re-running this cell does not fold the old total into the new one.
df['Total'] = df.drop(columns='Total', errors='ignore').sum(axis = 1)

In [20]:
df

Unnamed: 0,0,1,2,3,4,Total
0,34,16,71,89,37,247
1,66,74,17,12,46,215
2,5,11,99,27,97,239
3,71,84,67,41,80,343
4,71,30,82,90,22,295
...,...,...,...,...,...,...
99995,7,33,18,39,1,98
99996,20,79,8,87,81,275
99997,48,67,42,99,47,303
99998,45,69,72,71,41,298


In [21]:
pd.Series(df.sum(),name='Total')

0         4984456
1         5018660
2         5004849
3         4992147
4         4992565
Total    24992677
Name: Total, dtype: int64

In [22]:
pd.concat([df,pd.Series(df.sum(),name='Total').to_frame().T],ignore_index=True)

Unnamed: 0,0,1,2,3,4,Total
0,34,16,71,89,37,247
1,66,74,17,12,46,215
2,5,11,99,27,97,239
3,71,84,67,41,80,343
4,71,30,82,90,22,295
...,...,...,...,...,...,...
99996,20,79,8,87,81,275
99997,48,67,42,99,47,303
99998,45,69,72,71,41,298
99999,20,66,53,28,39,206


In [23]:
# Append a 'Total' row holding the per-column sums. An existing 'Total' row is
# dropped first, so re-running the cell neither duplicates the row nor counts
# the previous totals in the new sums.
rows_only = df.drop(index='Total', errors='ignore')
df = pd.concat([rows_only, rows_only.sum().rename('Total').to_frame().T])

In [24]:
df

Unnamed: 0,0,1,2,3,4,Total
0,34,16,71,89,37,247
1,66,74,17,12,46,215
2,5,11,99,27,97,239
3,71,84,67,41,80,343
4,71,30,82,90,22,295
...,...,...,...,...,...,...
99996,20,79,8,87,81,275
99997,48,67,42,99,47,303
99998,45,69,72,71,41,298
99999,20,66,53,28,39,206


In [25]:
# Select a single row by its index label (returned as a Series).
df.loc['Total',:]

0         4984456
1         5018660
2         5004849
3         4992147
4         4992565
Total    24992677
Name: Total, dtype: int64

In [26]:
# Select a single column via .loc (all rows, column label 'Total').
df.loc[:,'Total']

0             247
1             215
2             239
3             343
4             295
           ...   
99996         275
99997         303
99998         298
99999         206
Total    24992677
Name: Total, Length: 100001, dtype: int64

In [27]:
# Select a single column with plain bracket indexing.
df['Total']

0             247
1             215
2             239
3             343
4             295
           ...   
99996         275
99997         303
99998         298
99999         206
Total    24992677
Name: Total, Length: 100001, dtype: int64

In [28]:
# Grand total: the cell at row label 'Total', column label 'Total'.
df.loc['Total', 'Total']

24992677

In [29]:
# Look up row label 0 explicitly. The index is object-dtype (ints plus 'Total'),
# where a bare integer key on a Series is ambiguous between label and position.
df.loc[0, 'Total']

247

In [30]:
df.index

Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
         99991,   99992,   99993,   99994,   99995,   99996,   99997,   99998,
         99999, 'Total'],
      dtype='object', length=100001)

In [31]:
df.values

array([[      34,       16,       71,       89,       37,      247],
       [      66,       74,       17,       12,       46,      215],
       [       5,       11,       99,       27,       97,      239],
       ...,
       [      45,       69,       72,       71,       41,      298],
       [      20,       66,       53,       28,       39,      206],
       [ 4984456,  5018660,  5004849,  4992147,  4992565, 24992677]])

In [32]:
df

Unnamed: 0,0,1,2,3,4,Total
0,34,16,71,89,37,247
1,66,74,17,12,46,215
2,5,11,99,27,97,239
3,71,84,67,41,80,343
4,71,30,82,90,22,295
...,...,...,...,...,...,...
99996,20,79,8,87,81,275
99997,48,67,42,99,47,303
99998,45,69,72,71,41,298
99999,20,66,53,28,39,206


In [33]:
# Prefix the integer column labels with 'Col'; string labels are kept unchanged.
df.columns = [
    f'Col{label}' if isinstance(label, int) else label
    for label in df.columns
]

In [34]:
df

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total
0,34,16,71,89,37,247
1,66,74,17,12,46,215
2,5,11,99,27,97,239
3,71,84,67,41,80,343
4,71,30,82,90,22,295
...,...,...,...,...,...,...
99996,20,79,8,87,81,275
99997,48,67,42,99,47,303
99998,45,69,72,71,41,298
99999,20,66,53,28,39,206


In [35]:
# Label every numeric column with a decade-wide range ('0-9 let' ... '90-99 let').
# Values outside 0..99 (e.g. most of the 'Total' column and the 'Total' row)
# match no condition and are labelled 'ERROR'.
BIN_WIDTH = 10
bin_starts = range(0, 100, BIN_WIDTH)
results = [f'{start}-{start + BIN_WIDTH - 1} let' for start in bin_starts]

# Only numeric columns are binned, so re-running the cell does not try to
# compare the string-valued '*_bin' columns created by a previous run.
for var in df.select_dtypes(include='number').columns:
    conditions = [(df[var] >= start) & (df[var] < start + BIN_WIDTH)
                  for start in bin_starts]
    df[var+'_bin'] = np.select(condlist=conditions, choicelist=results, default='ERROR')

In [36]:
df

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,ERROR
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,ERROR
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,ERROR
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,ERROR
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,ERROR
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,ERROR
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,ERROR
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,ERROR


In [37]:
list_of_values = ['ERROR']

In [38]:
# Select the rows in which at least one field equals a value from list_of_values.
df[df.isin(list_of_values).any(axis=1)]
# Sketch of a per-column alternative with query (column names A/B are placeholders):
#df.query("A in @list_of_values or B in @list_of_values")

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,ERROR
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,ERROR
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,ERROR
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,ERROR
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,ERROR
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,ERROR
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,ERROR
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,ERROR


In [39]:
df.isin(list_of_values).any(axis=1)

0        True
1        True
2        True
3        True
4        True
         ... 
99996    True
99997    True
99998    True
99999    True
Total    True
Length: 100001, dtype: bool

In [40]:
df.isin(list_of_values).any(axis=0)

Col0         False
Col1         False
Col2         False
Col3         False
Col4         False
Total        False
Col0_bin      True
Col1_bin      True
Col2_bin      True
Col3_bin      True
Col4_bin      True
Total_bin     True
dtype: bool

In [41]:
# Select the rows in which every field equals a value from list_of_values.
df[df.isin(list_of_values).all(axis=1)]
# Sketch of a per-column alternative with query (column names A/B are placeholders):
#df.query("A in @list_of_values or B in @list_of_values")

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin


In [42]:
df.isin(list_of_values).all(axis=1)

0        False
1        False
2        False
3        False
4        False
         ...  
99996    False
99997    False
99998    False
99999    False
Total    False
Length: 100001, dtype: bool

In [43]:
df.isin(list_of_values).all(axis=0)

Col0         False
Col1         False
Col2         False
Col3         False
Col4         False
Total        False
Col0_bin     False
Col1_bin     False
Col2_bin     False
Col3_bin     False
Col4_bin     False
Total_bin    False
dtype: bool

In [44]:
# None: rows in which no field equals a value from list_of_values.
df[~df.isin(list_of_values).any(axis=1)]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin
17,5,1,5,50,20,81,0-9 let,0-9 let,0-9 let,50-59 let,20-29 let,80-89 let
26,11,4,12,29,32,88,10-19 let,0-9 let,10-19 let,20-29 let,30-39 let,80-89 let
463,1,12,15,42,9,79,0-9 let,10-19 let,10-19 let,40-49 let,0-9 let,70-79 let
613,16,18,12,1,11,58,10-19 let,10-19 let,10-19 let,0-9 let,10-19 let,50-59 let
652,28,31,22,15,3,99,20-29 let,30-39 let,20-29 let,10-19 let,0-9 let,90-99 let
...,...,...,...,...,...,...,...,...,...,...,...,...
98904,5,12,4,11,5,37,0-9 let,10-19 let,0-9 let,10-19 let,0-9 let,30-39 let
99162,24,35,6,22,3,90,20-29 let,30-39 let,0-9 let,20-29 let,0-9 let,90-99 let
99301,11,25,31,15,6,88,10-19 let,20-29 let,30-39 let,10-19 let,0-9 let,80-89 let
99365,11,2,20,2,60,95,10-19 let,0-9 let,20-29 let,0-9 let,60-69 let,90-99 let


In [45]:
# Not all: rows where the value occurs in 0 up to (number of columns - 1) fields.
df[~df.isin(list_of_values).all(axis=1)]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,ERROR
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,ERROR
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,ERROR
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,ERROR
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,ERROR
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,ERROR
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,ERROR
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,ERROR


In [46]:
# Validation: fail if any row contains the 'ERROR' marker in any column.
assert not df.eq('ERROR').any(axis = 1).any(), 'CHYBA!'

AssertionError: CHYBA!

In [47]:
df.eq('ERROR')

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin
0,False,False,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,False,False,False,False,False,False,False,False,False,False,False,True
99997,False,False,False,False,False,False,False,False,False,False,False,True
99998,False,False,False,False,False,False,False,False,False,False,False,True
99999,False,False,False,False,False,False,False,False,False,False,False,True


In [63]:
df.eq('ERROR').any(axis = 1)

0        True
1        True
2        True
3        True
4        True
         ... 
99996    True
99997    True
99998    True
99999    True
Total    True
Length: 100001, dtype: bool

In [64]:
# Build the per-row "contains ERROR" mask once, then keep only the True entries
# (a boolean Series can be used as a filter directly, no `== True` needed).
error_mask = df.eq('ERROR').any(axis = 1)
error_mask.loc[error_mask]

0        True
1        True
2        True
3        True
4        True
         ... 
99996    True
99997    True
99998    True
99999    True
Total    True
Length: 99274, dtype: bool

In [66]:
df[df.eq('ERROR').any(axis = 1)]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR,1
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,ERROR,1
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,ERROR,1
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,ERROR,1
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,ERROR,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,ERROR,1
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,ERROR,1
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,ERROR,1
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,ERROR,1


In [67]:
# Build the per-row "contains ERROR" mask once, then keep only the False entries
# (negate with ~ instead of comparing to False).
error_mask = df.eq('ERROR').any(axis = 1)
error_mask.loc[~error_mask]

17       False
26       False
463      False
613      False
652      False
         ...  
98904    False
99162    False
99301    False
99365    False
99995    False
Length: 727, dtype: bool

In [68]:
df[~df.eq('ERROR').any(axis = 1)]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin,Count
17,5,1,5,50,20,81,0-9 let,0-9 let,0-9 let,50-59 let,20-29 let,80-89 let,1
26,11,4,12,29,32,88,10-19 let,0-9 let,10-19 let,20-29 let,30-39 let,80-89 let,1
463,1,12,15,42,9,79,0-9 let,10-19 let,10-19 let,40-49 let,0-9 let,70-79 let,1
613,16,18,12,1,11,58,10-19 let,10-19 let,10-19 let,0-9 let,10-19 let,50-59 let,1
652,28,31,22,15,3,99,20-29 let,30-39 let,20-29 let,10-19 let,0-9 let,90-99 let,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98904,5,12,4,11,5,37,0-9 let,10-19 let,0-9 let,10-19 let,0-9 let,30-39 let,1
99162,24,35,6,22,3,90,20-29 let,30-39 let,0-9 let,20-29 let,0-9 let,90-99 let,1
99301,11,25,31,15,6,88,10-19 let,20-29 let,30-39 let,10-19 let,0-9 let,80-89 let,1
99365,11,2,20,2,60,95,10-19 let,0-9 let,20-29 let,0-9 let,60-69 let,90-99 let,1


In [69]:
df.eq('ERROR').any(axis = 0)

Col0         False
Col1         False
Col2         False
Col3         False
Col4         False
Total        False
Col0_bin      True
Col1_bin      True
Col2_bin      True
Col3_bin      True
Col4_bin      True
Total_bin     True
Count        False
dtype: bool

In [70]:
# Constant helper column; the following groupby(...).count() cell counts it per bin.
df['Count'] = 1

In [71]:
# Number of rows per Col0 bin, as a one-column ('Count') DataFrame indexed by bin label.
df1 = df.groupby('Col0_bin')[['Count']].count()

In [72]:
df1

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,10292
20-29 let,10058
30-39 let,10141
40-49 let,9993
50-59 let,10205
60-69 let,10056
70-79 let,10164
80-89 let,9890
90-99 let,10052


In [73]:
# Per-bin counts with an extra 'Total' row holding the column sum.
df2 = pd.concat([df1, df1.sum().to_frame(name='Total').T])

In [74]:
df2

Unnamed: 0,Count
0-9 let,9149
10-19 let,10292
20-29 let,10058
30-39 let,10141
40-49 let,9993
50-59 let,10205
60-69 let,10056
70-79 let,10164
80-89 let,9890
90-99 let,10052


In [75]:
# Cumulative counts over the bins. NOTE: this rebinds df2, replacing the
# counts-plus-'Total' frame assigned to df2 in an earlier cell.
df2 = df1.cumsum(axis = 0)

In [76]:
df2

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,19441
20-29 let,29499
30-39 let,39640
40-49 let,49633
50-59 let,59838
60-69 let,69894
70-79 let,80058
80-89 let,89948
90-99 let,100000


In [77]:
df2.loc['ERROR',:]

Count    100001
Name: ERROR, dtype: int64

In [79]:
# REMOVING A COLUMN - WARNING: WITHOUT sort = False THE COLUMN ORDER CHANGES!!!

# Same pattern applied to a subset of columns (example with a hypothetical "Year" column):
# subset = df.columns.difference(["Year"])
# df[subset] = df[subset].div(100)
df[df.columns.difference(["Total_bin"], sort = False)]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,1
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,1
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,1
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,1
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,1
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,1
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,1
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,1
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,1


In [80]:
df

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR,1
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,ERROR,1
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,ERROR,1
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,ERROR,1
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,ERROR,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,ERROR,1
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,ERROR,1
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,ERROR,1
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,ERROR,1


In [81]:
df.columns.difference(["Total_bin"])

Index(['Col0', 'Col0_bin', 'Col1', 'Col1_bin', 'Col2', 'Col2_bin', 'Col3',
       'Col3_bin', 'Col4', 'Col4_bin', 'Count', 'Total'],
      dtype='object')

In [82]:
df.columns.difference(["Total_bin"], sort = False)

Index(['Col0', 'Col1', 'Col2', 'Col3', 'Col4', 'Total', 'Col0_bin', 'Col1_bin',
       'Col2_bin', 'Col3_bin', 'Col4_bin', 'Count'],
      dtype='object')

In [83]:
df2.index.difference(["ERROR"])

Index(['0-9 let', '10-19 let', '20-29 let', '30-39 let', '40-49 let',
       '50-59 let', '60-69 let', '70-79 let', '80-89 let', '90-99 let'],
      dtype='object', name='Col0_bin')

In [84]:
# Here the result is the same with and without sort = False
# (presumably because the index labels are already in sorted order).
df2.index.difference(["ERROR"], sort = False)

Index(['0-9 let', '10-19 let', '20-29 let', '30-39 let', '40-49 let',
       '50-59 let', '60-69 let', '70-79 let', '80-89 let', '90-99 let'],
      dtype='object', name='Col0_bin')

In [85]:
# Plain df2[...] with a list-like of labels selects COLUMNS, so passing row
# labels raises KeyError. The error is the point of this cell; it is caught
# and printed so that a full "Run All" is not aborted.
try:
    df2[df2.index.difference(["ERROR"])]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "None of [Index(['0-9 let', '10-19 let', '20-29 let', '30-39 let', '40-49 let',\n       '50-59 let', '60-69 let', '70-79 let', '80-89 let', '90-99 let'],\n      dtype='object', name='Col0_bin')] are in the [columns]"

In [86]:
# .loc with a list-like of labels selects ROWS.
df2.loc[df2.index.difference(["ERROR"], sort = False)]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,19441
20-29 let,29499
30-39 let,39640
40-49 let,49633
50-59 let,59838
60-69 let,69894
70-79 let,80058
80-89 let,89948
90-99 let,100000


In [87]:
df2.loc[df2.index.isin(["ERROR"])]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
ERROR,100001


In [101]:
# Same result as the .loc version: a boolean mask inside plain [] also filters rows.
df2[df2.index.isin(["ERROR"])]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
ERROR,100001


In [102]:
df2.loc[~df2.index.isin(["ERROR"])]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,19441
20-29 let,29499
30-39 let,39640
40-49 let,49633
50-59 let,59838
60-69 let,69894
70-79 let,80058
80-89 let,89948
90-99 let,100000


In [103]:
# Same result as the .loc version: a boolean mask inside plain [] also filters rows.
df2[~df2.index.isin(["ERROR"])]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,19441
20-29 let,29499
30-39 let,39640
40-49 let,49633
50-59 let,59838
60-69 let,69894
70-79 let,80058
80-89 let,89948
90-99 let,100000


In [104]:
df2.loc[df2.index == "ERROR"]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
ERROR,100001


In [105]:
# Same result as the .loc version: a boolean mask inside plain [] also filters rows.
df2[df2.index == "ERROR"]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
ERROR,100001


In [106]:
df2.loc[~(df2.index == "ERROR")]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,19441
20-29 let,29499
30-39 let,39640
40-49 let,49633
50-59 let,59838
60-69 let,69894
70-79 let,80058
80-89 let,89948
90-99 let,100000


In [108]:
# Same result as the .loc version: a boolean mask inside plain [] also filters rows.
df2[~(df2.index == "ERROR")]

Unnamed: 0_level_0,Count
Col0_bin,Unnamed: 1_level_1
0-9 let,9149
10-19 let,19441
20-29 let,29499
30-39 let,39640
40-49 let,49633
50-59 let,59838
60-69 let,69894
70-79 let,80058
80-89 let,89948
90-99 let,100000


In [91]:
df.loc[:,~df.columns.isin(["Total_bin"])]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,1
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,1
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,1
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,1
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,1
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,1
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,1
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,1
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,1


In [111]:
# Does not work: plain df[...] applies a boolean mask to ROWS, so a mask built
# from the columns has the wrong length. Use df.loc[:, mask] to filter columns.
# The error is caught and printed so that a full "Run All" is not aborted.
try:
    df[~df.columns.isin(["Total_bin"])]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Item wrong length 13 instead of 100001.

In [112]:
df.loc[:,~(df.columns == "Total_bin")]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,1
1,66,74,17,12,46,215,60-69 let,70-79 let,10-19 let,10-19 let,40-49 let,1
2,5,11,99,27,97,239,0-9 let,10-19 let,90-99 let,20-29 let,90-99 let,1
3,71,84,67,41,80,343,70-79 let,80-89 let,60-69 let,40-49 let,80-89 let,1
4,71,30,82,90,22,295,70-79 let,30-39 let,80-89 let,90-99 let,20-29 let,1
...,...,...,...,...,...,...,...,...,...,...,...,...
99996,20,79,8,87,81,275,20-29 let,70-79 let,0-9 let,80-89 let,80-89 let,1
99997,48,67,42,99,47,303,40-49 let,60-69 let,40-49 let,90-99 let,40-49 let,1
99998,45,69,72,71,41,298,40-49 let,60-69 let,70-79 let,70-79 let,40-49 let,1
99999,20,66,53,28,39,206,20-29 let,60-69 let,50-59 let,20-29 let,30-39 let,1


In [114]:
# Does not work: plain df[...] applies a boolean mask to ROWS, so a mask built
# from the columns has the wrong length. Use df.loc[:, mask] to filter columns.
# The error is caught and printed so that a full "Run All" is not aborted.
try:
    df[~(df.columns == "Total_bin")]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Item wrong length 13 instead of 100001.

In [136]:
# A single row selected by label; the result is a Series named after the label.
df.loc[0]

Col0                34
Col1                16
Col2                71
Col3                89
Col4                37
Total              247
Col0_bin     30-39 let
Col1_bin     10-19 let
Col2_bin     70-79 let
Col3_bin     80-89 let
Col4_bin     30-39 let
Total_bin        ERROR
Count                1
Name: 0, dtype: object

In [137]:
df.loc[0].to_frame().T

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR,1


In [138]:
# A single row selected with a boolean mask; the result stays a one-row DataFrame.
df[df.index == 0]

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR,1


In [164]:
df[df.index == 0].T

Unnamed: 0,0
Col0,34
Col1,16
Col2,71
Col3,89
Col4,37
Total,247
Col0_bin,30-39 let
Col1_bin,10-19 let
Col2_bin,70-79 let
Col3_bin,80-89 let


In [166]:
# Row: transpose the one-row frame and pick its single column as a Series.
df[df.index == 0].T[0]

Col0                34
Col1                16
Col2                71
Col3                89
Col4                37
Total              247
Col0_bin     30-39 let
Col1_bin     10-19 let
Col2_bin     70-79 let
Col3_bin     80-89 let
Col4_bin     30-39 let
Total_bin        ERROR
Count                1
Name: 0, dtype: object

In [168]:
# Row as an unnamed Series, but beware - the result has a MultiIndex (row label, column).
df[df.index == 0].stack()

0  Col0                34
   Col1                16
   Col2                71
   Col3                89
   Col4                37
   Total              247
   Col0_bin     30-39 let
   Col1_bin     10-19 let
   Col2_bin     70-79 let
   Col3_bin     80-89 let
   Col4_bin     30-39 let
   Total_bin        ERROR
   Count                1
dtype: object

In [169]:
# Row as a named Series, but beware - the result has a MultiIndex (row label, column).
pd.Series(df[df.index == 0].stack(), name = 0)

0  Col0                34
   Col1                16
   Col2                71
   Col3                89
   Col4                37
   Total              247
   Col0_bin     30-39 let
   Col1_bin     10-19 let
   Col2_bin     70-79 let
   Col3_bin     80-89 let
   Col4_bin     30-39 let
   Total_bin        ERROR
   Count                1
Name: 0, dtype: object

In [149]:
# Named stacked Series turned back into a one-row frame; beware - the MultiIndex
# now ends up on the columns.
pd.Series(df[df.index == 0].stack(), name = 0).to_frame().T

Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0
Unnamed: 0_level_1,Col0,Col1,Col2,Col3,Col4,Total,Col0_bin,Col1_bin,Col2_bin,Col3_bin,Col4_bin,Total_bin,Count
0,34,16,71,89,37,247,30-39 let,10-19 let,70-79 let,80-89 let,30-39 let,ERROR,1


In [170]:
# Beware: the columns are now a MultiIndex.
pd.Series(df[df.index == 0].stack(), name = 0).to_frame().T.columns

MultiIndex([(0,      'Col0'),
            (0,      'Col1'),
            (0,      'Col2'),
            (0,      'Col3'),
            (0,      'Col4'),
            (0,     'Total'),
            (0,  'Col0_bin'),
            (0,  'Col1_bin'),
            (0,  'Col2_bin'),
            (0,  'Col3_bin'),
            (0,  'Col4_bin'),
            (0, 'Total_bin'),
            (0,     'Count')],
           )