In [1]:
import pandas as pd
import numpy as np

**1. Show installed versions**

In [2]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 06d230151e6f18fdb8139d09abf539867a8cd481
python           : 3.10.0.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19044
machine          : AMD64
processor        : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : pl_PL.UTF-8
LOCALE           : Polish_Poland.1250

pandas           : 1.4.1
numpy            : 1.22.1
pytz             : 2021.3
dateutil         : 2.8.2
pip              : 21.3.1
setuptools       : 57.4.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : None
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.0.3
IPython          : 8.1.1
pandas_datareader: None
bs4              : 4.10.0
bottleneck       : None
fastpar

**2. Create an example DataFrame**

In [3]:
df = pd.DataFrame(np.random.rand(4, 8), columns=list('abcdefgh')) # by using np random rand function we pass (rows, columns)
# it will generate 2 dimensional array which can be interpreted by pandas as a dataframe. We can pass also the column names
# so it will be more readable

**3. Rename columns**

In [4]:
df.head()

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


In [5]:
df.rename({'a':'a1', 'b':'b1'}, axis=1) # the most flexible

Unnamed: 0,a1,b1,c,d,e,f,g,h
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


In [6]:
df.columns = list('hgfedcba') # the fastest? to overwrite existing names, we pass a list with exact length

In [7]:
df.head()

Unnamed: 0,h,g,f,e,d,c,b,a
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


If we have column names with spaces inside and we do not really want that, we can use this trick

In [8]:
df.rename({'h':'ha ha'},axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,ha ha,g,f,e,d,c,b,a
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


In [10]:
df.columns = df.columns.str.replace(' ', '_')

In [11]:
df.head()

Unnamed: 0,ha_ha,g,f,e,d,c,b,a
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


If we want to add a suffix/prefix

In [12]:
df.add_prefix('x_')

Unnamed: 0,x_ha_ha,x_g,x_f,x_e,x_d,x_c,x_b,x_a
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


In [13]:
df.add_suffix('_y')

Unnamed: 0,ha_ha_y,g_y,f_y,e_y,d_y,c_y,b_y,a_y
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


**4. Reverse row order**

In [14]:
df.head()

Unnamed: 0,ha_ha,g,f,e,d,c,b,a
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928


In [15]:
df.loc[::-1] # reversed order (index also)

Unnamed: 0,ha_ha,g,f,e,d,c,b,a
3,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928
2,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
1,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
0,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966


In [16]:
df.loc[::-1].reset_index(drop=True) # reversed order with index reset

Unnamed: 0,ha_ha,g,f,e,d,c,b,a
0,0.938799,0.176201,0.089792,0.010669,0.738867,0.225566,0.735958,0.474928
1,0.218682,0.241995,0.691802,0.392442,0.272287,0.088847,0.947837,0.186637
2,0.3487,0.917319,0.790261,0.420661,0.221609,0.262926,0.928702,0.718723
3,0.328109,0.605315,0.072623,0.074005,0.343637,0.876347,0.314741,0.013966


**5. Reverse column order**

In [17]:
df.loc[:, ::-1] # works the same as row reverse

Unnamed: 0,a,b,c,d,e,f,g,ha_ha
0,0.013966,0.314741,0.876347,0.343637,0.074005,0.072623,0.605315,0.328109
1,0.718723,0.928702,0.262926,0.221609,0.420661,0.790261,0.917319,0.3487
2,0.186637,0.947837,0.088847,0.272287,0.392442,0.691802,0.241995,0.218682
3,0.474928,0.735958,0.225566,0.738867,0.010669,0.089792,0.176201,0.938799


**6. Select columns by data type**

In [18]:
train_df = pd.DataFrame({'name':['Josh','John','Anna','Bernie'], 'age':[20, 21, 51, 31], 'payment':[512.93, 321.12, 866, 9412.93]})

In [19]:
train_df

Unnamed: 0,name,age,payment
0,Josh,20,512.93
1,John,21,321.12
2,Anna,51,866.0
3,Bernie,31,9412.93


In [20]:
train_df.select_dtypes(include='number')

Unnamed: 0,age,payment
0,20,512.93
1,21,321.12
2,51,866.0
3,31,9412.93


In [21]:
train_df.select_dtypes(include='object')

Unnamed: 0,name
0,Josh
1,John
2,Anna
3,Bernie


In [22]:
train_df.select_dtypes(include=['object', 'number'])

Unnamed: 0,name,age,payment
0,Josh,20,512.93
1,John,21,321.12
2,Anna,51,866.0
3,Bernie,31,9412.93


In [23]:
train_df.select_dtypes(exclude='object')

Unnamed: 0,age,payment
0,20,512.93
1,21,321.12
2,51,866.0
3,31,9412.93


**7. Convert string to numbers**

In [24]:
train_df['fav_number'] = ['7','25','51','88']
train_df['bad_number'] = ['51', '22','67','t']

In [25]:
train_df.dtypes #fav_number is object (string)

name           object
age             int64
payment       float64
fav_number     object
bad_number     object
dtype: object

In [26]:
train_df = train_df.astype({'fav_number':'int'}) # cast pandas object to specific dtype, prone to errors

In [27]:
train_df.dtypes

name           object
age             int64
payment       float64
fav_number      int32
bad_number     object
dtype: object

In [28]:
#train_df.astype({'bad_number':'int'}) # error because pandas does not know how to handle not "numeric" strings

In [29]:
pd.to_numeric(train_df.bad_number, errors='coerce') # tell pandas to handle these string and set them to NaN

0    51.0
1    22.0
2    67.0
3     NaN
Name: bad_number, dtype: float64

**8. Reduce DataFrame size**

In [30]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

In [31]:
drinks.info(memory_usage='deep') # 30.5 KB 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [32]:
cols_we_need = ['beer_servings', 'continent']
s_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols_we_need)

In [33]:
s_drinks.info(memory_usage='deep') # 13.7 KB, the point is to use only the columns that we actually need to do
# certain operations on

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   beer_servings  193 non-null    int64 
 1   continent      193 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.7 KB


In [34]:
dtypes = {'continent':'category'}
s_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols_we_need, dtype=dtypes)
s_drinks.info(memory_usage='deep') # 2.4 KB not only columns that we need but also try to change object-type data to categorical
# only if there are not many different values (for example not many countries to choose from)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.4 KB


**9. Build a DataFrame from multiple files (row-wise)**

In [35]:
from glob import glob # glob help us to take all filenames that match pattern from specified path. 

In [36]:
files = sorted(glob('./data_2/concat_*')) # we have to sort the files 

In [37]:
df = pd.concat((pd.read_csv(file) for file in files), ignore_index=True) # concat can take mapping and iterable so
# we take an adventage of that, we pass ignore_index = True to make sure that index numering starts and ends in normal way.

In [38]:
df

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3
4,a4,b4,c4,d4
5,a5,b5,c5,d5
6,a6,b6,c6,d6
7,a7,b7,c7,d7
8,a8,b8,c8,d8
9,a9,b9,c9,d9


**10. Build a DataFrame from multiple files (column-wise)**

If a data set has been split into two or more files (first file has some columns, the second some and so on...) we can use this trick to concate them into single data frame column wise.

In [39]:
# pd.concat((pd.read_csv(file) for file in files), axis='columns') The diffrence ist hat we specify columns and we
# do not care about index.

**11. Create a DataFrame from the clipboard**

In [40]:
df = pd.read_clipboard() # u have to copy from for example excel, select table, ctrl + c and you can use this function
# to paste it as a dataframe, wtf magic
# pandas can recognize patterns, by that i mean as long as the table in excel look like a data frame.

In [42]:
df.dtypes

dawid.kula98@gmail.com    object
dtype: object

**12. Split a DataFrame into two random subsets**

To do this we can use random sampling - sample() method.

In [44]:
drinks_1 = drinks.sample(frac=0.50, random_state=777)

In [53]:
drinks_2 = drinks.drop(drinks_1.index) # cutting original dataframe, this work if indexes are unique

In [46]:
len(drinks)

193

In [54]:
len(drinks_1) + len(drinks_2) # same length

193

In [51]:
drinks_1.index.sort_values()

Int64Index([  1,   2,   3,   5,   6,  10,  11,  12,  13,  18,  19,  21,  22,
             25,  26,  27,  29,  30,  33,  35,  40,  41,  42,  43,  45,  48,
             49,  52,  57,  63,  64,  72,  73,  75,  76,  77,  79,  81,  83,
             86,  90,  93,  94,  96,  97,  98, 101, 102, 106, 107, 110, 113,
            114, 118, 121, 124, 125, 126, 131, 132, 133, 134, 136, 137, 139,
            140, 144, 145, 147, 148, 151, 154, 155, 156, 158, 159, 160, 161,
            163, 164, 168, 169, 170, 171, 175, 176, 178, 179, 180, 182, 185,
            186, 187, 189, 190, 191],
           dtype='int64')

In [52]:
drinks_2.index.sort_values()

Int64Index([  0,   4,   7,   8,   9,  14,  15,  16,  17,  20,  23,  24,  28,
             31,  32,  34,  36,  37,  38,  39,  44,  46,  47,  50,  51,  53,
             54,  55,  56,  58,  59,  60,  61,  62,  65,  66,  67,  68,  69,
             70,  71,  74,  78,  80,  82,  84,  85,  87,  88,  89,  91,  92,
             95,  99, 100, 103, 104, 105, 108, 109, 111, 112, 115, 116, 117,
            119, 120, 122, 123, 127, 128, 129, 130, 135, 138, 141, 142, 143,
            146, 149, 150, 152, 153, 157, 162, 165, 166, 167, 172, 173, 174,
            177, 181, 183, 184, 188, 192],
           dtype='int64')

**13. Filter a DataFrame by multiple categories**

In [63]:
drinks.loc[(drinks.country == 'Albania') |
           (drinks.country == 'Afghanistan')]
# too long

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe


In [64]:
drinks.loc[drinks.country.isin(['Albania','Afghanistan'])] # more elegant and readable

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe


**14. Filter a DataFrame by largest categories**

In [70]:
count_continent = drinks.continent.value_counts()

In [71]:
count_continent.nlargest(3)

Africa    53
Europe    45
Asia      44
Name: continent, dtype: int64

In [72]:
count_continent.nlargest(3).index

Index(['Africa', 'Europe', 'Asia'], dtype='object')

In [77]:
drinks.loc[drinks.continent.isin(count_continent.nlargest(3).index)] # show only top 3 most common continent

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa
...,...,...,...,...,...,...
186,Uzbekistan,25,101,8,2.4,Asia
189,Vietnam,111,2,1,2.0,Asia
190,Yemen,6,0,0,0.1,Asia
191,Zambia,32,19,4,2.5,Africa


**15. Handle missing values (threshold)**

In [85]:
train_df.loc[len(train_df)] = ['Marry', np.nan, 33662, 41, np.nan]

In [86]:
train_df

Unnamed: 0,name,age,payment,fav_number,bad_number
0,Josh,20.0,512.93,7,51.0
1,John,21.0,321.12,25,22.0
2,Anna,51.0,866.0,51,67.0
3,Jane,,3332.0,11,
4,Marry,,33662.0,41,


In [90]:
train_df.isna().sum() # treat true as 1 false as 0 and sum up. We get how many misisng values we have in specific columns

name          0
age           2
payment       0
fav_number    0
bad_number    2
dtype: int64

In [92]:
train_df.isna().mean() # % missing values

name          0.0
age           0.4
payment       0.0
fav_number    0.0
bad_number    0.4
dtype: float64

In [94]:
train_df.dropna(axis=0) # drops columns or rows which have nan values

Unnamed: 0,name,age,payment,fav_number,bad_number
0,Josh,20.0,512.93,7,51
1,John,21.0,321.12,25,22
2,Anna,51.0,866.0,51,67


In [100]:
train_df.dropna(thresh=len(train_df) * 0.9, axis=1) # in this case we tell pandas to keep columns which
# 90% of values are not NaN. We can pass simple int and so that the treshold would be fixed

Unnamed: 0,name,payment,fav_number
0,Josh,512.93,7
1,John,321.12,25
2,Anna,866.0,51
3,Jane,3332.0,11
4,Marry,33662.0,41


**16. Split a string into multiple columns**

In [101]:
hdf = pd.DataFrame({'name':['John Doe', 'Henry Cavil'], 'height':[176, 194]})

In [103]:
hdf[['first.name', 'last.name']] = hdf.name.str.split(' ',expand=True)

In [104]:
hdf

Unnamed: 0,name,height,first.name,last.name
0,John Doe,176,John,Doe
1,Henry Cavil,194,Henry,Cavil


**17. Expand a Series (column) of lists into a DataFrame**

In [108]:
train_df['codes'] = [[10, 20], [40, 41], [94, 83], [17, 23], [1,2]]

In [110]:
train_df

Unnamed: 0,name,age,payment,fav_number,bad_number,codes
0,Josh,20.0,512.93,7,51.0,"[10, 20]"
1,John,21.0,321.12,25,22.0,"[40, 41]"
2,Anna,51.0,866.0,51,67.0,"[94, 83]"
3,Jane,,3332.0,11,,"[17, 23]"
4,Marry,,33662.0,41,,"[1, 2]"


In [112]:
t_expand = train_df.codes.apply(pd.Series)

In [116]:
n_train_df = pd.concat([train_df, t_expand], axis=1)

In [117]:
n_train_df

Unnamed: 0,name,age,payment,fav_number,bad_number,codes,0,1
0,Josh,20.0,512.93,7,51.0,"[10, 20]",10,20
1,John,21.0,321.12,25,22.0,"[40, 41]",40,41
2,Anna,51.0,866.0,51,67.0,"[94, 83]",94,83
3,Jane,,3332.0,11,,"[17, 23]",17,23
4,Marry,,33662.0,41,,"[1, 2]",1,2


In [122]:
train_df[['1_part', '2_part']] = train_df.codes.apply(lambda x : pd.Series(x))

In [123]:
train_df

Unnamed: 0,name,age,payment,fav_number,bad_number,codes,1_part,2_part
0,Josh,20.0,512.93,7,51.0,"[10, 20]",10,20
1,John,21.0,321.12,25,22.0,"[40, 41]",40,41
2,Anna,51.0,866.0,51,67.0,"[94, 83]",94,83
3,Jane,,3332.0,11,,"[17, 23]",17,23
4,Marry,,33662.0,41,,"[1, 2]",1,2


**18. Aggregate by multiple functions**

In [124]:
orders = pd.read_csv('http://bit.ly/chiporders', sep='\t')
orders['item_price'] = orders.item_price.str.replace('$', '').astype('float')

  orders['item_price'] = orders.item_price.str.replace('$', '').astype('float')


In [126]:
by_id = orders.groupby('order_id')

In [129]:
by_id.item_price.sum().head()

order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [146]:
by_id.item_price.agg(['sum','count']).head() # agg method allow us to pass multiple aggregate functions

Unnamed: 0_level_0,sum,count
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11.56,4
2,16.98,1
3,12.67,2
4,21.0,2
5,13.7,2


**19. Combine the output of an aggregation with a DataFrame**

In [150]:
by_id.item_price.sum() # 1834 records

order_id
1       11.56
2       16.98
3       12.67
4       21.00
5       13.70
        ...  
1830    23.00
1831    12.90
1832    13.20
1833    23.50
1834    28.75
Name: item_price, Length: 1834, dtype: float64

In [152]:
len(orders) # 4622 records

4622

If we use any aggregate function the length of the result is always smaller. What if we want to make a column with with the result of an aggregation function? To do that it is better to use .transform() and pass agg func inside. This way the number of rows in resulted data will be equal to the original length of a dataframe.

In [156]:
orders['total_price'] = by_id.item_price.transform('sum') # it matches the original indexes

In [155]:
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56
1,1,1,Izze,[Clementine],3.39,11.56
2,1,1,Nantucket Nectar,[Apple],3.39,11.56
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98


In [185]:
orders['percent_of_total'] = round((orders.item_price / orders.total_price) * 100, 2)

In [186]:
orders

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price,percent_of_total
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56,20.67
1,1,1,Izze,[Clementine],3.39,11.56,29.33
2,1,1,Nantucket Nectar,[Apple],3.39,11.56,29.33
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56,20.67
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98,100.00
...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75,23.50,50.00
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75,23.50,50.00
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",11.25,28.75,39.13
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",8.75,28.75,30.43


**20. Select a slice of rows and columns**

In [188]:
orders.loc[2000:2500, 'item_name':'item_price']

Unnamed: 0,item_name,choice_description,item_price
2000,Carnitas Soft Tacos,"[Tomatillo Red Chili Salsa, [Rice, Lettuce]]",9.25
2001,Chicken Bowl,"[Tomatillo Red Chili Salsa, [Rice, Pinto Beans...",11.25
2002,Canned Soft Drink,[Coke],1.25
2003,Chicken Bowl,"[Fresh Tomato Salsa, [Rice, Black Beans, Cheese]]",8.75
2004,Chips and Tomatillo Red Chili Salsa,,2.95
...,...,...,...
2496,Chips,,2.15
2497,Chicken Bowl,"[Fresh Tomato Salsa, Cheese]",8.75
2498,Chips and Guacamole,,4.45
2499,Chicken Burrito,"[Fresh Tomato Salsa (Mild), [Cheese, Lettuce]]",8.49


**21. Reshape a Multindexed Series**

In [189]:
titanic = pd.read_csv('http://bit.ly/kaggletrain')

In [202]:
titanic.groupby(['Sex','Pclass']).Survived.mean() # it is harder to read than normal dataframe and it is also a series object

Sex     Pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: Survived, dtype: float64

In [201]:
titanic.groupby(['Sex','Pclass']).Survived.mean().unstack() # now it is more readable and 
# we can use familiar dataframe methods on it

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


**22. Create a pivot table**

In [203]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [213]:
titanic.pivot_table(index='Pclass', columns='Sex', values=['Survived', 'Age'], aggfunc={'Age': 'mean', 'Survived':'count'}, margins=True)

Unnamed: 0_level_0,Age,Age,Age,Survived,Survived,Survived
Sex,female,male,All,female,male,All
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,34.611765,41.281386,38.233441,94,122,186
2,28.722973,30.740707,29.87763,76,108,173
3,21.75,26.507589,25.14062,144,347,355
All,27.915709,30.726645,29.699118,261,453,714


**23. Convert continous data into categorical data**

In [214]:
titanic.Age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [215]:
pd.cut(titanic.Age, bins=[0, 18, 25, 99], labels=['child','young adult','adult'])

0      young adult
1            adult
2            adult
3            adult
4            adult
          ...     
886          adult
887    young adult
888            NaN
889          adult
890          adult
Name: Age, Length: 891, dtype: category
Categories (3, object): ['child' < 'young adult' < 'adult']