**Series**

In [1]:
import os

import numpy as np
import pandas as pd

* Pandas series are one-dimensional arrays that hold data of any type.
* Pandas dataframes are two-dimensional tabular data structures.

In [2]:
# A Series can hold mixed values (ints, a string, NaN); the resulting dtype is object
s = pd.Series(data=[1, 'strucutre', 5, np.nan, 6, 8])
for shown in (s, type(s)):
    print(shown)

0            1
1    strucutre
2            5
3          NaN
4            6
5            8
dtype: object
<class 'pandas.core.series.Series'>


In [3]:
# Accessing the 4th element (index label 3 on the default 0-based index)
s[3]

nan

In [4]:
# Accessing everything from index 2 onward, i.e. the 3rd element through the last one
s[2:]

2      5
3    NaN
4      6
5      8
dtype: object

In [5]:
# Accessing the elements at index labels 2, 3 and 5 (i.e. the 3rd, 4th and 6th elements)
s[[2,3,5]]

2      5
3    NaN
5      8
dtype: object

In [6]:
# Square every element of a numeric series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# Arithmetic operators are vectorized and propagate NaN, so this gives the same
# result as s.apply(lambda x: x**2) without one Python-level call per element
s = s ** 2
s
# Note: NumPy arrays have no .apply method; use the same vectorized operators (arr ** 2) on them

0     1.0
1     9.0
2    25.0
3     NaN
4    36.0
5    64.0
dtype: float64

In [7]:
# Creating a daily range of dates, 2017-11-09 through 2017-12-12 (both ends included).
# pd.date_range returns a DatetimeIndex, not a Series; wrap it in pd.Series(...) if a Series is needed.
# ISO 8601 (YYYY-MM-DD) strings are used so that month and day cannot be confused.
date_series = pd.date_range(start='2017-11-09', end='2017-12-12')
print(type(date_series))
date_series

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


DatetimeIndex(['2017-11-09', '2017-11-10', '2017-11-11', '2017-11-12',
               '2017-11-13', '2017-11-14', '2017-11-15', '2017-11-16',
               '2017-11-17', '2017-11-18', '2017-11-19', '2017-11-20',
               '2017-11-21', '2017-11-22', '2017-11-23', '2017-11-24',
               '2017-11-25', '2017-11-26', '2017-11-27', '2017-11-28',
               '2017-11-29', '2017-11-30', '2017-12-01', '2017-12-02',
               '2017-12-03', '2017-12-04', '2017-12-05', '2017-12-06',
               '2017-12-07', '2017-12-08', '2017-12-09', '2017-12-10',
               '2017-12-11', '2017-12-12'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Build a float series holding n**1.5 for n = 0..9, passing the index labels explicitly
powers = np.arange(10) ** 1.5
m = pd.Series(powers, index=range(10))
m

0     0.000000
1     1.000000
2     2.828427
3     5.196152
4     8.000000
5    11.180340
6    14.696938
7    18.520259
8    22.627417
9    27.000000
dtype: float64

**Dataframes**

In [9]:
# Creating a dataframe from a dictionary: each key becomes a column name,
# each list supplies that column's values
data = {
    'Name': ['Tom', 'nick', 'krish', 'jack'],
    'Age': [20, 21, 19, 18],
    'Gender': ['M', 'F', 'M', 'M'],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender
0,Tom,20,M
1,nick,21,F
2,krish,19,M
3,jack,18,M


In [10]:
# Load the market fact table.
# The default below is the original author's local path; set the MARKET_FACT_CSV
# environment variable to point at your own copy of market_fact.csv instead of editing the cell.
MARKET_FACT_CSV = os.environ.get(
    'MARKET_FACT_CSV',
    r'C:/Users/getch/Downloads/Compressed/Introduction+to+Pandas/Introduction to Pandas/global_sales_data/market_fact.csv',
)
market_df = pd.read_csv(MARKET_FACT_CSV)
market_df

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.8100,0.01,23,-30.51,3.60,0.56
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.2700,0.01,13,4.56,0.93,0.54
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
...,...,...,...,...,...,...,...,...,...,...
8394,Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
8395,Ord_5411,Prod_6,SHP_7555,Cust_1798,127.1600,0.10,20,-74.03,6.92,0.37
8396,Ord_5388,Prod_6,SHP_7524,Cust_1798,243.0500,0.02,39,-70.85,5.35,0.40
8397,Ord_5348,Prod_15,SHP_7469,Cust_1798,3872.8700,0.03,23,565.34,30.00,0.62


In [11]:
# Print a summary of the frame: index range, column names, non-null counts, dtypes and memory usage
market_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8399 entries, 0 to 8398
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Ord_id               8399 non-null   object 
 1   Prod_id              8399 non-null   object 
 2   Ship_id              8399 non-null   object 
 3   Cust_id              8399 non-null   object 
 4   Sales                8399 non-null   float64
 5   Discount             8399 non-null   float64
 6   Order_Quantity       8399 non-null   int64  
 7   Profit               8399 non-null   float64
 8   Shipping_Cost        8399 non-null   float64
 9   Product_Base_Margin  8336 non-null   float64
dtypes: float64(5), int64(1), object(4)
memory usage: 656.3+ KB


In [12]:
# Dimensions of the frame as a (rows, columns) tuple
market_df.shape

(8399, 10)

In [13]:
# The frame's data as a 2-D NumPy array (dtype=object here because the columns have mixed types)
market_df.to_numpy()

array([['Ord_5446', 'Prod_16', 'SHP_7609', ..., -30.51, 3.6, 0.56],
       ['Ord_5406', 'Prod_13', 'SHP_7549', ..., 4.56, 0.93, 0.54],
       ['Ord_5446', 'Prod_4', 'SHP_7610', ..., 1148.9, 2.5, 0.59],
       ...,
       ['Ord_5388', 'Prod_6', 'SHP_7524', ..., -70.85, 5.35, 0.4],
       ['Ord_5348', 'Prod_15', 'SHP_7469', ..., 565.34, 30.0, 0.62],
       ['Ord_5459', 'Prod_6', 'SHP_7628', ..., 131.39, 4.86, 0.38]],
      dtype=object)

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
market_df.describe()

Unnamed: 0,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
count,8399.0,8399.0,8399.0,8399.0,8399.0,8336.0
mean,1775.878179,0.049671,25.571735,181.184424,12.838557,0.512513
std,3585.050525,0.031823,14.481071,1196.653371,17.264052,0.135589
min,2.24,0.0,1.0,-14140.7,0.49,0.35
25%,143.195,0.02,13.0,-83.315,3.3,0.38
50%,449.42,0.05,26.0,-1.5,6.07,0.52
75%,1709.32,0.08,38.0,162.75,13.99,0.59
max,89061.05,0.25,50.0,27220.69,164.73,0.85


In [15]:
# Take an independent copy so that changes made to market_df_1 do not affect market_df
market_df_1 = market_df.copy()

In [16]:
# Setting Ord_id as the row index.
# set_index returns a new frame, so building market_df_1 directly from market_df keeps
# this cell re-runnable (an in-place set_index fails on a second run because the
# 'Ord_id' column has already been moved into the index).
market_df_1 = market_df.set_index('Ord_id')
market_df_1

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_5446,Prod_16,SHP_7609,Cust_1818,136.8100,0.01,23,-30.51,3.60,0.56
Ord_5406,Prod_13,SHP_7549,Cust_1818,42.2700,0.01,13,4.56,0.93,0.54
Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
...,...,...,...,...,...,...,...,...,...
Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
Ord_5411,Prod_6,SHP_7555,Cust_1798,127.1600,0.10,20,-74.03,6.92,0.37
Ord_5388,Prod_6,SHP_7524,Cust_1798,243.0500,0.02,39,-70.85,5.35,0.40
Ord_5348,Prod_15,SHP_7469,Cust_1798,3872.8700,0.03,23,565.34,30.00,0.62


*Sorting dataframe can be done in 2 ways - by index and by values*

In [17]:
# Sorting rows by index label, in descending order
# The Ord_id labels are strings, so the order is lexicographic ('Ord_999' sorts before 'Ord_100')
# The sorted frame is returned for display only; market_df_1 is not reassigned here
market_df_1.sort_index(axis=0, ascending = False)

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_999,Prod_15,SHP_1383,Cust_361,5661.08,0.00,33,1055.47,30.00,0.62
Ord_998,Prod_8,SHP_1380,Cust_372,750.66,0.00,33,120.05,4.00,0.60
Ord_998,Prod_5,SHP_1382,Cust_372,2149.37,0.03,42,217.87,19.99,0.55
Ord_998,Prod_8,SHP_1381,Cust_372,254.32,0.01,8,-117.39,6.50,0.79
Ord_997,Prod_14,SHP_1379,Cust_365,28761.52,0.04,8,285.11,24.49,0.37
...,...,...,...,...,...,...,...,...,...
Ord_1001,Prod_5,SHP_1385,Cust_374,1981.26,0.07,49,100.80,8.66,0.76
Ord_1000,Prod_6,SHP_1384,Cust_373,334.71,0.01,25,31.74,6.47,0.38
Ord_100,Prod_8,SHP_138,Cust_58,121.12,0.10,3,-118.82,1.99,0.44
Ord_10,Prod_3,SHP_13,Cust_10,80.61,0.02,15,-4.72,2.99,0.37


In [18]:
# Sorting rows by the values of one column (largest Discount first)
market_df_1.sort_values(by = 'Discount', ascending = False)

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_13,Prod_11,SHP_17,Cust_12,663.784,0.25,11,-481.04,69.00,0.68
Ord_162,Prod_5,SHP_219,Cust_42,338.520,0.21,22,-17.75,8.99,0.39
Ord_916,Prod_5,SHP_1262,Cust_328,27.960,0.17,1,-9.13,13.56,0.58
Ord_960,Prod_5,SHP_1329,Cust_349,651.900,0.16,49,-74.51,6.85,0.54
Ord_876,Prod_5,SHP_1204,Cust_299,586.110,0.11,43,98.44,4.98,0.48
...,...,...,...,...,...,...,...,...,...
Ord_2774,Prod_9,SHP_3809,Cust_1015,3191.240,0.00,38,1620.23,5.01,0.38
Ord_2605,Prod_6,SHP_3566,Cust_991,197.150,0.00,27,-23.34,5.20,0.37
Ord_2443,Prod_2,SHP_3351,Cust_945,342.400,0.00,26,-14.06,6.96,0.50
Ord_4963,Prod_7,SHP_6927,Cust_1695,20.950,0.00,14,2.81,0.70,0.38


In [19]:
# Sorting by multiple columns: Discount first, with ties broken by Shipping_Cost
# A single ascending=False applies to every column in the list (both descending)
market_df_1.sort_values(by = ['Discount', 'Shipping_Cost'], ascending = False)

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_13,Prod_11,SHP_17,Cust_12,663.784,0.25,11,-481.04,69.00,0.68
Ord_162,Prod_5,SHP_219,Cust_42,338.520,0.21,22,-17.75,8.99,0.39
Ord_916,Prod_5,SHP_1262,Cust_328,27.960,0.17,1,-9.13,13.56,0.58
Ord_960,Prod_5,SHP_1329,Cust_349,651.900,0.16,49,-74.51,6.85,0.54
Ord_876,Prod_5,SHP_1204,Cust_299,586.110,0.11,43,98.44,4.98,0.48
...,...,...,...,...,...,...,...,...,...
Ord_4026,Prod_12,SHP_5603,Cust_1389,69.060,0.00,21,28.47,0.50,0.37
Ord_4415,Prod_12,SHP_6154,Cust_1488,21.560,0.00,5,2.75,0.50,0.39
Ord_4990,Prod_12,SHP_6966,Cust_1693,104.330,0.00,39,39.91,0.50,0.39
Ord_2722,Prod_12,SHP_3729,Cust_1006,34.010,0.00,12,10.58,0.50,0.39


In [None]:
# Rows at positions 2 through 6 (the slice stop, 7, is excluded)
market_df.iloc[2:7]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38
5,Ord_5446,Prod_6,SHP_7608,Cust_1818,164.02,0.03,23,-47.64,6.15,0.37
6,Ord_31,Prod_12,SHP_41,Cust_26,14.76,0.01,5,1.32,0.5,0.36


In [None]:
# Every second row, beginning at position 5 (start:stop:step with a step of 2)
market_df.iloc[5::2]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
5,Ord_5446,Prod_6,SHP_7608,Cust_1818,164.0200,0.03,23,-47.64,6.15,0.37
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
9,Ord_4725,Prod_6,SHP_6593,Cust_1641,57.2200,0.07,8,-27.72,6.60,0.37
11,Ord_1925,Prod_6,SHP_2637,Cust_708,465.9000,0.05,38,79.34,4.86,0.38
13,Ord_2207,Prod_11,SHP_3093,Cust_839,3364.2480,0.10,15,-693.23,61.76,0.78
...,...,...,...,...,...,...,...,...,...,...
8389,Ord_4444,Prod_13,SHP_6192,Cust_1519,159.4100,0.00,44,34.68,0.98,0.52
8391,Ord_5435,Prod_4,SHP_7594,Cust_1798,1991.8985,0.07,20,88.36,7.69,0.58
8393,Ord_5348,Prod_8,SHP_7470,Cust_1798,356.7200,0.07,9,12.61,1.99,0.44
8395,Ord_5411,Prod_6,SHP_7555,Cust_1798,127.1600,0.10,20,-74.03,6.92,0.37


#### Selecting Columns

In [None]:
# Selecting one column with df['column']; the result is a Series
sales = market_df['Sales'] # attribute access (market_df.Sales) selects the same column
sales

0        136.8100
1         42.2700
2       4701.6900
3       2337.8900
4       4233.1500
          ...    
8394    2841.4395
8395     127.1600
8396     243.0500
8397    3872.8700
8398     603.6900
Name: Sales, Length: 8399, dtype: float64

In [23]:
# Single brackets with one column name return a Series
type(market_df['Sales'])

pandas.core.series.Series

In [29]:
# Double brackets (a list of column names) return a DataFrame, even for a single column
market_df[['Sales']]

Unnamed: 0,Sales
0,136.8100
1,42.2700
2,4701.6900
3,2337.8900
4,4233.1500
...,...
8394,2841.4395
8395,127.1600
8396,243.0500
8397,3872.8700


In [28]:
# Confirming that a list of column names yields a DataFrame
type(market_df[['Sales']])

pandas.core.frame.DataFrame

In [26]:
# Selecting several columns at once by passing a list of column names
market_df[['Cust_id','Sales', 'Profit']]

Unnamed: 0,Cust_id,Sales,Profit
0,Cust_1818,136.8100,-30.51
1,Cust_1818,42.2700,4.56
2,Cust_1818,4701.6900,1148.90
3,Cust_1818,2337.8900,729.34
4,Cust_1818,4233.1500,1219.87
...,...,...,...
8394,Cust_1798,2841.4395,374.63
8395,Cust_1798,127.1600,-74.03
8396,Cust_1798,243.0500,-70.85
8397,Cust_1798,3872.8700,565.34


In [27]:
# A multi-column selection is a DataFrame
type(market_df[['Cust_id','Sales', 'Profit']])

pandas.core.frame.DataFrame

In [30]:
# Use Ord_id as the row index and keep only the first five rows
# (set_index returns a new frame, so market_df itself is left unchanged)
market_df_2 = market_df.set_index('Ord_id').head()
market_df_2

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56
Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54
Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


##### Position and Label Based Indexing: ```df.iloc``` and ```df.loc```

In [31]:
# Show the built-in documentation for .iloc (integer-position based indexing)
help(pd.DataFrame.iloc)

Help on property:

iloc
    Purely integer-location based indexing for selection by position.

    .. deprecated:: 2.2.0

       Returning a tuple from a callable is deprecated.

    ``.iloc[]`` is primarily integer position based (from ``0`` to
    ``length-1`` of the axis), but may also be used with a boolean
    array.

    Allowed inputs are:

    - An integer, e.g. ``5``.
    - A list or array of integers, e.g. ``[4, 3, 0]``.
    - A slice object with ints, e.g. ``1:7``.
    - A boolean array.
    - A ``callable`` function with one argument (the calling Series or
      DataFrame) and that returns valid output for indexing (one of the above).
      This is useful in method chains, when you don't have a reference to the
      calling object, but would like to base your selection on
      some value.
    - A tuple of row and column indexes. The tuple elements consist of one of the
      above inputs, e.g. ``(0, 1)``.

    ``.iloc`` will raise ``IndexError`` if a requested indexer is


In [32]:
# Show the built-in documentation for .loc (label based indexing)
help(pd.DataFrame.loc)

Help on property:

loc
    Access a group of rows and columns by label(s) or a boolean array.

    ``.loc[]`` is primarily label based, but may also be used with a
    boolean array.

    Allowed inputs are:

    - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
      interpreted as a *label* of the index, and **never** as an
      integer position along the index).
    - A list or array of labels, e.g. ``['a', 'b', 'c']``.
    - A slice object with labels, e.g. ``'a':'f'``.

          start and the stop are included

    - A boolean array of the same length as the axis being sliced,
      e.g. ``[True, False, True]``.
    - An alignable boolean Series. The index of the key will be aligned before
      masking.
    - An alignable Index. The Index of the returned selection will be the input.
    - A ``callable`` function with one argument (the calling Series or
      DataFrame) and that returns valid output for indexing (one of the above)

    See more at :ref:`Selection by L

In [57]:
# Selecting a single element by position: iloc[row_position, column_position]
# Positions are 0-based, so (2, 4) is the third row and the fifth column (Sales)
market_df.iloc[2, 4]

np.float64(4701.69)

In [None]:
# Selecting specific rows by position: a list picks exactly rows 2 and 4 (not the range between them)
market_df.iloc[[2, 4]]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


In [38]:
# A single integer position returns that row as a Series (the column names become its index)
market_df.iloc[5]

Ord_id                  Ord_5446
Prod_id                   Prod_6
Ship_id                 SHP_7608
Cust_id                Cust_1818
Sales                     164.02
Discount                    0.03
Order_Quantity                23
Profit                    -47.64
Shipping_Cost               6.15
Product_Base_Margin         0.37
Name: 5, dtype: object

In [60]:
# A scalar position gives a Series
type(market_df.iloc[5])

pandas.core.series.Series

In [39]:
# Wrapping the position in a list returns a one-row DataFrame instead of a Series
market_df.iloc[[5]]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
5,Ord_5446,Prod_6,SHP_7608,Cust_1818,164.02,0.03,23,-47.64,6.15,0.37


In [61]:
# A list of positions gives a DataFrame
type(market_df.iloc[[5]])

pandas.core.frame.DataFrame

In [None]:
# Row 5 again, this time with the column selector written out explicitly
# The ":" indicates "all rows/columns"
market_df.iloc[5, :]

# equivalent to market_df.iloc[5, ] or market_df.iloc[5] (each returns the row as a Series)

Ord_id                  Ord_5446
Prod_id                   Prod_6
Ship_id                 SHP_7608
Cust_id                Cust_1818
Sales                     164.02
Discount                    0.03
Order_Quantity                23
Profit                    -47.64
Shipping_Cost               6.15
Product_Base_Margin         0.37
Name: 5, dtype: object

In [42]:
# Selecting the rows at positions 3, 7 and 8; the ":" keeps all columns
market_df.iloc[[3, 7, 8], :]

# same as market_df.iloc[[3, 7, 8], ] or market_df.iloc[[3, 7, 8]]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.1,48,1137.91,0.99,0.55
8,Ord_4725,Prod_13,SHP_6593,Cust_1641,162.0,0.01,33,45.84,0.71,0.52


In [None]:
# A position slice excludes its stop, so 2:4 selects rows 2 and 3 only
market_df.iloc[2:4]
# equivalent to market_df.iloc[[2, 3]] (not market_df.iloc[[2, 4]], which picks rows 2 and 4)

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37


In [None]:
# Rows 2 and 3 (the stop position 4 is excluded), with all columns
market_df.iloc[2:4, :]
# equivalent to market_df.iloc[[2, 3]] or market_df.iloc[2:4, ]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37


In [49]:
# Selecting a single column
# Notice that the column index starts at 0, and 2 represents the third column (Ship_id)
market_df.iloc[:, 2]

0       SHP_7609
1       SHP_7549
2       SHP_7610
3       SHP_7625
4       SHP_7664
          ...   
8394    SHP_7479
8395    SHP_7555
8396    SHP_7524
8397    SHP_7469
8398    SHP_7628
Name: Ship_id, Length: 8399, dtype: object

In [50]:
# Selecting multiple columns: positions 3 to 7, Cust_id through Profit (the stop position 8 is excluded)
market_df.iloc[:, 3:8]

Unnamed: 0,Cust_id,Sales,Discount,Order_Quantity,Profit
0,Cust_1818,136.8100,0.01,23,-30.51
1,Cust_1818,42.2700,0.01,13,4.56
2,Cust_1818,4701.6900,0.00,26,1148.90
3,Cust_1818,2337.8900,0.09,43,729.34
4,Cust_1818,4233.1500,0.08,35,1219.87
...,...,...,...,...,...
8394,Cust_1798,2841.4395,0.08,28,374.63
8395,Cust_1798,127.1600,0.10,20,-74.03
8396,Cust_1798,243.0500,0.02,39,-70.85
8397,Cust_1798,3872.8700,0.03,23,565.34


In [55]:
# Selecting multiple rows and columns: row positions 3-5 and column positions 2-4
market_df.iloc[3:6, 2:5]

Unnamed: 0,Ship_id,Cust_id,Sales
3,SHP_7625,Cust_1818,2337.89
4,SHP_7664,Cust_1818,4233.15
5,SHP_7608,Cust_1818,164.02


In [None]:
# Using booleans
# A boolean list passed to iloc must contain exactly one entry per row; a list of the
# wrong length raises an IndexError. The mask below has 7 entries, so it is applied
# to the first 7 rows, and the rows marked True are returned.
market_df.iloc[:7].iloc[[True, True, False, True, True, False, True]]

**Label-based indexing with `df.loc`**

In [63]:
# Selecting a single element by label
# Select row label = 2 and column label = 'Sales'
market_df.loc[2, 'Sales']

np.float64(4701.69)

In [64]:
# Selecting the row whose index label is 5 (returned as a Series)
market_df.loc[5]

Ord_id                  Ord_5446
Prod_id                   Prod_6
Ship_id                 SHP_7608
Cust_id                Cust_1818
Sales                     164.02
Discount                    0.03
Order_Quantity                23
Profit                    -47.64
Shipping_Cost               6.15
Product_Base_Margin         0.37
Name: 5, dtype: object

In [65]:
# The same row with the column selector written out: ":" means all columns
market_df.loc[5, :]

# or market_df.loc[5, ]

Ord_id                  Ord_5446
Prod_id                   Prod_6
Ship_id                 SHP_7608
Cust_id                Cust_1818
Sales                     164.02
Discount                    0.03
Order_Quantity                23
Profit                    -47.64
Shipping_Cost               6.15
Product_Base_Margin         0.37
Name: 5, dtype: object

In [66]:
# Select multiple rows using a list of row labels (rows come back in the order listed)
market_df.loc[[3, 7, 8]]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.1,48,1137.91,0.99,0.55
8,Ord_4725,Prod_13,SHP_6593,Cust_1641,162.0,0.01,33,45.84,0.71,0.52


In [67]:
# The same rows with an explicit ":" to keep all columns
market_df.loc[[3, 7, 8], :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.1,48,1137.91,0.99,0.55
8,Ord_4725,Prod_13,SHP_6593,Cust_1641,162.0,0.01,33,45.84,0.71,0.52


In [None]:
# Selecting rows using a range of labels
# Notice that with df.loc, both 4 and 8 are included (label slices are inclusive at both ends),
# whereas df.iloc[4:8] stops before position 8
# This is an important difference between iloc and loc
market_df.loc[4:8]
# Or equivalently market_df.loc[4:8, ] or market_df.loc[4:8, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38
5,Ord_5446,Prod_6,SHP_7608,Cust_1818,164.02,0.03,23,-47.64,6.15,0.37
6,Ord_31,Prod_12,SHP_41,Cust_26,14.76,0.01,5,1.32,0.5,0.36
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.1,48,1137.91,0.99,0.55
8,Ord_4725,Prod_13,SHP_6593,Cust_1641,162.0,0.01,33,45.84,0.71,0.52


In [None]:
# The same slice with iloc is NOT the same as loc: the stop position 8 is excluded,
# so only rows 4 to 7 are returned (loc[4:8] also returns row 8)
market_df.iloc[4:8]
# Or equivalently market_df.iloc[4:8, ] or market_df.iloc[4:8, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38
5,Ord_5446,Prod_6,SHP_7608,Cust_1818,164.02,0.03,23,-47.64,6.15,0.37
6,Ord_31,Prod_12,SHP_41,Cust_26,14.76,0.01,5,1.32,0.5,0.36
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.1,48,1137.91,0.99,0.55


In [75]:
# market_df_1 is the frame indexed by Ord_id; its labels are strings and are not unique
market_df_1

Unnamed: 0_level_0,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ord_5446,Prod_16,SHP_7609,Cust_1818,136.8100,0.01,23,-30.51,3.60,0.56
Ord_5406,Prod_13,SHP_7549,Cust_1818,42.2700,0.01,13,4.56,0.93,0.54
Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
...,...,...,...,...,...,...,...,...,...
Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
Ord_5411,Prod_6,SHP_7555,Cust_1798,127.1600,0.10,20,-74.03,6.92,0.37
Ord_5388,Prod_6,SHP_7524,Cust_1798,243.0500,0.02,39,-70.85,5.35,0.40
Ord_5348,Prod_15,SHP_7469,Cust_1798,3872.8700,0.03,23,565.34,30.00,0.62


In [76]:
# Select Ord_id = Ord_5406 and some columns (columns come back in the order listed)
market_df_1.loc['Ord_5406', ['Sales', 'Profit', 'Cust_id']]

Sales          42.27
Profit          4.56
Cust_id    Cust_1818
Name: Ord_5406, dtype: object

In [89]:
# The same selection with iloc: row position 1 and column positions 3, 6, 2 (Sales, Profit, Cust_id)
market_df_1.iloc[1, [3,6,2]]

Sales          42.27
Profit          4.56
Cust_id    Cust_1818
Name: Ord_5406, dtype: object

In [None]:
# Select multiple orders using labels, and a label slice of columns (Sales through Profit, both included)
# Ord_id labels are not unique, so every row labelled 'Ord_5446' is returned
market_df_1.loc[['Ord_5406', 'Ord_5446', 'Ord_5485'], 'Sales':'Profit']

Unnamed: 0_level_0,Sales,Discount,Order_Quantity,Profit
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ord_5406,42.27,0.01,13,4.56
Ord_5446,136.81,0.01,23,-30.51
Ord_5446,4701.69,0.0,26,1148.9
Ord_5446,164.02,0.03,23,-47.64
Ord_5485,4233.15,0.08,35,1219.87


In [90]:
# With iloc: row positions 1, 2, 4 and column positions 3 to 6 (Sales through Profit)
# Each position picks exactly one row, so only one of the 'Ord_5446' rows appears
market_df_1.iloc[[1,2,4], [3,4,5,6]]

Unnamed: 0_level_0,Sales,Discount,Order_Quantity,Profit
Ord_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ord_5406,42.27,0.01,13,4.56
Ord_5446,4701.69,0.0,26,1148.9
Ord_5485,4233.15,0.08,35,1219.87


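The difference between the two selections above: `loc` matches by label, so every row labelled `Ord_5446` is returned (the index contains duplicate labels), whereas `iloc` matches by position, so only the single row at each requested position is returned.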

In [None]:
# Using booleans
# A boolean list passed to loc must contain exactly one entry per row; a list of the
# wrong length raises an IndexError. The mask below has 7 entries, so it is applied
# to the first 7 rows, and the rows marked True are returned.
market_df_1.iloc[:7].loc[[True, True, False, True, True, False, True]]

### Slicing

In [94]:
# Select all rows where Sales > 3000
# First, we get a boolean Series (one True/False per row) where True marks rows having Sales > 3000
market_df.Sales > 3000

0       False
1       False
2        True
3       False
4        True
        ...  
8394    False
8395    False
8396    False
8397     True
8398    False
Name: Sales, Length: 8399, dtype: bool

In [95]:
# Then, we pass this boolean Series inside df.loc; only the rows marked True are kept
market_df.loc[market_df.Sales > 3000]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
10,Ord_4743,Prod_2,SHP_6615,Cust_1641,4072.0100,0.01,43,1675.98,0.99,0.56
13,Ord_2207,Prod_11,SHP_3093,Cust_839,3364.2480,0.10,15,-693.23,61.76,0.78
...,...,...,...,...,...,...,...,...,...,...
8366,Ord_3593,Prod_3,SHP_4974,Cust_1274,12073.0600,0.03,39,5081.87,19.99,0.38
8367,Ord_3593,Prod_15,SHP_4975,Cust_1274,6685.0500,0.09,25,1653.60,24.49,
8371,Ord_2624,Prod_4,SHP_3591,Cust_1006,4924.1350,0.07,28,1049.54,8.99,0.58
8383,Ord_2722,Prod_1,SHP_3731,Cust_1006,3508.3300,0.04,21,-546.98,35.00,0.85


In [96]:
# An alternative to df.Sales is df['Sales']
# You may want to put the : to indicate that you want all columns
# It is more explicit
market_df.loc[market_df['Sales'] > 3000, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
10,Ord_4743,Prod_2,SHP_6615,Cust_1641,4072.0100,0.01,43,1675.98,0.99,0.56
13,Ord_2207,Prod_11,SHP_3093,Cust_839,3364.2480,0.10,15,-693.23,61.76,0.78
...,...,...,...,...,...,...,...,...,...,...
8366,Ord_3593,Prod_3,SHP_4974,Cust_1274,12073.0600,0.03,39,5081.87,19.99,0.38
8367,Ord_3593,Prod_15,SHP_4975,Cust_1274,6685.0500,0.09,25,1653.60,24.49,
8371,Ord_2624,Prod_4,SHP_3591,Cust_1006,4924.1350,0.07,28,1049.54,8.99,0.58
8383,Ord_2722,Prod_1,SHP_3731,Cust_1006,3508.3300,0.04,21,-546.98,35.00,0.85


In [97]:
# Conditions are combined element-wise with the & operator (each comparison in its own parentheses)
# E.g. all orders having 2000 < Sales < 3000 and Profit > 100
sales_in_range = (market_df.Sales > 2000) & (market_df.Sales < 3000)
profit_above_100 = market_df.Profit > 100
market_df.loc[sales_in_range & profit_above_100, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
81,Ord_5205,Prod_4,SHP_7274,Cust_1749,2546.5235,0.09,26,210.00,7.69,0.59
109,Ord_139,Prod_17,SHP_186,Cust_45,2671.2100,0.06,14,636.18,15.59,0.36
110,Ord_239,Prod_4,SHP_332,Cust_45,2157.3085,0.00,38,519.25,5.31,0.57
141,Ord_1673,Prod_17,SHP_2314,Cust_498,2027.5500,0.04,14,537.40,13.99,0.37
...,...,...,...,...,...,...,...,...,...,...
8338,Ord_2107,Prod_2,SHP_2882,Cust_785,2409.9600,0.07,32,575.10,4.50,0.59
8350,Ord_3570,Prod_4,SHP_4942,Cust_1266,2094.9780,0.06,44,697.29,1.25,0.55
8354,Ord_3592,Prod_4,SHP_4973,Cust_1266,2614.3705,0.07,25,384.01,7.69,0.58
8381,Ord_2696,Prod_4,SHP_3691,Cust_1006,2836.0505,0.01,25,561.13,8.99,0.59


In [98]:
# The 'OR' operator is represented by a | (Note that the Python keyword 'or' doesn't work on pandas Series)
# E.g. all orders having Sales > 2000 OR Profit > 100
market_df.loc[(market_df.Sales > 2000) | (market_df.Profit > 100), :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
10,Ord_4743,Prod_2,SHP_6615,Cust_1641,4072.0100,0.01,43,1675.98,0.99,0.56
...,...,...,...,...,...,...,...,...,...,...
8383,Ord_2722,Prod_1,SHP_3731,Cust_1006,3508.3300,0.04,21,-546.98,35.00,0.85
8385,Ord_1833,Prod_3,SHP_2527,Cust_637,611.1600,0.04,46,100.22,4.98,0.40
8394,Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
8397,Ord_5348,Prod_15,SHP_7469,Cust_1798,3872.8700,0.03,23,565.34,30.00,0.62


In [99]:
# Row filter and column selection in one loc call:
# orders having 2000 < Sales < 3000 and Profit > 100, keeping only Cust_id, Sales and Profit
row_filter = (market_df.Sales > 2000) & (market_df.Sales < 3000) & (market_df.Profit > 100)
kept_columns = ['Cust_id', 'Sales', 'Profit']
market_df.loc[row_filter, kept_columns]

Unnamed: 0,Cust_id,Sales,Profit
3,Cust_1818,2337.8900,729.34
81,Cust_1749,2546.5235,210.00
109,Cust_45,2671.2100,636.18
110,Cust_45,2157.3085,519.25
141,Cust_498,2027.5500,537.40
...,...,...,...
8338,Cust_785,2409.9600,575.10
8350,Cust_1266,2094.9780,697.29
8354,Cust_1266,2614.3705,384.01
8381,Cust_1006,2836.0505,561.13


In [100]:
# You can use the == and != operators
# Only the last expression of a cell is displayed automatically, so the first result is
# shown explicitly with display() (available by default in Jupyter/IPython sessions)
display(market_df.loc[(market_df.Sales == 4233.15), :])
market_df.loc[(market_df.Sales != 1000), :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.8100,0.01,23,-30.51,3.60,0.56
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.2700,0.01,13,4.56,0.93,0.54
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
...,...,...,...,...,...,...,...,...,...,...
8394,Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
8395,Ord_5411,Prod_6,SHP_7555,Cust_1798,127.1600,0.10,20,-74.03,6.92,0.37
8396,Ord_5388,Prod_6,SHP_7524,Cust_1798,243.0500,0.02,39,-70.85,5.35,0.40
8397,Ord_5348,Prod_15,SHP_7469,Cust_1798,3872.8700,0.03,23,565.34,30.00,0.62


In [101]:
# You may want to select rows whose column value is in an iterable
# For instance, say a colleague gives you a list of customer_ids from a certain region

customers_in_bangalore = ['Cust_1798', 'Cust_1519', 'Cust_637', 'Cust_851']

# To get all the orders from these customers, use the isin() function
# It returns a boolean, which you can use to select rows
market_df.loc[market_df['Cust_id'].isin(customers_in_bangalore), :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
8385,Ord_1833,Prod_3,SHP_2527,Cust_637,611.16,0.04,46,100.22,4.98,0.4
8386,Ord_2324,Prod_7,SHP_3189,Cust_851,121.87,0.07,39,11.32,1.35,0.4
8387,Ord_2220,Prod_3,SHP_3019,Cust_851,41.06,0.04,4,-16.39,6.28,0.35
8388,Ord_4424,Prod_1,SHP_6165,Cust_1519,994.04,0.03,10,-335.06,35.0,
8389,Ord_4444,Prod_13,SHP_6192,Cust_1519,159.41,0.0,44,34.68,0.98,0.52
8390,Ord_5435,Prod_16,SHP_7594,Cust_1798,316.99,0.04,47,-276.54,8.37,0.58
8391,Ord_5435,Prod_4,SHP_7594,Cust_1798,1991.8985,0.07,20,88.36,7.69,0.58
8392,Ord_5384,Prod_9,SHP_7519,Cust_1798,181.5,0.08,43,-6.24,2.5,0.37
8393,Ord_5348,Prod_8,SHP_7470,Cust_1798,356.72,0.07,9,12.61,1.99,0.44
8394,Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
