# Slicing

The many ways to slice data in Python.

----

## Lists

----

In [1]:
import numpy as np

In [2]:
my_list = [f'{x:.4f}' for x in np.random.randn(14)]

In [3]:
my_list

['0.0618',
 '-0.2938',
 '-0.4301',
 '-0.2013',
 '-0.5821',
 '0.8824',
 '-1.4016',
 '1.1005',
 '0.3570',
 '-0.8012',
 '0.7421',
 '-0.3374',
 '-0.2854',
 '-0.4207']

In [4]:
print("fifth list element:", my_list[4], "\nfourth last list element:", my_list[-4])

fifth list element: -0.5821 
fourth last list element: 0.7421


In [5]:
my_list[3:7]

['-0.2013', '-0.5821', '0.8824', '-1.4016']

In [6]:
my_list[::2]

['0.0618', '-0.4301', '-0.5821', '-1.4016', '0.3570', '0.7421', '-0.2854']

In [7]:
# Split a 5x5 array of random draws into its five rows (each row is
# still a NumPy array at this point).
list_of_lists = list(np.random.randn(5, 5))

# Convert every row into a plain Python list to obtain a true 2-D list.
l_of_l = [list(row) for row in list_of_lists]

In [8]:
# A 2-D list
l_of_l

[[-0.2610980940418338,
  -1.1172367592005865,
  -0.47054707404188634,
  -0.7873524483125123,
  -1.2352762658096932],
 [-2.2833697423635066,
  -2.7209147047464555,
  1.550361139953513,
  -1.2428933447187447,
  1.6577402552379281],
 [0.3246508113399237,
  1.0540636403685861,
  1.4098905446384615,
  1.2083403052415593,
  -1.0476213027760435],
 [1.2053130516346495,
  0.1221706897149656,
  0.47629250550943747,
  0.14783093578419682,
  1.3647936896754458],
 [-0.23267209597334318,
  -2.0268800723762825,
  -0.12271734423319945,
  0.8934928714008672,
  -1.0410161608915594]]

In [9]:
l_of_l[3][3]

0.14783093578419682

In [10]:
for l in l_of_l[:3]:
    print(l[3])

-0.7873524483125123
-1.2428933447187447
1.2083403052415593


In [11]:
[l[3] for l in l_of_l[:3]]

[-0.7873524483125123, -1.2428933447187447, 1.2083403052415593]

**NumPy** comes to the rescue in the *NumPy Arrays* section below ...

In [12]:
my_slice = slice(1, 8, 2)
my_list[my_slice]

['-0.2938', '-0.2013', '0.8824', '1.1005']

In [13]:
from itertools import islice

In [14]:
iter_slice = islice(my_list, 1, 8, 2)

[float(i) for i in iter_slice]

[-0.2938, -0.2013, 0.8824, 1.1005]

----

## NumPy Arrays

----

In [18]:
import numpy as np

num_2d_array = np.array(l_of_l)
num_2d_array

array([[-0.26109809, -1.11723676, -0.47054707, -0.78735245, -1.23527627],
       [-2.28336974, -2.7209147 ,  1.55036114, -1.24289334,  1.65774026],
       [ 0.32465081,  1.05406364,  1.40989054,  1.20834031, -1.0476213 ],
       [ 1.20531305,  0.12217069,  0.47629251,  0.14783094,  1.36479369],
       [-0.2326721 , -2.02688007, -0.12271734,  0.89349287, -1.04101616]])

In [16]:
num_2d_array[0:1]

array([[-0.26109809, -1.11723676, -0.47054707, -0.78735245, -1.23527627]])

In [17]:
num_2d_array[:,0]

array([-0.26109809, -2.28336974,  0.32465081,  1.20531305, -0.2326721 ])

In [19]:
# Analogous to the 2-D list example above but far
# less verbose and more efficient!
num_2d_array[:3,3]

array([-0.78735245, -1.24289334,  1.20834031])

In [21]:
# NumPy also supports boolean indexing: the mask `num_2d_array > 0`
# keeps only the positive entries, which come back as a 1-D array.
num_2d_array[num_2d_array > 0]

array([1.55036114, 1.65774026, 0.32465081, 1.05406364, 1.40989054,
       1.20834031, 1.20531305, 0.12217069, 0.47629251, 0.14783094,
       1.36479369, 0.89349287])

Remember that NumPy arrays are homogeneous with regard to data type. Simple 1-D slicing of a NumPy array uses exactly the same syntax as slicing a 1-D list, but a NumPy slice is a view of the original data, whereas a list slice is a new list.

----

## Strings

----

In [15]:
my_string = "My name is Cody"

In [16]:
my_string[-4:]

'Cody'

In [17]:
my_string[3:7]

'name'

In [18]:
my_string[::2]

'M aei oy'

----

## Dictionaries

----

In [19]:
# Seven keys and seven values of mixed types (list, str, int, float,
# tuple, dict, set); they are paired up by position below.
key_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
value_list = [[1, 2, 3, 4, 5], 'My name is Cody', 77, 0.1234, (123, 456), {"a dict": "within a dict"}, {1, 2, 3, 4}]

mapping = {}

# Build the dictionary one entry at a time: zip() yields (key, value)
# pairs taken from the same position in each list.
for key, value in zip(key_list, value_list):
    mapping[key] = value

In [20]:
mapping

{'a': [1, 2, 3, 4, 5],
 'b': 'My name is Cody',
 'c': 77,
 'd': 0.1234,
 'e': (123, 456),
 'f': {'a dict': 'within a dict'},
 'g': {1, 2, 3, 4}}

In [21]:
comp_mapping = {key : value for key, value in zip(key_list, value_list)}

comp_mapping

{'a': [1, 2, 3, 4, 5],
 'b': 'My name is Cody',
 'c': 77,
 'd': 0.1234,
 'e': (123, 456),
 'f': {'a dict': 'within a dict'},
 'g': {1, 2, 3, 4}}

In [22]:
mapping['c']

77

In [23]:
key_slice = ['a', 'c', 'g']

values_slice = [mapping[key] for key in key_slice]
values_slice

[[1, 2, 3, 4, 5], 77, {1, 2, 3, 4}]

In [24]:
(123,456) in mapping.values()

True

In [25]:
def dict_search(dictionary, search_term):

    ''' Return the FIRST key whose value matches the search term.

    dictionary: the dict() object to search
    search_term: the value to look for among the dictionary's values

    Entries are examined in the dictionary's own iteration order and
    the search stops at the first match. Raises ValueError when no
    value matches.
    '''

    for key, value in dictionary.items():
        # The identity check lets a value that is not equal to itself
        # (e.g. float('nan')) still be found when the same object is
        # passed in as the search term.
        if value is search_term or value == search_term:
            return key

    raise ValueError(f'{search_term!r} is not a value in the dictionary')

In [26]:
dict_search(mapping, 77)

'c'

In [27]:
def dict_search_all(dictionary, search_term):

    '''Return a list of ALL keys whose value equals the search term.

    dictionary: a dict() object
    search_term: the value to look for among the dictionary's values

    Keys appear in the dictionary's iteration order. An empty list is
    returned when nothing matches.
    '''

    # A single pass over the (key, value) pairs; no index bookkeeping
    # or repeated rebuilding of the key list is needed.
    return [key for key, value in dictionary.items() if value == search_term]

In [28]:
dict_search_all(mapping, 77)

['c']

In [29]:
for i, v in mapping.items():
    print(i, v)

a [1, 2, 3, 4, 5]
b My name is Cody
c 77
d 0.1234
e (123, 456)
f {'a dict': 'within a dict'}
g {1, 2, 3, 4}


In [30]:
search_term = 77

for i, v in mapping.items():
    if v == search_term:
        print(i, v)

c 77


----

## `collections`

----

When thinking about slicing, the standard-library `collections` module provides many useful container objects.

#### `Counter`, with useful methods like `most_common`

In [31]:
from collections import Counter

In [32]:
c = Counter({'red': 4, 'blue': 2})
c['red']

4

In [33]:
c = Counter('My name is Cody, and I love to program with Python')
c['y']

3

In [34]:
c.most_common(3)

[(' ', 10), ('o', 5), ('y', 3)]

#### `deque`, with useful methods for appending and popping values

It is worth the effort to work through the examples provided in the Python documentation for [deque](https://docs.python.org/3.7/library/collections.html?highlight=collections#collections.deque). There are also excellent recipes for common use cases.

In [35]:
# Values 4.0, 4.5, ..., 24.5 (step 0.5) collected into a Python list.
my_queue = list(np.arange(4, 25, 0.5))

In [36]:
from collections import deque

d = deque(my_queue)

In [37]:
d.pop()

24.5

In [38]:
d.popleft()

4.0

In [39]:
d.append(24.5)
d[-1]

24.5

In [40]:
d.appendleft(4.0)
d[0]

4.0

----

## Pandas

----

In [41]:
import pandas as pd

In [42]:
my_series = pd.Series(my_queue)

In [43]:
my_series[0:5]

0    4.0
1    4.5
2    5.0
3    5.5
4    6.0
dtype: float64

In [44]:
new_series = my_series[0:26]
len(new_series)

26

In [45]:
# One single-letter label per element, 'a' through 'z', used as the
# new index of the series.
index_list = list("abcdefghijklmnopqrstuvwxyz")
new_series.index = index_list

In [46]:
new_series[0:5]

a    4.0
b    4.5
c    5.0
d    5.5
e    6.0
dtype: float64

In [47]:
new_series['g']

7.0

#### Pandas series and dataframes with datetime indexing

In [48]:
from datetime import datetime
from datetime import timedelta

In [49]:
# 26 timestamps one day apart, counting backwards from the moment this
# cell runs: base, base - 1 day, ..., base - 25 days.
base = datetime.today()
date_index = [base - timedelta(days=days_back) for days_back in range(26)]

In [50]:
new_series.index = sorted(date_index)

In [51]:
new_series[0:5]

2019-03-21 16:00:35.108052    4.0
2019-03-22 16:00:35.108052    4.5
2019-03-23 16:00:35.108052    5.0
2019-03-24 16:00:35.108052    5.5
2019-03-25 16:00:35.108052    6.0
dtype: float64

In [52]:
new_series['April, 6 2019']

2019-04-06 16:00:35.108052    12.0
dtype: float64

In [53]:
new_series['2019-04-04':'2019-04-14']

2019-04-04 16:00:35.108052    11.0
2019-04-05 16:00:35.108052    11.5
2019-04-06 16:00:35.108052    12.0
2019-04-07 16:00:35.108052    12.5
2019-04-08 16:00:35.108052    13.0
2019-04-09 16:00:35.108052    13.5
2019-04-10 16:00:35.108052    14.0
2019-04-11 16:00:35.108052    14.5
2019-04-12 16:00:35.108052    15.0
2019-04-13 16:00:35.108052    15.5
2019-04-14 16:00:35.108052    16.0
dtype: float64

In [54]:
new_df = pd.DataFrame(new_series)

In [55]:
new_df['2019-04-02':'2019-04-04']

Unnamed: 0,0
2019-04-02 16:00:35.108052,10.0
2019-04-03 16:00:35.108052,10.5
2019-04-04 16:00:35.108052,11.0


In [56]:
new_df[new_df.index.day == 2]

Unnamed: 0,0
2019-04-02 16:00:35.108052,10.0


In [57]:
new_df[new_df.index.month == 4]

Unnamed: 0,0
2019-04-01 16:00:35.108052,9.5
2019-04-02 16:00:35.108052,10.0
2019-04-03 16:00:35.108052,10.5
2019-04-04 16:00:35.108052,11.0
2019-04-05 16:00:35.108052,11.5
2019-04-06 16:00:35.108052,12.0
2019-04-07 16:00:35.108052,12.5
2019-04-08 16:00:35.108052,13.0
2019-04-09 16:00:35.108052,13.5
2019-04-10 16:00:35.108052,14.0


In [58]:
# Two float columns of 20 evenly spaced values (step 0.25):
# 'a' covers 5.0 ... 9.75 and 'b' covers 10.0 ... 14.75.
another_df = pd.DataFrame(
    dict(
        a=np.arange(5, 10, 0.25),
        b=np.arange(10, 15, 0.25),
    )
)

In [59]:
another_df.head()

Unnamed: 0,a,b
0,5.0,10.0
1,5.25,10.25
2,5.5,10.5
3,5.75,10.75
4,6.0,11.0


In [60]:
another_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
a    20 non-null float64
b    20 non-null float64
dtypes: float64(2)
memory usage: 400.0 bytes


In [61]:
another_df.describe()

Unnamed: 0,a,b
count,20.0,20.0
mean,7.375,12.375
std,1.47902,1.47902
min,5.0,10.0
25%,6.1875,11.1875
50%,7.375,12.375
75%,8.5625,13.5625
max,9.75,14.75


In [62]:
another_df[4:14]

Unnamed: 0,a,b
4,6.0,11.0
5,6.25,11.25
6,6.5,11.5
7,6.75,11.75
8,7.0,12.0
9,7.25,12.25
10,7.5,12.5
11,7.75,12.75
12,8.0,13.0
13,8.25,13.25


In [63]:
another_df.iloc[4:14]

Unnamed: 0,a,b
4,6.0,11.0
5,6.25,11.25
6,6.5,11.5
7,6.75,11.75
8,7.0,12.0
9,7.25,12.25
10,7.5,12.5
11,7.75,12.75
12,8.0,13.0
13,8.25,13.25


In [64]:
another_df.loc[:,'b']

0     10.00
1     10.25
2     10.50
3     10.75
4     11.00
5     11.25
6     11.50
7     11.75
8     12.00
9     12.25
10    12.50
11    12.75
12    13.00
13    13.25
14    13.50
15    13.75
16    14.00
17    14.25
18    14.50
19    14.75
Name: b, dtype: float64

In [65]:
short_df = another_df[0:13]

In [66]:
# Row labels 'row0' ... 'row12', built directly instead of splitting a
# repeated string, patching each piece and popping a stray empty entry.
rows_list = [f'row{i}' for i in range(13)]
rows_list

['row0',
 'row1',
 'row2',
 'row3',
 'row4',
 'row5',
 'row6',
 'row7',
 'row8',
 'row9',
 'row10',
 'row11',
 'row12']

In [67]:
short_df.index = rows_list
short_df.columns = ['column1', 'column2']

In [68]:
short_df.loc[['row4', 'row7', 'row1'],'column2']

row4    11.00
row7    11.75
row1    10.25
Name: column2, dtype: float64

In [69]:
short_df['column1']

row0     5.00
row1     5.25
row2     5.50
row3     5.75
row4     6.00
row5     6.25
row6     6.50
row7     6.75
row8     7.00
row9     7.25
row10    7.50
row11    7.75
row12    8.00
Name: column1, dtype: float64

In [70]:
short_df[['column1', 'column2']]

Unnamed: 0,column1,column2
row0,5.0,10.0
row1,5.25,10.25
row2,5.5,10.5
row3,5.75,10.75
row4,6.0,11.0
row5,6.25,11.25
row6,6.5,11.5
row7,6.75,11.75
row8,7.0,12.0
row9,7.25,12.25


In [71]:
short_df[3:5]

Unnamed: 0,column1,column2
row3,5.75,10.75
row4,6.0,11.0


In [72]:
short_df[4:10]['column1']

row4    6.00
row5    6.25
row6    6.50
row7    6.75
row8    7.00
row9    7.25
Name: column1, dtype: float64

In [73]:
short_df['column1']['row11']

7.75

In [74]:
masked_df = short_df[short_df['column1'].between(7,9)]

In [75]:
masked_df

Unnamed: 0,column1,column2
row8,7.0,12.0
row9,7.25,12.25
row10,7.5,12.5
row11,7.75,12.75
row12,8.0,13.0
