# Pandas Series

In [92]:
import numpy as np 
import pandas as pd 

In [93]:
# A scalar handed to pd.Series becomes a one-element Series with the default integer index.
s1 = pd.Series(data=2)
s1

0    2
dtype: int64

In [94]:
# Build a Series from a NumPy range, then show its two components separately.
s2 = pd.Series(np.arange(9))
print(s2)
print(f'Index: {s2.index}')    # the row labels
print(f'Values: {s2.values}')  # the underlying NumPy array

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int64
Index: RangeIndex(start=0, stop=9, step=1)
Values: [0 1 2 3 4 5 6 7 8]


In [95]:
# Supply an explicit integer index. The labels must be a flat sequence:
# a nested list such as [[0, 1, 2]] is interpreted as the levels of a
# MultiIndex rather than as three plain integer labels.
s3 = pd.Series(['a', 'b', 'c'], index=list(range(3)))
print(s3)

0    a
1    b
2    c
dtype: object


In [96]:
# Same idea with string labels: the index holds 'a', 'b', 'c' and the values hold 1, 2, 3.
s3 = pd.Series(data=[1, 2, 3], index=list('abc'))
print(s3)
print(f'Index= {s3.index}')
print(f'Values= {s3.values}')

a    1
b    2
c    3
dtype: int64
Index= Index(['a', 'b', 'c'], dtype='object')
Values= [1 2 3]


In [97]:
# Indexing a Series whose labels are the strings 'a', 'b', 'c'.

# Positional access goes through .iloc. Plain s3[1] on a non-integer index
# relies on a deprecated positional fallback (FutureWarning in pandas 2.x,
# treated as a label lookup in later versions).
print('s3[1]:', s3.iloc[1])            # value stored at position 1
print('s3[1] index:', s3.index[1])     # label found at position 1
print('s3[1] values:', s3.values[1])   # same value, read from the underlying array

# Label access: look the value up by its index label.

print('s3[1] value by label or index:', s3['a'])
print('s3[1] value by label or index:', s3['b'])

s3[1]: 2
s3[1] index: b
s3[1] values: 2
s3[1] value by label or index: 1
s3[1] value by label or index: 2


In [98]:
# A dict maps directly onto a Series: keys become the index, values become the data.

s4 = pd.Series(dict(A=1, B=2, C=3))  # string labels, integer values
print(s4)
print('---------------')
s4 = pd.Series(dict(zip([1, 2, 3], 'abc')))  # integer labels, string values
print(s4)
print('---------------')
s4 = pd.Series(dict(zip('123', 'abc')))  # string labels, string values
print(s4)

A    1
B    2
C    3
dtype: int64
---------------
1    a
2    b
3    c
dtype: object
---------------
1    a
2    b
3    c
dtype: object


<h3>Size, shape, uniqueness, and counts of values</h3>

In [99]:
# Label the integers 0..25 with the lowercase letters 'a'..'z'.
# chr(ord('a') + offset) is the letter `offset` places after 'a', so the
# labels can be built in one comprehension without a manual accumulator loop.
alpha_list = [chr(ord('a') + offset) for offset in range(26)]
s = pd.Series(range(26), index=alpha_list)
s

a     0
b     1
c     2
d     3
e     4
f     5
g     6
h     7
i     8
j     9
k    10
l    11
m    12
n    13
o    14
p    15
q    16
r    17
s    18
t    19
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int64

In [100]:
# Basic size / shape / content summaries of the Series `s`.
print(f'Length: {len(s)}')              # number of elements
print(f'Size: {s.size}')                # number of elements in the underlying data
print(f'Shape: {s.shape}')              # a 1-tuple: a Series has rows only
print(f'Count: {s.count()}')            # number of non-NA values
print(f'Unique Value: {s.unique()}')    # each distinct value, reported once
print(f'Counts:\n {s.value_counts()}')  # how often each distinct value occurs

Length: 26
Size: 26
Shape: (26,)
Count: 26
Unique Value: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Counts:
 25    1
24    1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
0     1
dtype: int64


In [101]:
# Series.head(n) returns the first n rows (labels and values); n is 5 when omitted.

a = s.head(5)
b = s.head(6)
print(a, b, sep='\n')


a    0
b    1
c    2
d    3
e    4
dtype: int64
a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64


In [102]:
# Series.tail(n) returns the last n rows (labels and values); n is 5 when omitted.
a = s.tail(5)
b = s.tail(6)
print(a, b, sep='\n')


v    21
w    22
x    23
y    24
z    25
dtype: int64
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int64


In [103]:
# Series.take picks rows by integer position, in the order listed;
# a position may appear more than once.
c = s.take([9, 3, 9])
d = s.take([2])
print(c, d, sep='\n')

j    9
d    3
j    9
dtype: int64
c    2
dtype: int64


# Indexing through the Series using its labels

In [104]:
# As with NumPy arrays, one element or several elements can be selected at once;
# here the selection is made by index label.
s3 = pd.Series([1, 2, 3], index=list('abc'))
print(
    s3,
    s3['a'],          # a single label returns the scalar value
    s3[['a', 'c']],   # a list of labels returns a sub-Series
    sep='\n',
)

a    1
b    2
c    3
dtype: int64
1
a    1
c    3
dtype: int64


In [105]:
# NumPy-style element-wise operations work on a Series and keep its labels.

print(s3, '-----------', sep='\n')
print(s3 > 1)  # boolean Series: True where the value is greater than 1
# Arithmetic and math functions are applied element by element
print(f's3*2: {s3 * 2}')
print(f's3+ s3: {s3 + s3}')
print(f's3 - s3: {s3 - s3}')
print(f's3/2: {s3 / 2}')
print(f's3^2: {s3 ** 2}')
print(f'e^s3: {np.exp(s3)}')

a    1
b    2
c    3
dtype: int64
-----------
a    False
b     True
c     True
dtype: bool
s3*2: a    2
b    4
c    6
dtype: int64
s3+ s3: a    2
b    4
c    6
dtype: int64
s3 - s3: a    0
b    0
c    0
dtype: int64
s3/2: a    0.5
b    1.0
c    1.5
dtype: float64
s3^2: a    1
b    4
c    9
dtype: int64
e^s3: a     2.718282
b     7.389056
c    20.085537
dtype: float64


In [106]:
# Building a Series from a dict: each key becomes an index label and
# each value becomes the corresponding Series value.

data = dict(Fawad=1, Aali=2)
s = pd.Series(data)
print(s)

Fawad    1
Aali     2
dtype: int64


In [107]:
# Build the Series from `data` again, this time asking for extra labels.
# Labels with no entry in `data` receive NaN (the missing-value marker).
index = 'Fawad Aali Padre Maa Baji'.split()
s1 = pd.Series(data, index=index)
print(s1)

Fawad    1.0
Aali     2.0
Padre    NaN
Maa      NaN
Baji     NaN
dtype: float64


In [108]:
# Detecting missing data. The module-level pd.isnull / pd.notnull functions
# are also available as Series methods, which are used here.
print(s1.isnull())   # True where the value is NaN, otherwise False
print('-----------')
print(s1.notnull())  # True where the value is not NaN, otherwise False

Fawad    False
Aali     False
Padre     True
Maa       True
Baji      True
dtype: bool
-----------
Fawad     True
Aali      True
Padre    False
Maa      False
Baji     False
dtype: bool


In [109]:
# Element-wise addition matched on labels; a label whose value is NaN in
# either operand yields NaN in the result.
s2 = s.add(s1)
s2

Aali     4.0
Baji     NaN
Fawad    2.0
Maa      NaN
Padre    NaN
dtype: float64

In [110]:
 # NOTE(review): every line of this cell starts with one space. The recorded
 # output shows the notebook accepted it, but the same text is an
 # IndentationError in a plain Python script -- consider dedenting.
 # Give the Series itself a name, then give its index a name; each print
 # shows the effect of the assignment just above it.
 s2.name = "Family" # The series now has a name
 print(s2)
 print('-------------')
 s2.index.name = "Members" # The index name is now members
 print(s2)

Aali     4.0
Baji     NaN
Fawad    2.0
Maa      NaN
Padre    NaN
Name: Family, dtype: float64
-------------
Members
Aali     4.0
Baji     NaN
Fawad    2.0
Maa      NaN
Padre    NaN
Name: Family, dtype: float64


In [111]:
# Practice: a Series keyed by city name, built from a dict.
city_data = dict(Karachi=1000, Lahore=2000, Peshawar=5000)
s1 = pd.Series(city_data)
print(s1)

Karachi     1000
Lahore      2000
Peshawar    5000
dtype: int64


In [112]:
# Re-create the Series over a longer list of labels; labels that are not
# present in s1 come out as NaN.
states = 'Karachi Lahore Peshawar Quetta Islamabad Hyderabad'.split()
s2 = pd.Series(data=s1, index=states)
print(s2)

Karachi      1000.0
Lahore       2000.0
Peshawar     5000.0
Quetta          NaN
Islamabad       NaN
Hyderabad       NaN
dtype: float64


In [113]:
# Name the Series and its index; both names appear in the printed output.
s2.name, s2.index.name = 'Population', 'City'
print(s2)

City
Karachi      1000.0
Lahore       2000.0
Peshawar     5000.0
Quetta          NaN
Islamabad       NaN
Hyderabad       NaN
Name: Population, dtype: float64


# Indexing in Pandas Series

In [114]:
print(s2, '-----------', sep='\n')
# Label-based selection: one label gives a scalar, a list of labels gives a Series
print('Printing Single index:', s2.loc['Karachi'])
print('-------------')
print('Printing multiple index:', s2.loc[['Karachi', 'Quetta']])

City
Karachi      1000.0
Lahore       2000.0
Peshawar     5000.0
Quetta          NaN
Islamabad       NaN
Hyderabad       NaN
Name: Population, dtype: float64
-----------
Printing Single index: 1000.0
-------------
Printing multiple index: City
Karachi    1000.0
Quetta        NaN
Name: Population, dtype: float64


In [115]:
# Looking by position
print(s2)
print('------------')
# As index is not integer so position
print('Index by single Poisition:', s2[0])
print('--------------')
#Multiple Position
print('Index by multiple Poisitions:', s2[[0,1,0,4]])

City
Karachi      1000.0
Lahore       2000.0
Peshawar     5000.0
Quetta          NaN
Islamabad       NaN
Hyderabad       NaN
Name: Population, dtype: float64
------------
Index by single Poisition: 1000.0
--------------
Index by multiple Poisitions: City
Karachi      1000.0
Lahore       2000.0
Karachi      1000.0
Islamabad       NaN
Name: Population, dtype: float64


In [116]:
# A Series whose integer labels start at 4 instead of 0.
s = pd.Series(list('ABC'), index=[4, 5, 6])
print(s)

4    A
5    B
6    C
dtype: object


# Label Vs Position Look up

In [117]:
# The labels of `s` are 4, 5, 6. An integer inside [] is matched against
# those labels, not against positions counted from zero, so s[4] finds 'A'
# while s[0] has nothing to match.
print(s[4], '------------', sep='\n')  # label lookup -> A
# print(s[0]) # would raise an error: there is no label 0 in the index

A
------------


In [118]:
print(s, '---------', sep='\n')
# .loc selects by label: 4 is a label of `s`, so this prints A
print(s.loc[4], '---------', sep='\n')
print(s.loc[[4, 5]], '---------', sep='\n')
# .iloc selects by zero-based position and ignores the labels entirely
print(s.iloc[0], '---------', sep='\n')
print(s.iloc[[0, 2]])

4    A
5    B
6    C
dtype: object
---------
A
---------
4    A
5    B
dtype: object
---------
A
---------
4    A
6    C
dtype: object


# Alignment via index labels

Pandas can align the data of one Series with another based on the index labels of the Series.

In [119]:
# Two Series with the same labels stored in opposite order.
s1 = pd.Series([1, 2, 3, 4], index=list('abcd'))
s2 = pd.Series([4, 3, 2, 1], index=list('dcba'))
print('s1:', s1, '-----------', 's2:', s2, sep='\n')

s1:
a    1
b    2
c    3
d    4
dtype: int64
-----------
s2:
d    4
c    3
b    2
a    1
dtype: int64


In [120]:
# Arithmetic between two Series is carried out label by label, regardless of
# the order in which the labels are stored.

s3 = s1 + s2
print("s3 = s1 + s2", s3, '----------------', sep='\n')
s3 = s1 - s2
print("s3 = s1 - s2", s3, '----------------', sep='\n')
s3 = s1 * s2
print("s3 = s1 * s2", s3, '----------------', sep='\n')
s3 = s1 / s2
print("s3 = s1 / s2", s3, sep='\n')

s3 = s1 + s2
a    2
b    4
c    6
d    8
dtype: int64
----------------
s3 = s1 - s2
a    0
b    0
c    0
d    0
dtype: int64
----------------
s3 = s1 * s2
a     1
b     4
c     9
d    16
dtype: int64
----------------
s3 = s1 / s2
a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64


 <h3>NaN + number = NaN</h3>      (NaN added to a number results in NaN)
    
<h3>number + NaN = NaN</h3>        (A number added to NaN results in NaN)

In [121]:
# Two Series that share only the labels 'b', 'c' and 'd'.
s1 = pd.Series(dict(a=1, b=2, c=3, d=4))
s2 = pd.Series(dict(zip('bcdef', range(5, 10))))
print(s1, s2, sep='\n')

a    1
b    2
c    3
d    4
dtype: int64
b    5
c    6
d    7
e    8
f    9
dtype: int64


In [122]:
# Labels present in only one operand have no partner, so their sum is NaN.
s3 = s1.add(s2)
print(s3)

a     NaN
b     7.0
c     9.0
d    11.0
e     NaN
f     NaN
dtype: float64


In [123]:
s1 = pd.Series([1, 2, 3], index=list('aab'))
s2 = pd.Series([1, 2, 3], index=list('aac'))

# With a repeated label, every pairing of that label across the two operands
# is computed: 2 x 2 occurrences of 'a' give four 'a' rows in the result.
s3 = s1.add(s2)
print(s3)

a    2.0
a    3.0
a    3.0
a    4.0
b    NaN
c    NaN
dtype: float64


# Special NaN case

In [124]:
# Mean of a NumPy array that contains only ordinary numbers.
s = np.arange(1, 5)
np.mean(s)

2.5

In [125]:
# The plain mean of an array containing NaN is NaN, because any arithmetic
# with NaN yields NaN. np.nan is the supported spelling; the np.NaN alias
# was removed in NumPy 2.0.
s = np.array([1, 2, 3, np.nan])
mean_normal = np.mean(s)
print(mean_normal)
print('------------')
# np.nanmean ignores the NaN entries and averages the remaining values
mean = np.nanmean(s)
print(mean)

nan
------------
2.0


# Boolean Operations

In [126]:
# Comparing a Series with a scalar gives a boolean Series of the same length.
s = pd.Series(np.arange(10))
print(
    s > 5,        # True where the value is greater than 5
    '*********',
    s < 5,        # True where the value is less than 5
    sep='\n',
)

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool
*********
0     True
1     True
2     True
3     True
4     True
5    False
6    False
7    False
8    False
9    False
dtype: bool


In [127]:
# Keep the boolean result in a variable and use it to select rows
mask = s.gt(5)
print(s[mask])
# The same selection written inline
print('------SHort Version--------')
print(s[s > 5])

6    6
7    7
8    8
9    9
dtype: int64
------SHort Version--------
6    6
7    7
8    8
9    9
dtype: int64


# Reindexing a series

Reindexing in Pandas makes the data in a Series or DataFrame conform to a given set of labels. It is what makes index alignment across multiple objects possible. The process is as follows:

1. Reorder the data to match a set of index labels
2. Insert NaN marker for missing data 
3. Possibly fill missing data for a label using some type of logic (the default is to insert NaN values)

In [128]:
# Draw five standard-normal values from a seeded generator so that
# re-running the cell reproduces the same numbers.
rng = np.random.default_rng(seed=0)
s = pd.Series(rng.standard_normal(5))
print(s)
print('--------')
# Assigning to .index relabels the rows of this Series in place.
s.index = ['a', 'b', 'c', 'd', 'e']
print(s)

0    1.376980
1    0.291594
2   -0.970144
3   -0.544836
4   -0.620751
dtype: float64
--------
a    1.376980
b    0.291594
c   -0.970144
d   -0.544836
e   -0.620751
dtype: float64


<h3> Concat function in Pandas</h3>

If two Series are concatenated, labels that occur in both appear twice in the result, which is usually not desired.

In [129]:
# Seeded generator: the random values are the same on every run.
rng = np.random.default_rng(seed=0)
s1 = pd.Series(rng.standard_normal(3))
s2 = pd.Series(rng.standard_normal(3))
# pd.concat stacks the two Series and keeps each one's own labels,
# so the labels 0, 1, 2 appear twice in the result.
combined = pd.concat([s1, s2])
print(s1)
print(s2)
print(combined)

0   -0.418913
1   -1.765102
2   -0.865710
dtype: float64
0    0.686941
1    0.665573
2    0.143740
dtype: float64
0   -0.418913
1   -1.765102
2   -0.865710
0    0.686941
1    0.665573
2    0.143740
dtype: float64


In [130]:
# Replace the duplicated labels with one fresh label per row: 0 .. len-1.
combined.index = np.arange(len(combined))
print(combined)

0   -0.418913
1   -1.765102
2   -0.865710
3    0.686941
4    0.665573
5    0.143740
dtype: float64


# reindex() method

Greater flexibility in creating a new index is provided using the .reindex() method. An example of the flexibility of .reindex() over assigning the .index property directly is that the list provided to .reindex() can be of a different length than the number of rows in the Series:

In [131]:
# Seeded generator: the random values are the same on every run.
rng = np.random.default_rng(seed=0)
s1 = pd.Series(rng.standard_normal(4), index=['a', 'b', 'c', 'd'])
print(s1)

# reindex returns a NEW Series laid out over the requested labels: values are
# copied for labels that exist in s1 ('a', 'c'), and labels that do not exist
# ('g') get NaN. s1 itself is not modified.
s2 = s1.reindex(['a', 'c', 'g'])
print(s2)

a   -0.220012
b   -0.838076
c   -1.689415
d    1.176063
dtype: float64
a   -0.220012
c   -1.689415
g         NaN
dtype: float64


In [132]:
# Reindex the concatenated Series from earlier over an arbitrary label list:
# labels may repeat, and labels absent from `combined` come out as NaN.

z = combined.reindex([9, 6, 4, 8, 3, 5, 1, 9, 3, 5, 7, 3, 4, 4, 6, 6])
print(z, combined, sep='\n')  # `combined` is printed to show it was left unchanged

9         NaN
6         NaN
4    0.665573
8         NaN
3    0.686941
5    0.143740
1   -1.765102
9         NaN
3    0.686941
5    0.143740
7         NaN
3    0.686941
4    0.665573
4    0.665573
6         NaN
6         NaN
dtype: float64
0   -0.418913
1   -1.765102
2   -0.865710
3    0.686941
4    0.665573
5    0.143740
dtype: float64


 Sometimes index alignment does not work as expected because the index labels of the two Series have different data types

In [133]:
s1 = pd.Series([0, 1, 2], index=[1, 2, 3])     # integer labels
s2 = pd.Series([0, 1, 2], index=list('123'))   # string labels
# The integer 1 and the string '1' are different labels, so nothing lines up:
# every label is missing from one operand and every result is NaN.
s1 + s2

1   NaN
2   NaN
3   NaN
1   NaN
2   NaN
3   NaN
dtype: float64

In [134]:
# Fix: turn the string labels of s2 into integers so they match the labels of s1.

s2.index = s2.index.values.astype(int)
s1.add(s2)

1    0
2    2
3    4
dtype: int64

In [140]:
# One way to deal with NaN is to substitute a value such as 0, so that later
# arithmetic does not turn into NaN.

# Seeded generator: the random values are the same on every run.
rng = np.random.default_rng(seed=0)
s = pd.Series(rng.standard_normal(5), index=['a', 'b', 'c', 'd', 'e'])
# The result holds only the requested labels: 'a' keeps its value from `s`,
# and 'f', which `s` does not have, gets fill_value instead of NaN.
s_indexed = s.reindex(['a', 'f'], fill_value=0)
s_indexed

a    0.775998
f    0.000000
dtype: float64

<h3> ffill, bfill & nearest   </h3>

In [142]:
# Three values at the widely spaced integer labels 0, 8 and 10.
s = pd.Series(list('abc'), index=[0, 8, 10])
s

0     a
8     b
10    c
dtype: object

In [143]:
# Forward fill: each new label takes the value of the closest existing label
# at or before it, so a value is carried forward until the next one appears.
s.reindex(np.arange(15), method='ffill')

0     a
1     a
2     a
3     a
4     a
5     a
6     a
7     a
8     b
9     b
10    c
11    c
12    c
13    c
14    c
dtype: object

In [144]:
# Backward fill: each new label takes the value of the closest existing label
# at or after it, so a value is carried backward over the gap before it.
s.reindex(np.arange(7), method='bfill')

0    a
1    b
2    b
3    b
4    b
5    b
6    b
dtype: object

In [145]:
# Nearest fill: each new label takes the value of whichever existing label is closest.
s.reindex(np.arange(10), method='nearest')

0    a
1    a
2    a
3    a
4    b
5    b
6    b
7    b
8    b
9    c
dtype: object

# Slicing a Series

In [None]:
# Creating a Series whose index does not start from 0
# and slicing it by position

In [155]:
# Values 0..9 stored under the integer labels 20..29, so positions and labels differ.
s = pd.Series(np.arange(0, 10), index=np.arange(20, 30))
print(s)
print('------------')
# With integer labels a bare s[...] is ambiguous to read, so position-based
# access is written with .iloc and label-based access with .loc.
print(s.iloc[0:6:2])  # positions 0, 2, 4 (start 0, stop before 6, step 2)
print('------------')
print(s.iloc[[0, 2, 4]])  # the same rows, listed explicitly
print('------------')
print(s.iloc[:5])  # first five rows by position
print('------------')
print(s.loc[[21, 24, 27, 22]])  # fancy indexing by label, in the order given

20    0
21    1
22    2
23    3
24    4
25    5
26    6
27    7
28    8
29    9
dtype: int64
------------
20    0
22    2
24    4
dtype: int64
------------
20    0
22    2
24    4
dtype: int64
------------
20    0
21    1
22    2
23    3
24    4
dtype: int64
------------
21    1
24    4
27    7
22    2
dtype: int64


In [None]:
# NOTE(review): unexecuted scratch cell holding only the literal 1; nothing else in the notebook uses it -- presumably safe to delete.
1