In [1]:
import pandas as pd

In [12]:
import numpy as np

Two primary data structures used in Pandas:

- Series: a one-dimensional labeled data structure, similar to an array but with an associated index and some additional features.

- DataFrame: a two-dimensional tabular data structure, similar to a spreadsheet.

# Series

### Declaring a `Series`:

In [2]:
# Build a Series from a plain list; no index is given, so the default
# integer index 0..3 is used.
s = pd.Series(data=[11, 12, -13, 4])
s

0    11
1    12
2   -13
3     4
dtype: int64

### Declaring a `Series` with index:

In [3]:
# Same values, now labelled "a".."d" instead of the default integer index.
s = pd.Series(
    data=[11, 12, -13, 4],
    index=list("abcd"),
)
s

a    11
b    12
c   -13
d     4
dtype: int64

### Accessing `values` and `index`:

In [4]:
# Underlying values of the Series as a NumPy array (labels are not included).
s.values

array([ 11,  12, -13,   4])

In [5]:
# Index labels of the Series.
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

### Indexing

In [6]:
# Label-based lookup: returns the value stored under label "a".
s["a"]

11

In [9]:
# Positional access: use .iloc explicitly. Passing an integer key to [] on a
# label-indexed Series only works through a deprecated positional fallback
# (FutureWarning since pandas 2.1; integer keys will be treated as labels).
s.iloc[0]

11

In [8]:
# Integer slice: selects by position and excludes the end (positions 0 and 1).
s[0:2]

a    11
b    12
dtype: int64

In [10]:
# A list of labels selects several elements and returns a Series.
s[["a", "b"]]

a    11
b    12
dtype: int64

Assignment:

In [11]:
# Overwrite the value stored under label "a", then display the Series.
s.loc["a"] = 44
s

a    44
b    12
c   -13
d     4
dtype: int64

### Defining a `Series` from a `NumPy` array or from an existing `Series`

In [13]:
# Four uniform random floats in [0, 1), drawn from NumPy's global generator.
arr = np.random.random_sample(size=4)
arr

array([0.93849323, 0.33554165, 0.07673459, 0.70323283])

In [14]:
# Wrap the NumPy array in a Series without copying its data.
# copy=False is passed explicitly so the Series keeps sharing memory with
# `arr` even when Copy-on-Write is active (where the constructor would
# otherwise copy a NumPy array by default).
s = pd.Series(arr, copy=False)
s

0    0.938493
1    0.335542
2    0.076735
3    0.703233
dtype: float64

In [15]:
# Build a second Series from the existing Series `s`.
s2 = pd.Series(s)
s2

0    0.938493
1    0.335542
2    0.076735
3    0.703233
dtype: float64

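**Values contained in a `NumPy` array are not copied into the `Series`; they are shared by reference, so modifying the array also modifies the `Series`.** (With Copy-on-Write enabled, which is the default from pandas 3.0, the constructor copies the array unless `copy=False` is passed.)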

In [16]:
# Change the NumPy array in place, then display the Series built from it.
arr[2] = 9999
s

0       0.938493
1       0.335542
2    9999.000000
3       0.703233
dtype: float64

### Filtering

In [17]:
# Element-wise comparison yields a boolean Series.
s > 0.5

0     True
1    False
2     True
3     True
dtype: bool

In [18]:
# Boolean mask: keeps only the elements for which the condition is True.
s[s > 0.5]

0       0.938493
2    9999.000000
3       0.703233
dtype: float64

### Several mathematical operations

In [19]:
# Scalar addition is applied to every element.
s + 0.2

0       1.138493
1       0.535542
2    9999.200000
3       0.903233
dtype: float64

In [20]:
# Scalar subtraction is applied to every element.
s - 10

0      -9.061507
1      -9.664458
2    9989.000000
3      -9.296767
dtype: float64

In [21]:
# NumPy functions operate element-wise on a Series and return a Series.
np.cos(s)

0    0.591004
1    0.944232
2   -0.771617
3    0.762756
dtype: float64

### Handling duplicate values

In [22]:
# Series with repeated values and repeated index labels.
serd = pd.Series(
    data=[1, 0, 2, 1, 2, 3],
    index=["white", "white", "blue", "green", "green", "yellow"],
)
serd

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [23]:
# Distinct values, in order of first appearance.
serd.unique()

array([1, 0, 2, 3])

In [24]:
# Number of occurrences of each distinct value.
serd.value_counts()

2    2
1    2
3    1
0    1
dtype: int64

In [25]:
# Boolean mask: True where the value is one of the listed candidates (0 or 3).
serd.isin([0,3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

### Handling `NaN`

In [26]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
# The missing value causes the integer inputs to be stored as float64.
s = pd.Series([5, -3, np.nan, 14])
s

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [27]:
# Boolean mask: True where the value is missing (NaN).
s.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [28]:
# Keep only the missing entries.
s[s.isnull()]

2   NaN
dtype: float64

In [29]:
# Boolean mask: True where a value is present (not NaN).
s.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [30]:
# Keep only the entries that are not missing.
s[s.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [31]:
# True where the value is missing; gives the same result as s.isnull().
s.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [32]:
# Keep only the missing entries, using the isna() mask.
s[s.isna()]

2   NaN
dtype: float64

### Series as dictionary

In [33]:
# A dict maps directly onto a Series: keys become the index labels and
# values become the data.
mydict = dict(red=2000, blue=1000, yellow=500, orange=1000)
s = pd.Series(data=mydict)
s

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64

In [34]:
# Change the "red" entry of the source dict, then display the dict.
mydict.update(red=9999)
mydict

{'red': 9999, 'blue': 1000, 'yellow': 500, 'orange': 1000}

In [35]:
# The Series still holds the original values: it does not reflect the later
# change made to mydict.
s

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64

When an explicit index is specified, any index label with no matching key in the dictionary is set to `NaN`:

In [36]:
# Index labels to use; "green" is not a key of mydict.
colors = ["red", "yellow", "orange", "blue", "green"]
# Values are looked up in mydict by label; a label without a matching key
# gets NaN.
s = pd.Series(mydict, index=colors)
s

red       9999.0
yellow     500.0
orange    1000.0
blue      1000.0
green        NaN
dtype: float64

### Operations between `Series`

In [37]:
# A second Series whose labels only partly overlap with those of `s`.
mydict2 = dict(red=400, yellow=1000, black=700)
s2 = pd.Series(data=mydict2)
s2

red        400
yellow    1000
black      700
dtype: int64

In [38]:
# Addition aligns the operands on their index labels; a label that does not
# have a value in both Series yields NaN.
s + s2

black         NaN
blue          NaN
green         NaN
orange        NaN
red       10399.0
yellow     1500.0
dtype: float64

# DataFrame

A `DataFrame` consists of an ordered collection of columns, each of which can hold values of a different type (numeric, string, Boolean, etc.).

### Using dict to create a `DataFrame`

In [40]:
# Column-oriented input: each key is a column name, each list holds that
# column's values (all lists have the same length).
dat = dict(
    color=["blue", "green", "yellow", "red", "white"],
    object=["ball", "pen", "pencil", "paper", "mug"],
    price=[1.2, 1.0, 0.6, 0.9, 1.7],
)
df = pd.DataFrame(data=dat)
df

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


Using only selected columns

In [41]:
# Use only the listed keys of `dat` as columns, in the given order.
df2 = pd.DataFrame(dat, columns=["object", "price"])
df2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


Using NumPy array

In [42]:
# 4x4 grid of the integers 0..15 with explicit row and column labels.
df3 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["red", "blue", "yellow", "white"],
    columns=["ball", "pen", "pencil", "paper"],
)
df3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [43]:
# Column labels of the DataFrame.
df.columns

Index(['color', 'object', 'price'], dtype='object')

In [44]:
# Row labels; no index was given for df, so this is a default RangeIndex.
df.index

RangeIndex(start=0, stop=5, step=1)

In [45]:
# Data as a 2-D NumPy array; the mixed column types result in dtype=object.
df.values

array([['blue', 'ball', 1.2],
       ['green', 'pen', 1.0],
       ['yellow', 'pencil', 0.6],
       ['red', 'paper', 0.9],
       ['white', 'mug', 1.7]], dtype=object)

In [46]:
# Selecting a single column by name returns a Series.
df["price"]

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [51]:
# A list of column names returns a DataFrame with those columns, in the
# listed order.
df[["price", "color"]]

Unnamed: 0,price,color
0,1.2,blue
1,1.0,green
2,0.6,yellow
3,0.9,red
4,1.7,white


In [47]:
# Attribute-style access to the "price" column; same result as df["price"].
df.price

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

Accessing row:

In [49]:
# Select the row whose index label is 1; the result is a Series keyed by
# column name.
df.loc[1]

color     green
object      pen
price         1
Name: 1, dtype: object

In [50]:
# Label-based slice: both endpoints (labels 2 and 3) are included.
df.loc[2:3]

Unnamed: 0,color,object,price
2,yellow,pencil,0.6
3,red,paper,0.9
