In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import pandas_datareader as pdr


### Creating a Series by passing a list of values, letting pandas create a default integer index:

In [16]:
print("Pandas Series")
# No index is passed, so pandas labels the values 0..5; np.nan marks a
# missing entry.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

Pandas Series


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### Creating a DataFrame by passing a NumPy array, with a datetime index using date_range() and labeled columns:

In [15]:
print("Pandas Dates")
# Six consecutive calendar days beginning 2013-01-01 (the result reports
# freq='D').
dates = pd.date_range(start="20130101", periods=6)
dates

Pandas Dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
print("Pandas Dataframes")
# Draw the sample from a seeded generator so that Restart & Run All yields the
# same table every time; the previous unseeded np.random.randn call produced
# different numbers on each run.
# NOTE: the saved outputs in this notebook predate the seed and will change
# the next time the notebook is executed.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((6, 4)),  # 6 rows x 4 columns
    index=dates,
    columns=['Big', 'Medium', 'Small', 'Tiny'],
)
df.head()

Pandas Dataframes


Unnamed: 0,Big,Medium,Small,Tiny
2013-01-01,1.234532,0.244524,-0.668575,-0.721615
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925
2013-01-04,0.372608,1.468275,-0.639068,1.120021
2013-01-05,-1.545329,0.39836,-0.686302,0.790974


### Creating a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:

In [26]:
rows = 6
# Each key becomes a column. Scalar entries ("Age", "Birthdate", "Foo") are
# repeated for every row; the array-like entries supply one value per row.
df2 = pd.DataFrame(
    data={
        "Age": 1.0,
        "Birthdate": pd.Timestamp("20130102"),
        "Wage": pd.Series(1, index=list(range(rows)), dtype="float32"),
        "Siblings": np.full(rows, 3, dtype="int32"),
        "Type": pd.Categorical(
            ["test", "train", "test", "train", "test", "test"]
        ),
        "Foo": "foo",
    },
)
df2

Unnamed: 0,Age,Birthdate,Wage,Siblings,Type,Foo
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo
4,1.0,2013-01-02,1.0,3,test,foo
5,1.0,2013-01-02,1.0,3,test,foo


### The columns of the resulting DataFrame have different dtypes:

In [27]:
# One dtype per column, returned as a Series keyed by column name.
df2.dtypes

Age                 float64
Birthdate    datetime64[ns]
Wage                float32
Siblings              int32
Type               category
Foo                  object
dtype: object

### DataFrame.head() shows the first rows (five by default):

In [28]:
# With no argument, head() returns the first five rows.
df.head()

Unnamed: 0,Big,Medium,Small,Tiny
2013-01-01,1.234532,0.244524,-0.668575,-0.721615
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925
2013-01-04,0.372608,1.468275,-0.639068,1.120021
2013-01-05,-1.545329,0.39836,-0.686302,0.790974


### DataFrame.tail() shows the last rows (five by default):

In [29]:
# With no argument, tail() returns the last five rows.
df.tail()

Unnamed: 0,Big,Medium,Small,Tiny
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925
2013-01-04,0.372608,1.468275,-0.639068,1.120021
2013-01-05,-1.545329,0.39836,-0.686302,0.790974
2013-01-06,-1.603504,-0.601746,0.013817,-0.532378


### display index

In [33]:
# Row labels: the DatetimeIndex that was passed as `index=` when df was built.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### display columns

In [32]:
# Column labels, returned as an Index.
df.columns

Index(['Big', 'Medium', 'Small', 'Tiny'], dtype='object')

### DataFrame.to_numpy() gives a NumPy representation of the underlying data.
Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.

In [34]:
# Every column of df is float64, so the result is a plain 2-D float array;
# the index and column labels are not part of it.
df.to_numpy()

array([[ 1.23453195,  0.2445243 , -0.66857486, -0.72161524],
       [ 1.15246367,  0.96352468,  0.36842408,  1.33133163],
       [-0.03091384,  1.46090517, -0.36388603, -0.20592509],
       [ 0.37260782,  1.46827511, -0.63906775,  1.12002112],
       [-1.54532874,  0.3983599 , -0.68630241,  0.79097386],
       [-1.60350369, -0.60174606,  0.01381734, -0.53237766]])

In [35]:
# df2 mixes floats, timestamps, ints, categoricals and strings, so the only
# common dtype is object (see dtype=object in the output).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo']],
      dtype=object)

### DataFrame.describe() shows a quick statistical summary of each column:

In [38]:
# Per-column count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Big,Medium,Small,Tiny
count,6.0,6.0,6.0,6.0
mean,-0.070024,0.655641,-0.329265,0.297068
std,1.25872,0.802477,0.434571,0.891021
min,-1.603504,-0.601746,-0.686302,-0.721615
25%,-1.166725,0.282983,-0.661198,-0.450765
50%,0.170847,0.680942,-0.501477,0.292524
75%,0.9575,1.33656,-0.080609,1.037759
max,1.234532,1.468275,0.368424,1.331332


### DataFrame.T transposes the data (rows become columns):

In [40]:
# Swap the axes: the dates become column labels and the original column
# names become the index.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
Big,1.234532,1.152464,-0.030914,0.372608,-1.545329,-1.603504
Medium,0.244524,0.963525,1.460905,1.468275,0.39836,-0.601746
Small,-0.668575,0.368424,-0.363886,-0.639068,-0.686302,0.013817
Tiny,-0.721615,1.331332,-0.205925,1.120021,0.790974,-0.532378


### DataFrame.sort_index() sorts by an axis:

In [41]:
# axis=1 sorts the column labels (not the rows); ascending=False puts them in
# reverse alphabetical order: Tiny, Small, Medium, Big.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,Tiny,Small,Medium,Big
2013-01-01,-0.721615,-0.668575,0.244524,1.234532
2013-01-02,1.331332,0.368424,0.963525,1.152464
2013-01-03,-0.205925,-0.363886,1.460905,-0.030914
2013-01-04,1.120021,-0.639068,1.468275,0.372608
2013-01-05,0.790974,-0.686302,0.39836,-1.545329
2013-01-06,-0.532378,0.013817,-0.601746,-1.603504


### DataFrame.sort_values() sorts by values:

In [44]:
# Reorder the rows by the values in the "Big" column, largest first.
df.sort_values(by="Big", ascending=False)

Unnamed: 0,Big,Medium,Small,Tiny
2013-01-01,1.234532,0.244524,-0.668575,-0.721615
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-04,0.372608,1.468275,-0.639068,1.120021
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925
2013-01-05,-1.545329,0.39836,-0.686302,0.790974
2013-01-06,-1.603504,-0.601746,0.013817,-0.532378


# Selection
While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code we recommend the optimized `pandas` data access methods (all of which are used with square brackets, e.g. `df.loc[row, col]`):
  - DataFrame.at(),
  - DataFrame.iat(),
  - DataFrame.loc() and
  - DataFrame.iloc().

---
## Getting
Selecting a single column, which yields a Series, equivalent to df.Big:

In [45]:
# Bracket access with a column label returns that column as a Series.
df["Big"]

2013-01-01    1.234532
2013-01-02    1.152464
2013-01-03   -0.030914
2013-01-04    0.372608
2013-01-05   -1.545329
2013-01-06   -1.603504
Freq: D, Name: Big, dtype: float64

In [46]:
# Attribute access returns the same Series as df["Big"].
df.Big

2013-01-01    1.234532
2013-01-02    1.152464
2013-01-03   -0.030914
2013-01-04    0.372608
2013-01-05   -1.545329
2013-01-06   -1.603504
Freq: D, Name: Big, dtype: float64

In [49]:
# An integer slice inside [] selects rows by position; the end is excluded,
# so this returns the rows at positions 1 and 2.
df[1:3]

Unnamed: 0,Big,Medium,Small,Tiny
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925


In [56]:
print("get all index dates from 20130102 to 20130104")
# Slicing with date strings selects rows by label, and both endpoints are
# included (unlike the positional slice df[1:3]).
df["20130102":"20130104"]

get all index dates from 20130102 to 20130104


Unnamed: 0,Big,Medium,Small,Tiny
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925
2013-01-04,0.372608,1.468275,-0.639068,1.120021


---

## Selection by label

- DataFrame.loc()
- DataFrame.at().



In [55]:
print("dates[0] is the first row, and df.loc[dates[0]] gets all columns for that row")
# A single row label returns that row as a Series indexed by column name.
df.loc[dates[0]]

dates[0] is the first row, and df.loc[dates[0]] gets all columns for that row


Big       1.234532
Medium    0.244524
Small    -0.668575
Tiny     -0.721615
Name: 2013-01-01 00:00:00, dtype: float64

In [61]:
# ":" keeps every row; the list selects the columns by label.
df.loc[:, ["Medium", "Tiny"]]

Unnamed: 0,Medium,Tiny
2013-01-01,0.244524,-0.721615
2013-01-02,0.963525,1.331332
2013-01-03,1.460905,-0.205925
2013-01-04,1.468275,1.120021
2013-01-05,0.39836,0.790974
2013-01-06,-0.601746,-0.532378


In [60]:
# A bare ":" selects all rows and, since no column selector is given, all
# columns as well.
df.loc[:]

Unnamed: 0,Big,Medium,Small,Tiny
2013-01-01,1.234532,0.244524,-0.668575,-0.721615
2013-01-02,1.152464,0.963525,0.368424,1.331332
2013-01-03,-0.030914,1.460905,-0.363886,-0.205925
2013-01-04,0.372608,1.468275,-0.639068,1.120021
2013-01-05,-1.545329,0.39836,-0.686302,0.790974
2013-01-06,-1.603504,-0.601746,0.013817,-0.532378


In [65]:
print("get the rows between two values-  and specific columns")
# Label slices in .loc include both endpoints, so 2013-01-04 is part of the
# result.
df.loc["20130102":"20130104", ["Medium", "Tiny"]]

get the rows between two values-  and specific columns


Unnamed: 0,Medium,Tiny
2013-01-02,0.963525,1.331332
2013-01-03,1.460905,-0.205925
2013-01-04,1.468275,1.120021


In [67]:
print("get the first row and a specific column")
# One row label plus one column label yields a single scalar value.
df.loc[dates[0], "Tiny"]

get the first row and a specific column


-0.7216152398205058

In [68]:
# .at looks up a single scalar by (row label, column label).
df.at[dates[0], "Big"]


1.2345319516925999

---

## Selection by position

- DataFrame.iloc()
- DataFrame.iat().

Select via the position of the passed integers:

