In [1]:
import pandas as pd
import numpy as np

In [2]:
# https://pandas.pydata.org/docs/user_guide/10min.html
# Documentation: https://pandas.pydata.org/

**Data Structures**
* Series: a one-dimensional labeled array holding data of any type
* DataFrame: a two-dimensional data structure that holds data like a 2D array or a table, with labeled rows and columns

In [11]:
# A Series built from a plain list; the np.nan entry is stored as a missing
# value, which is why the whole Series ends up with a float64 dtype.
s = pd.Series(data=[1, 2, 4, np.nan, 6, 8])
print(f"s: {s!s}")
print(f"type(s) = {type(s)!s}")

s: 0    1.0
1    2.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64
type(s) = <class 'pandas.core.series.Series'>


In [48]:
# Legacy API this cell originally used:
# https://numpy.org/doc/2.2/reference/random/generated/numpy.random.randn.html
# NumPy recommends the Generator API for new code. Seeding the generator makes
# the frame identical on every "Restart Kernel -> Run All".
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal(size=(6, 4)), columns=list("ABCD"))  # samples from the standard normal distribution

# Other ways to fill a 6x4 frame from the same generator:
#   rng.random(size=(6, 4))          -> floats in [0.0, 1.0)  (legacy: np.random.rand / np.random.random)
#   rng.integers(0, 5, size=(6, 4))  -> ints in [0, 5)        (legacy: np.random.randint)
df

          A         B         C         D
0  0.591851  0.178438  0.084814 -1.341294
1 -0.130197  0.658346  1.844166  2.562577
2  0.564361 -0.383927  0.234956 -0.059547
3 -0.052050 -0.313077 -0.602867  1.667919
4  0.046530 -0.646889  1.362815  1.393568
5  0.401489  1.225638 -0.280262  1.017179


In [46]:
# Build a frame from a dict of columns. The scalar entries ("A", "B", "F") are
# broadcast to the length of the array-like entries ("C", "D", "E").
df = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

# Structural overview: element count, (rows, columns), per-column dtypes and
# the column labels. The dtypes table is printed on its own lines.
print("size = ", df.size)
print("shape = ", df.shape)
print("dtype = ", df.dtypes, sep="\n")
print("columns = ", df.columns)

size =  24
shape =  (4, 6)
dtype = 
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object
columns =  Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


In [40]:
# Preview the first rows; n=5 is the default, spelled out here for clarity.
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [41]:
# Preview the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [42]:
# Row labels of the frame.
df.index

Index([0, 1, 2, 3], dtype='int64')

In [52]:
# Converting to NumPy.
# The array is bound to its own name so that `df` stays a DataFrame: rebinding
# `df` to the ndarray would make this cell fail on a second run (ndarray has no
# `.to_numpy`) and would silently change what `df` means for later cells.
# to_numpy() picks a single dtype able to hold every column: an all-float frame
# gives a float64 array, while a mixed-type frame gives dtype=object.
df_array = df.to_numpy()
print(df_array)
print(type(df_array))

[[ 0.59185144  0.17843753  0.08481352 -1.34129434]
 [-0.13019707  0.65834593  1.84416624  2.56257722]
 [ 0.56436084 -0.3839267   0.23495641 -0.05954686]
 [-0.05204957 -0.31307671 -0.60286655  1.66791855]
 [ 0.04653016 -0.64688864  1.36281527  1.39356761]
 [ 0.40148941  1.22563838 -0.28026184  1.01717891]]
<class 'numpy.ndarray'>


In [56]:
# New all-numeric frame for the summary cells below: uniform floats in
# [0.0, 1.0), drawn from a seeded generator so the values (and therefore the
# describe / transpose / sort outputs) are reproducible across re-runs.
df = pd.DataFrame(
    np.random.default_rng(seed=42).random(size=(6, 4)),
    columns=list("ABCD"),
)

In [57]:
# describe - quick statistical summary of each column
# (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.254262,0.555481,0.429235,0.504878
std,0.248393,0.297603,0.362796,0.308244
min,0.006965,0.206567,0.021295,0.145872
25%,0.062756,0.313759,0.12363,0.312356
50%,0.222269,0.545817,0.426385,0.433526
75%,0.351887,0.787466,0.669278,0.70314
max,0.669389,0.930265,0.928391,0.951365


In [58]:
# Swap rows and columns: the column labels become the row index.
df.transpose()

Unnamed: 0,0,1,2,3,4,5
A,0.146777,0.369929,0.669389,0.006965,0.297761,0.034749
B,0.829705,0.430883,0.660752,0.930265,0.274718,0.206567
C,0.021295,0.299859,0.55291,0.708068,0.064886,0.928391
D,0.272564,0.145872,0.43173,0.435322,0.951365,0.792413


In [None]:
# Order the rows by ascending values of column "B".
# The result is a new frame; `df` itself keeps its original row order.
df.sort_values("B", ascending=True)

Unnamed: 0,A,B,C,D
5,0.034749,0.206567,0.928391,0.792413
4,0.297761,0.274718,0.064886,0.951365
1,0.369929,0.430883,0.299859,0.145872
2,0.669389,0.660752,0.55291,0.43173
0,0.146777,0.829705,0.021295,0.272564
3,0.006965,0.930265,0.708068,0.435322


In [13]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency is the
# default and is spelled out here).
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# selecting portions of the df
