In [1]:
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# A pandas DataFrame is assembled from Series objects; both its columns and
# its rows can be viewed as Series.
# A Series is a one-dimensional array of values paired with an index, and it
# can be built from a plain Python list or from a NumPy array.

# Build a Series from a list; a default 0..n-1 integer index is assigned.
series_from_list = pd.Series(data=[1, 2, 3, 4, 5])
print("Series from list: ", series_from_list, "", sep="\n")

Series from list: 
0    1
1    2
2    3
3    4
4    5
dtype: int64



In [3]:
# The same construction works when the values come from a NumPy array.
series_from_array = pd.Series(data=np.array([1, 2, 3, 4, 5]))
print("Series from array: ", series_from_array, "", sep="\n")

Series from array: 
0    1
1    2
2    3
3    4
4    5
dtype: int64



In [4]:
# Passing `index=` replaces the default integer positions with custom labels.
labeled_series = pd.Series(data=[1, 2, 3, 4, 5], index=list("abcde"))
print("Labeled series: ", labeled_series, "", sep="\n")

Labeled series: 
a    1
b    2
c    3
d    4
e    5
dtype: int64



In [5]:
# Build a data frame out of two Series that carry the same labels.
labeled_series_2 = pd.Series(data=[6, 7, 8, 9, 10], index=list("abcde"))

# Each dict key becomes a column name, so the Series are stacked as columns
# and their labels form the row index.
data_frame_from_series = pd.DataFrame(
    dict(col_1=labeled_series, col_2=labeled_series_2)
)
print(
    "Data frame from series stacked as columns: ",
    data_frame_from_series,
    "",
    sep="\n",
)

Data frame from series stacked as columns: 
   col_1  col_2
a      1      6
b      2      7
c      3      8
d      4      9
e      5     10



In [6]:
# A list of Series is stacked as rows instead: each Series becomes one row
# and the Series labels become the column names.
data_frame_from_series_2 = pd.DataFrame(data=[labeled_series, labeled_series_2])
print(
    "Data frame from series stacked as rows: ",
    data_frame_from_series_2,
    "",
    sep="\n",
)

Data frame from series stacked as rows: 
   a  b  c  d   e
0  1  2  3  4   5
1  6  7  8  9  10



In [7]:
# A 2D NumPy array maps directly onto a data frame; rows and columns both
# receive default integer labels.
data_frame_from_array = pd.DataFrame(data=np.arange(1, 10).reshape(3, 3))
print("Data frame from 2D array: ", data_frame_from_array, "", sep="\n")

Data frame from 2D array: 
   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9



In [8]:
# A dictionary of equal-length lists also works: keys are the column names,
# values are the column contents.
data_frame_from_dict = pd.DataFrame(data=dict(col_1=[1, 2, 3], col_2=[4, 5, 6]))
print("Data frame from dictionary: ", data_frame_from_dict, "", sep="\n")

Data frame from dictionary: 
   col_1  col_2
0      1      4
1      2      5
2      3      6



In [9]:
# Create a data frame from a CSV file.
# The demo file is written into a throwaway temporary directory rather than
# the working directory, so an existing "data.csv" next to the notebook is
# never overwritten or deleted. The context manager removes the directory
# (and the file in it) even if reading the CSV raises.
content = ["col_1,col_2,col_3", "1,2,3", "4,5,6", "7,8,9"]  # header row + three data rows
with tempfile.TemporaryDirectory() as tmp_dir:
    path = Path(tmp_dir) / "data.csv"
    # Explicit encoding keeps the written bytes independent of the locale.
    path.write_text("\n".join(content), encoding="utf-8")
    data_frame_from_csv = pd.read_csv(path)
print("Data frame from csv file: ")
print(data_frame_from_csv)
print()

Data frame from csv file: 
   col_1  col_2  col_3
0      1      2      3
1      4      5      6
2      7      8      9



In [10]:
# Data frames come with many built-in helpers.
# For example, `.shape` reports the dimensions of the frame as a tuple.
print("Shape of the data frame: ", data_frame_from_csv.shape, "", sep="\n")

Shape of the data frame: 
(3, 3)



In [11]:
# The two dimensions can also be read separately: the length of the row
# index is the row count, the length of the column index is the column count.
n_rows = len(data_frame_from_csv.index)
n_cols = len(data_frame_from_csv.columns)
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)
print()

Number of rows:  3
Number of columns:  3



In [12]:
# `.columns` holds the column labels as a pandas Index.
print("Column names: ", data_frame_from_csv.columns, "", sep="\n")

Column names: 
Index(['col_1', 'col_2', 'col_3'], dtype='object')



In [13]:
# Select the second column by position (position 1, counting from zero);
# the result is a Series named after that column.
print("Second column: ", data_frame_from_csv.iloc[:, 1], "", sep="\n")

Second column: 
0    2
1    5
2    8
Name: col_2, dtype: int64



In [14]:
# Negative positions count from the end, so `.iloc[-1]` is the last row;
# it comes back as a Series indexed by the column names.
print("Last row: ", data_frame_from_csv.iloc[-1], "", sep="\n")

Last row: 
col_1    7
col_2    8
col_3    9
Name: 2, dtype: int64



In [15]:
# Summary statistics are available as methods.
# On the whole frame, `.mean()` yields one value per column.
print("Mean of the data frame: ", data_frame_from_csv.mean(axis=0), "", sep="\n")

# On a single column (here the second one, selected by position) it yields a scalar.
print("Mean of the second column: ", data_frame_from_csv.iloc[:, 1].mean(), "", sep="\n")

Mean of the data frame: 
col_1    4.0
col_2    5.0
col_3    6.0
dtype: float64

Mean of the second column: 
5.0



In [16]:
# A data frame can be converted back into a plain NumPy array.
print("Data frame to numpy array: ", data_frame_from_csv.to_numpy(), "", sep="\n")

Data frame to numpy array: 
[[1 2 3]
 [4 5 6]
 [7 8 9]]

