In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from XRBID.Sources import LoadSources

I am following this [tutorial](https://www.youtube.com/watch?v=2uvysYbKdjM). 

In [2]:
# Small 4x3 demo frame: rows labelled x, y, z, zz and columns A, B, C.
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
    ],
    index=["x", "y", "z", "zz"],
    columns=["A", "B", "C"],
)

In [4]:
# head() returns the first 5 rows by default; this frame has only 4, so all of them are shown.
df.head()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9
zz,10,11,12


In [5]:
# Pass a row count to limit the preview to the first 2 rows.
df.head(2)

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6


In [10]:
# Row labels as a plain Python list.
df.index.tolist()

['x', 'y', 'z', 'zz']

In [12]:
# Prints a summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, x to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [14]:
# Number of distinct values in each column.
df.nunique()

A    3
B    3
C    3
dtype: int64

In [11]:
# (number of rows, number of columns)
df.shape

(4, 3)

In [22]:
# Same summary again: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, x to zz
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       4 non-null      int64
 1   B       4 non-null      int64
 2   C       4 non-null      int64
dtypes: int64(3)
memory usage: 128.0+ bytes


In [12]:
# Total number of cells, i.e. rows * columns.
df.size

12

# Load DataFrames

In [4]:
# Warm-up dataset, read over HTTP from the KeithGalli/complete-pandas-tutorial
# repository on GitHub.
coffee = pd.read_csv(
    "https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial"
    "/refs/heads/master/warmup-data/coffee.csv"
)
# These two files are read from the notebook's working directory.
results = pd.read_parquet("results.parquet")
bios = pd.read_csv("bios.csv")

# Accessing Data with Pandas


In [25]:
# First 10 rows.
coffee.head(10)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [26]:
# Last 10 rows.
coffee.tail(10)

Unnamed: 0,Day,Coffee Type,Units Sold
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [28]:
# 10 randomly chosen rows; random_state fixes the seed so the same rows come back on every run.
coffee.sample(10, random_state=1)

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40
2,Tuesday,Espresso,30
10,Saturday,Espresso,45
4,Wednesday,Espresso,35
1,Monday,Latte,15
12,Sunday,Espresso,45
0,Monday,Espresso,25
13,Sunday,Latte,35


## Accessing specific values from a dataframe

There are several ways to do that, namely:
1. using the `loc` indexer
2. using the `iloc` indexer
3. using the `at` indexer
4. using the `iat` indexer

### loc property
Access a group of rows and columns by labels or a boolean array.<br>
`coffee.loc[Rows, Columns]`

In [24]:
# A list of index labels selects exactly those rows, with all columns.
coffee.loc[[0, 1, 5]]

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
5,Wednesday,Latte,25


In [30]:
coffee.loc[5:7]  # label slices include BOTH endpoints, unlike ordinary Python slicing

Unnamed: 0,Day,Coffee Type,Units Sold
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30


In [29]:
# Rows labelled 0 through 3 (inclusive), restricted to two columns.
selected_columns = ["Day", "Units Sold"]
coffee.loc[0:3, selected_columns]

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20


### iloc
Integer-position-based indexing.<br>
`iloc[Rows, Columns]`

In [28]:
# iloc is purely positional: rows at positions 0-2 (the end of the slice is
# excluded) and the column at position 2. Passing the column as a list ([2])
# keeps the result a DataFrame rather than a Series.
coffee.iloc[0:3, [2]]

Unnamed: 0,Units Sold
0,25
1,15
2,30


### at
Access a single value. Use `at` if you only need to get or set a single value in a DataFrame or Series.<br>
`at[Row, Column]`


In [34]:
# Two single-cell lookups by (row label, column name), shown together as a tuple.
coffee.at[0, "Units Sold"], coffee.at[0, "Day"]

(np.int64(25), 'Monday')

### iat
Access a single value for a row/column pair by integer position

In [37]:
# Two single-cell lookups by (row position, column position), shown together as a tuple.
coffee.iat[1, 2], coffee.iat[2, 0]

(np.int64(15), 'Tuesday')

In [None]:
# Use the "Day" column as the row labels so rows can be selected by day name.
# NOTE(review): this rebinds the index of `coffee` in place, so the integer
# labels 0..13 are no longer available to .loc/.at until the index is reset.
coffee.index = coffee["Day"]
coffee.head()

In [55]:
# Label-based slice over day names; both endpoints are included.
# NOTE(review): relies on the index having been set to "Day" in the previous cell.
coffee.loc["Monday":"Wednesday", "Units Sold"]

Series([], Name: Units Sold, dtype: int64)

In [56]:
# The Day-index demo above replaced the integer row labels, and label-based
# lookups such as .loc[1:3] or .at[0, ...] fail on a day-name index. Restore the
# default 0..n-1 index first (a no-op if the index is already the default one).
# drop=True discards the old index instead of inserting it as a column; the
# "Day" column itself is untouched.
coffee = coffee.reset_index(drop=True)

# .loc slices are inclusive, so this overwrites "Units Sold" for rows 1, 2 and 3.
coffee.loc[1:3, "Units Sold"] = 10

In [57]:
# Confirm the assignment: "Units Sold" for rows 1-3 should now be 10.
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35


In [59]:
# at: label-based lookup of a single cell (row label 0, column "Units Sold").
coffee.at[0, "Units Sold"]

np.int64(25)

In [5]:
# iat: positional lookup of a single cell (first row, first column).
coffee.iat[0, 0]

'Monday'

In [6]:
# iloc with two scalar positions returns the same single cell as iat.
coffee.iloc[0, 0]

'Monday'

In [7]:
# Bracket access with a column name returns that column as a Series.
coffee["Day"]

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [8]:
# Attribute access returns the same column, but only works for names that are
# valid Python identifiers; "Units Sold" (with a space) needs the bracket form.
coffee.Day

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [9]:
# The column name contains a space, so bracket access is the only option here.
coffee["Units Sold"]

0     25
1     15
2     30
3     20
4     35
5     25
6     40
7     30
8     45
9     35
10    45
11    35
12    45
13    35
Name: Units Sold, dtype: int64

## Sort Data

In [10]:
# Highest "Units Sold" first; returns a new, sorted frame and leaves `coffee` unchanged.
coffee.sort_values("Units Sold", ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
2,Tuesday,Espresso,30
7,Thursday,Latte,30


In [13]:
# Sort by "Units Sold" first; rows with equal "Units Sold" are then ordered by
# "Coffee Type". A single ascending=False applies to both keys (descending).
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
4,Wednesday,Espresso,35
7,Thursday,Latte,30
2,Tuesday,Espresso,30


In [14]:
# One flag per sort key: "Units Sold" descending, "Coffee Type" ascending.
# Booleans (rather than 0/1) match the documented type of `ascending`.
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=[False, True])

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
2,Tuesday,Espresso,30
7,Thursday,Latte,30


## Iterate through a DataFrame (slow: row-by-row loops give up pandas' vectorized performance)

In [None]:
# iterrows() yields (index label, row-as-Series) pairs.
# Treat the yielded row as read-only: never modify something you are iterating
# over. Depending on the dtypes, the iterator hands back a copy rather than a
# view, so writing to it is not guaranteed to change `coffee`.
for index, rows in coffee.iterrows():
    # Index label, then the units sold, then blank lines as a visual separator.
    print(f"{index}\n{rows['Units Sold']}\n\n\n")

0
25



1
15



2
30



3
20



4
35



5
25



6
40



7
30



8
45



9
35



10
45



11
35



12
45



13
35





In [23]:
# itertuples() yields one namedtuple per row. Column names that are not valid
# Python identifiers ("Coffee Type", "Units Sold") are replaced by positional
# names (_2, _3), as the output below shows.
for row in coffee.itertuples():
    print(row)

Pandas(Index=0, Day='Monday', _2='Espresso', _3=25)
Pandas(Index=1, Day='Monday', _2='Latte', _3=15)
Pandas(Index=2, Day='Tuesday', _2='Espresso', _3=30)
Pandas(Index=3, Day='Tuesday', _2='Latte', _3=20)
Pandas(Index=4, Day='Wednesday', _2='Espresso', _3=35)
Pandas(Index=5, Day='Wednesday', _2='Latte', _3=25)
Pandas(Index=6, Day='Thursday', _2='Espresso', _3=40)
Pandas(Index=7, Day='Thursday', _2='Latte', _3=30)
Pandas(Index=8, Day='Friday', _2='Espresso', _3=45)
Pandas(Index=9, Day='Friday', _2='Latte', _3=35)
Pandas(Index=10, Day='Saturday', _2='Espresso', _3=45)
Pandas(Index=11, Day='Saturday', _2='Latte', _3=35)
Pandas(Index=12, Day='Sunday', _2='Espresso', _3=45)
Pandas(Index=13, Day='Sunday', _2='Latte', _3=35)
