In [None]:
import pandas as pd

## Basic Methods and Attributes

In [None]:
# A DataFrame is pandas' core data structure: a two-dimensional table with
# labelled rows (the index) and labelled columns.
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=['x', 'y', 'z'],
    columns=['A', 'B', 'C'],
)

# Whole frame; the extra '\n' argument just leaves a blank line after it.
print(df, '\n')

# First two rows.
print(df.head(n=2), '\n')

# Last two rows.
print(df.tail(n=2), '\n')

# Called without an argument, head() and tail() return 5 rows (the first 5 and
# the last 5 respectively), or every row when the frame has fewer than 5.

In [None]:
# Column labels, shown as the pandas Index object that holds them.
print(df.columns)

# Row labels, converted to a plain Python list for a compact printout.
print(df.index.tolist())

In [None]:
# Structural summary of the frame: index, column names, non-null counts,
# dtypes and memory usage.
# info() writes its report straight to stdout and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df.info()

# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
print(df.describe())

# Number of distinct values in each column (a count, not the values themselves).
print(df.nunique())

# Dimensions of the frame as a (rows, columns) tuple.
print(df.shape)

# Total number of elements, i.e. rows * columns.
print(df.size)

## Loading in DataFrames from files

In [None]:
# Read the CSV into a DataFrame. The path is relative, so it is resolved
# against the directory the notebook kernel is running in.
education = pd.read_csv("datasets/International_Education_Costs.csv")

# Apply the same inspection calls as above to the loaded data.
# With no argument, head() and tail() each return 5 rows.
print("Head: \n", education.head(),'\n')
print("Tail: \n", education.tail())

In [None]:
# info() prints its report itself and returns None, so call it directly;
# print(education.info()) would add a stray "None" line after the report.
education.info()

In [None]:
# Descriptive statistics; by default describe() summarizes the numeric columns.
print(education.describe())

In [None]:
# Dimensions as a (rows, columns) tuple, then the total element count (rows * columns).
print(education.shape)
print(education.size)

# Column labels as a plain Python list.
print(list(education.columns))

## Loading different types of files

---

### **CSV (Comma-Separated Values)**

* Stores tabular data in plain text format.
* Each line represents a row; columns are separated by commas.
* **Human-readable** and can be opened with text editors or Excel.
* Does **not** store data types or a schema—every value is stored as text, so types must be inferred (or specified) each time the file is loaded.
* **No compression** or indexing; results in **larger file sizes**.
* **Slower** to read/write, especially with large datasets.
* Still widely used due to its **simplicity and compatibility** across tools.

---

### **Feather**

* A **binary columnar format** optimized for fast data reading/writing.
* Part of the **Apache Arrow** ecosystem.
* Designed for **in-memory analytics**, especially with **Pandas** and **R**.
* **Preserves data types and schema**, unlike CSV.
* Supports **lightweight compression**.
* **Not human-readable**, but much faster and more space-efficient than CSV.
* Ideal for **medium-sized datasets** and fast data exchange between tools.

---

### **Parquet**

* A **highly efficient columnar storage format**, maintained as an open-source Apache project (Apache Parquet).
* Best suited for **big data systems** like Spark, Hadoop, and cloud storage.
* Supports **compression**, **indexing**, and **complex nested data**.
* Retains detailed **schema and data types**.
* **Not human-readable**, but excellent for performance and scalability.
* Enables **fast access to specific columns**, improving query speed.
* Ideal for **large datasets**, data lakes, and distributed data processing.

---


In [None]:
# Load the CSV file; column dtypes are inferred by pandas while parsing.
bios = pd.read_csv("datasets/bios.csv") # csv file
# Print the column / dtype / memory summary.
bios.info()
# Total number of elements; as the cell's last expression it is shown as the cell output.
bios.size

In [None]:
# Load the Parquet file. pandas needs a parquet engine (pyarrow or fastparquet)
# installed for this call.
results = pd.read_parquet("datasets/results.parquet") # parquet file
# Print the column / dtype / memory summary.
results.info()
# Total number of elements; shown as the cell output.
results.size

In [None]:
# Load the Feather file (read through pyarrow).
# Presumably the same data as results.parquet, stored in another format — TODO confirm.
f_results = pd.read_feather("datasets/results.feather") # feather file
# Print the column / dtype / memory summary.
f_results.info()
# Total number of elements; shown as the cell output.
f_results.size

In [None]:
# Load the Excel workbook; with no sheet_name given, pandas reads the first sheet.
# Reading .xlsx files requires an Excel engine such as openpyxl to be installed.
olympics = pd.read_excel("datasets/olympics-data.xlsx") # excel file
# Print the column / dtype / memory summary.
olympics.info()
# Total number of elements; shown as the cell output.
olympics.size

In [None]:
# A DataFrame loaded from one format can be serialized to another with the
# to_* writer methods (to_parquet, to_csv, to_feather, ...).
#
# NOTE: no path is passed here, so no file is created. In recent pandas
# versions to_parquet() then returns the parquet-encoded data as a bytes
# object, which is what parkayy holds. Pass a file path as the first
# argument to write the data to disk instead.
parkayy = bios.to_parquet()

In [None]:
# display() renders the DataFrame as a rich table. A frame with more rows than
# pandas' display.max_rows option is truncated; with the default settings that
# means only the first 5 and last 5 rows are shown.
display(bios)

In [None]:
# Draw random rows with sample(); the first argument is how many rows to return.
bios.sample(10, random_state=1) # a fixed integer random_state seeds the draw, so the same rows are returned on every run

In [None]:
# Indexing using loc and iloc
#
# dataset.loc[]  - label-based indexing: selects rows and columns by their labels (names).
# dataset.iloc[] - integer-location-based indexing: selects by position, like a 2D array.
# General form: dataset.loc[rows, columns]
#
# bios was read without an index column, so its row labels are the default
# 0, 1, 2, ... and each label equals the row's zero-based position.
#
# A cell only renders its final expression, so the intermediate examples are
# passed to display(); otherwise their results would be computed and discarded.

display(bios.loc[0])  # a single label -> that row as a Series (here the first row)

display(bios.loc[[0, 1, 2]])  # a list of labels -> DataFrame with the first 3 rows
display(bios.loc[[23, 47, 89]])  # the rows labelled 23, 47 and 89

# Label slices work too; unlike ordinary Python slices, BOTH endpoints are included.
display(bios.loc[5:10])  # rows labelled 5 through 10 (6 rows)

# Row slice combined with a list of column names.
bios.loc[5:8, ["name", "born_date"]]

In [None]:
# loc can also be used for assignment. Uncommenting the next line would set
# "height" to 172.5 for the rows labelled 35450 through 35455 (modifying bios in place).
# bios.loc[35450:35455, "height"] = 172.5
# Show the name and height columns for that same label range (both endpoints included).
bios.loc[35450:35455, ['name', 'height']]

In [None]:
# Accessing a single value with at and iat
# at  - label-based:    at[row_label, column_label]
# iat - position-based: iat[row_position, column_position]

print(bios.at[45678, 'name'])
print(bios.iat[45678, 1])  # column position 1 is presumably the "name" column — TODO confirm against bios.columns

In [None]:
# Load the coffee sales CSV used by the sorting and iteration examples below.
coffee = pd.read_csv('datasets/coffee.csv')
# Bare last expression: the notebook renders the frame as the cell output.
coffee

In [None]:
# Sorting
# sort_values() returns a new, sorted DataFrame; coffee itself is left unchanged.

# Ascending order (the default) by a single column. The result is passed to
# display() because a cell only renders its last expression automatically;
# without it this sorted frame would be computed and thrown away.
display(coffee.sort_values(["Units Sold"]))

# Several sort keys: rows are ordered by "Units Sold" first, and ties are
# broken by "Coffee Type". A single ascending=False makes every key descending;
# pass a list of booleans (one per key) to choose the direction per column.
coffee.sort_values(["Units Sold", 'Coffee Type'], ascending = False)

In [None]:
# Iterating through rows with a for loop.
# iterrows() yields (index_label, row) pairs, where row is a Series keyed by column name.

# First pass: print each index label, then that row's "Units Sold" value.
for index, row in coffee.iterrows(): 
    print(index)
    print(row['Units Sold'],'\n') # pick one column's value out of the current row

# Second pass: print the index label followed by the entire row Series.
for index, row in coffee.iterrows(): 
    print(index, row)
