In [3]:
"""
===========================================================
PANDAS BASICS REVISION FILE
Tasks 1 → 10 (Series & DataFrame Fundamentals)

This file is structured for long-term memory retention.
Every section includes:
- Concept
- Code
- Explanation

Author: Your Data Science Journey 🚀
===========================================================
"""



In [4]:

import pandas as pd


# ===========================================================
# TASK 1 — Create Series from List & Dictionary
# ===========================================================
#
# Concept:
# A Series is a one-dimensional labeled array. It is built here
# twice: once from a list and once from a dictionary.

print("\n===== TASK 1: SERIES CREATION =====")

# List input: no labels are supplied, so the index is 0..n-1.
price_list = [120, 60, 80, 150, 200]
series_from_list = pd.Series(data=price_list)
print("\nSeries from List:", series_from_list, sep="\n")

# Dictionary input: every key becomes the label of its value.
price_dict = {"Apple": 120, "Banana": 60, "Orange": 80}
series_from_dict = pd.Series(data=price_dict)
print("\nSeries from Dictionary:", series_from_dict, sep="\n")

# Key Idea:
# Dictionary keys turn into index labels automatically.




===== TASK 1: SERIES CREATION =====

Series from List:
0    120
1     60
2     80
3    150
4    200
dtype: int64

Series from Dictionary:
Apple     120
Banana     60
Orange     80
dtype: int64


In [5]:

# ===========================================================
# TASK 2 — Create DataFrame
# ===========================================================
#
# Key Idea:
# A DataFrame is a 2D structure (rows + columns); think of it as
# an Excel sheet. Each dictionary key below becomes one column.

data = {
    "fruit": ["Apple", "Banana", "Orange", "Mango", "Grapes"],
    "price": [120, 60, 80, 150, 200],
    "origin": ["USA", "India", "Brazil", "Bangladesh", "Italy"],
}
df = pd.DataFrame(data=data)

print("\n===== TASK 2: DATAFRAME CREATION =====", df, sep="\n")




===== TASK 2: DATAFRAME CREATION =====
    fruit  price      origin
0   Apple    120         USA
1  Banana     60       India
2  Orange     80      Brazil
3   Mango    150  Bangladesh
4  Grapes    200       Italy


In [6]:

# ===========================================================
# TASK 3 — Access Row by Position (.iloc)
# ===========================================================
#
# Key Idea:
# .iloc selects by numerical position: df.iloc[row_index, column_index].
# Position 0 is the first row; ":" keeps every column of that row.

print("\n===== TASK 3: .iloc (Position-based indexing) =====")
print(df.iloc[0, :])




===== TASK 3: .iloc (Position-based indexing) =====
fruit     Apple
price       120
origin      USA
Name: 0, dtype: object


In [7]:

# ===========================================================
# TASK 4 — Access Row by Label (.loc)
# ===========================================================
#
# Key Idea:
# .loc selects by index label, not by position. That difference
# matters most when the index is not numeric; here the label 0
# happens to be the first row as well.

print("\n===== TASK 4: .loc (Label-based indexing) =====")
print(df.loc[0, :])




===== TASK 4: .loc (Label-based indexing) =====
fruit     Apple
price       120
origin      USA
Name: 0, dtype: object


In [8]:

# ===========================================================
# TASK 5 — Add New Column (Derived Column)
# ===========================================================
#
# Key Idea:
# A new column can be computed from an existing one. The
# multiplication is applied to the whole column at once
# (vectorized), so no loop is needed.

df["price_after_tax"] = df["price"].mul(1.1)

print("\n===== TASK 5: ADD NEW COLUMN =====", df, sep="\n")




===== TASK 5: ADD NEW COLUMN =====
    fruit  price      origin  price_after_tax
0   Apple    120         USA            132.0
1  Banana     60       India             66.0
2  Orange     80      Brazil             88.0
3   Mango    150  Bangladesh            165.0
4  Grapes    200       Italy            220.0


In [9]:

# ===========================================================
# TASK 6 — Delete Column and Row
# ===========================================================
#
# Key Idea:
# drop() removes data. The result must be assigned back
# unless inplace=True is used.

print("\n===== TASK 6: DELETE COLUMN & ROW =====")

# One call removes both the derived column and the row labeled 1.
df = df.drop(index=1, columns=["price_after_tax"])

print(df)




===== TASK 6: DELETE COLUMN & ROW =====
    fruit  price      origin
0   Apple    120         USA
2  Orange     80      Brazil
3   Mango    150  Bangladesh
4  Grapes    200       Italy


In [10]:

# ===========================================================
# TASK 7 — Filter & Sort Data
# ===========================================================
#
# Key Idea:
# Filtering keeps the rows where a boolean condition is True.
# Sorting only rearranges rows.

print("\n===== TASK 7: FILTER & SORT =====")

# Keep rows priced at 100 or more.
df_filtered = df.loc[df["price"].ge(100)]
print("\nFiltered (price >= 100):", df_filtered, sep="\n")

# Order rows by price, cheapest first (ascending is the default).
df_sorted = df.sort_values("price")
print("\nSorted by price:", df_sorted, sep="\n")




===== TASK 7: FILTER & SORT =====

Filtered (price >= 100):
    fruit  price      origin
0   Apple    120         USA
3   Mango    150  Bangladesh
4  Grapes    200       Italy

Sorted by price:
    fruit  price      origin
2  Orange     80      Brazil
0   Apple    120         USA
3   Mango    150  Bangladesh
4  Grapes    200       Italy


In [11]:

# ===========================================================
# TASK 8 — Reset Index
# ===========================================================
#
# Key Idea:
# After filtering, the index may look like 0,3,4.
# reset_index(drop=True) renumbers it to a clean 0,1,2 and
# discards the old labels instead of keeping them as a column.

df_reset = df_filtered.reset_index(drop=True)

print("\n===== TASK 8: RESET INDEX =====", df_reset, sep="\n")




===== TASK 8: RESET INDEX =====
    fruit  price      origin
0   Apple    120         USA
1   Mango    150  Bangladesh
2  Grapes    200       Italy


In [12]:

# ===========================================================
# TASK 9 — Rename Columns
# ===========================================================
#
# Key Idea:
# rename() takes a dictionary: old_name → new_name.
# Cleaning column names is part of real-world data cleaning.

print("\n===== TASK 9: RENAME COLUMNS =====")

df_renamed = df.rename(
    columns={"fruit": "product", "price": "unit_price", "origin": "country"}
)
print(df_renamed)

# Normalize every column name to lowercase. The names chosen above are
# already lowercase, so the second printout matches the first.
df_renamed.columns = df_renamed.columns.str.lower()
print("\nLowercase column names:", df_renamed, sep="\n")




===== TASK 9: RENAME COLUMNS =====
  product  unit_price     country
0   Apple         120         USA
2  Orange          80      Brazil
3   Mango         150  Bangladesh
4  Grapes         200       Italy

Lowercase column names:
  product  unit_price     country
0   Apple         120         USA
2  Orange          80      Brazil
3   Mango         150  Bangladesh
4  Grapes         200       Italy


In [13]:

# ===========================================================
# TASK 10 — DataFrame Info & Memory Usage
# ===========================================================
#
# Key Idea:
# .info()          → structure + dtype + missing values
# .memory_usage()  → memory consumption
# .describe()      → statistical summary
# dtype affects memory & ML performance

print("\n===== TASK 10: DATAFRAME INFO =====")

# info() writes its report itself, so it is called without print().
print("\n--- INFO ---")
df_renamed.info()

print("\n--- MEMORY USAGE ---", df_renamed.memory_usage(deep=True), sep="\n")

print("\n--- DESCRIBE (ALL) ---", df_renamed.describe(include="all"), sep="\n")

# Change dtype example: unit_price becomes float.
df_renamed = df_renamed.astype({"unit_price": float})

print("\n--- INFO AFTER DTYPE CHANGE ---")
df_renamed.info()




===== TASK 10: DATAFRAME INFO =====

--- INFO ---
<class 'pandas.DataFrame'>
Index: 4 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product     4 non-null      str  
 1   unit_price  4 non-null      int64
 2   country     4 non-null      str  
dtypes: int64(1), str(2)
memory usage: 128.0 bytes

--- MEMORY USAGE ---
Index          32
product       218
unit_price     32
country       220
dtype: int64

--- DESCRIBE (ALL) ---


       product  unit_price country
count        4     4.00000       4
unique       4         NaN       4
top      Apple         NaN     USA
freq         1         NaN       1
mean       NaN   137.50000     NaN
std        NaN    50.57997     NaN
min        NaN    80.00000     NaN
25%        NaN   110.00000     NaN
50%        NaN   135.00000     NaN
75%        NaN   162.50000     NaN
max        NaN   200.00000     NaN

--- INFO AFTER DTYPE CHANGE ---
<class 'pandas.DataFrame'>
Index: 4 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product     4 non-null      str    
 1   unit_price  4 non-null      float64
 2   country     4 non-null      str    
dtypes: float64(1), str(2)
memory usage: 128.0 bytes


In [14]:

"""
===========================================================
FINAL REVISION SUMMARY

Series:
- 1D labeled structure

DataFrame:
- 2D table structure

.loc:
- Label-based indexing

.iloc:
- Position-based indexing

Filtering:
- Boolean conditions

Sorting:
- sort_values()

Reset Index:
- reset_index(drop=True)

Rename Columns:
- rename(columns={})

Info & Memory:
- .info()
- .memory_usage()
- .describe()

If you understand this file fully,
you have mastered Pandas fundamentals.
===========================================================
"""




In [15]:
"""
===========================================================
PANDAS FILE INPUT / OUTPUT PRACTICE
Tasks 11 → 20

Focus:
- Reading files
- Writing files
- Handling large data
- Performance awareness
- Professional data loading habits

===========================================================
"""




In [16]:

import pandas as pd
import json


# ===========================================================
# SETUP: Create Sample Data
# ===========================================================
#
# A small frame for the I/O tasks. It carries one missing name and
# one missing price (given as None) so the later tasks have gaps
# to handle.

data = {
    "id": list(range(1, 6)),
    "name": ["Apple", "Banana", "Orange", "Mango", None],
    "price": [120, 60, 80, None, 200],
    "origin": ["USA", "India", "Brazil", "Bangladesh", "Italy"],
}
df = pd.DataFrame(data=data)

print("\n===== SETUP: CREATE SAMPLE DATA =====", df, sep="\n")




===== SETUP: CREATE SAMPLE DATA =====
   id    name  price      origin
0   1   Apple  120.0         USA
1   2  Banana   60.0       India
2   3  Orange   80.0      Brazil
3   4   Mango    NaN  Bangladesh
4   5     NaN  200.0       Italy


In [17]:

# ===========================================================
# TASK 11 — Read CSV & Excel Files
# ===========================================================
#
# Key Idea:
# read_csv() is the most common loader for real-world datasets.
# Excel reading requires openpyxl to be installed, which is why
# the Excel lines below are left commented out.

print("\n===== TASK 11: WRITE THEN READ CSV & EXCEL =====")

# Write the files first so there is something to read back.
df.to_csv(path_or_buf="sample.csv", index=False)
# df.to_excel("sample.xlsx", index=False)

# Read CSV
df_csv = pd.read_csv("sample.csv")
print("\nRead CSV:", df_csv, sep="\n")

# Read Excel
# df_excel = pd.read_excel("sample.xlsx")
# print("\nRead Excel:")
# print(df_excel)




===== TASK 11: WRITE THEN READ CSV & EXCEL =====

Read CSV:
   id    name  price      origin
0   1   Apple  120.0         USA
1   2  Banana   60.0       India
2   3  Orange   80.0      Brazil
3   4   Mango    NaN  Bangladesh
4   5     NaN  200.0       Italy


In [18]:

# ===========================================================
# TASK 12 — Write DataFrame to CSV & Excel
# ===========================================================
#
# Always pass index=False unless the index is meaningful;
# otherwise the row labels are written as an extra column.

print("\n===== TASK 12: WRITE DATAFRAME =====")

df.to_csv(path_or_buf="output.csv", index=False)
# df.to_excel("output.xlsx", index=False)




===== TASK 12: WRITE DATAFRAME =====


In [19]:

# ===========================================================
# TASK 13 — Load JSON Data into DataFrame
# ===========================================================
#
# Key Idea:
# JSON is common in APIs and web data.

print("\n===== TASK 13: JSON LOADING =====")

# Round trip: save the frame as JSON records, then load the file back.
df.to_json(path_or_buf="sample.json", orient="records", indent=4)

df_json = pd.read_json(path_or_buf="sample.json")
print(df_json)




===== TASK 13: JSON LOADING =====
   id    name  price      origin
0   1   Apple  120.0         USA
1   2  Banana   60.0       India
2   3  Orange   80.0      Brazil
3   4   Mango    NaN  Bangladesh
4   5     NaN  200.0       Italy


In [20]:

# ===========================================================
# TASK 14 — Select Specific Columns During Import
# ===========================================================
#
# usecols limits the import to the listed columns, which
# saves memory and improves speed.

print("\n===== TASK 14: READ SPECIFIC COLUMNS =====")

df_selected = pd.read_csv(
    "sample.csv",
    usecols=["name", "price"],
)
print(df_selected)




===== TASK 14: READ SPECIFIC COLUMNS =====
     name  price
0   Apple  120.0
1  Banana   60.0
2  Orange   80.0
3   Mango    NaN
4     NaN  200.0


In [21]:

# ===========================================================
# TASK 15 — Read Large CSV in Chunks
# ===========================================================
#
# Key Idea:
# Used when a file is too large for RAM. With chunksize set,
# read_csv yields the file piece by piece instead of loading it
# all at once.

print("\n===== TASK 15: READ IN CHUNKS =====")

# Simulating chunk reading: 2 rows per chunk on the 5-row sample file.
chunk_size = 2

# With chunksize, read_csv returns a reader object rather than a
# DataFrame. Using it as a context manager closes the underlying file
# as soon as the block is left, even if the loop stops early on an error.
with pd.read_csv("sample.csv", chunksize=chunk_size) as reader:
    for chunk in reader:
        print("\nChunk:")
        print(chunk)




===== TASK 15: READ IN CHUNKS =====

Chunk:
   id    name  price origin
0   1   Apple  120.0    USA
1   2  Banana   60.0  India



Chunk:
   id    name  price      origin
2   3  Orange   80.0      Brazil
3   4   Mango    NaN  Bangladesh

Chunk:
   id  name  price origin
4   5   NaN  200.0  Italy


In [22]:

# ===========================================================
# TASK 16 — Export Subset of DataFrame
# ===========================================================
#
# Always filter before exporting when only part of the data is needed.

print("\n===== TASK 16: EXPORT SUBSET =====")

# Keep rows priced above 100. The row with a missing price does not
# satisfy the comparison, so it is left out as well.
df_subset = df.loc[df["price"].gt(100)]
df_subset.to_csv(path_or_buf="high_price.csv", index=False)

print(df_subset)




===== TASK 16: EXPORT SUBSET =====
   id   name  price origin
0   1  Apple  120.0    USA
4   5    NaN  200.0  Italy


In [23]:

# ===========================================================
# TASK 17 — Handle Files with Missing Values
# ===========================================================
#
# Key Idea:
# Missing values appear as NaN.
# They must be cleaned before ML.

print("\n===== TASK 17: HANDLE MISSING VALUES =====")

df_missing = pd.read_csv("sample.csv")
print("\nBefore Handling Missing:", df_missing, sep="\n")

# One fillna call with a per-column mapping:
# missing numeric prices become 0, missing names become "Unknown".
df_missing = df_missing.fillna({"price": 0, "name": "Unknown"})
print("\nAfter Handling Missing:", df_missing, sep="\n")




===== TASK 17: HANDLE MISSING VALUES =====

Before Handling Missing:
   id    name  price      origin
0   1   Apple  120.0         USA
1   2  Banana   60.0       India
2   3  Orange   80.0      Brazil
3   4   Mango    NaN  Bangladesh
4   5     NaN  200.0       Italy

After Handling Missing:
   id     name  price      origin
0   1    Apple  120.0         USA
1   2   Banana   60.0       India
2   3   Orange   80.0      Brazil
3   4    Mango    0.0  Bangladesh
4   5  Unknown  200.0       Italy


In [24]:

# ===========================================================
# TASK 18 — Specify Dtypes While Reading
# ===========================================================
#
# Why?
# Choosing smaller dtypes at load time reduces memory usage in
# large datasets.

print("\n===== TASK 18: SPECIFY DTYPES =====")

df_typed = pd.read_csv(
    "sample.csv",
    dtype={
        "id": "int32",
        "price": "float32"
    }
)

# info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df_typed.info()




===== TASK 18: SPECIFY DTYPES =====
<class 'pandas.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      5 non-null      int32  
 1   name    4 non-null      str    
 2   price   4 non-null      float32
 3   origin  5 non-null      str    
dtypes: float32(1), int32(1), str(2)
memory usage: 252.0 bytes
None


In [25]:

# ===========================================================
# TASK 19 — Save & Load Compressed CSV
# ===========================================================
#
# Useful for large datasets: the CSV is gzip-compressed on write
# and decompressed again on read.

print("\n===== TASK 19: COMPRESSED CSV =====")

df.to_csv(path_or_buf="compressed.csv.gz", index=False, compression="gzip")

df_compressed = pd.read_csv(
    "compressed.csv.gz",
    compression="gzip",
)
print(df_compressed)




===== TASK 19: COMPRESSED CSV =====
   id    name  price      origin
0   1   Apple  120.0         USA
1   2  Banana   60.0       India
2   3  Orange   80.0      Brazil
3   4   Mango    NaN  Bangladesh
4   5     NaN  200.0       Italy


In [26]:

# ===========================================================
# TASK 20 — Practice Reading Kaggle Datasets
# ===========================================================

# The bare string below is only a note for the reader: nothing prints
# it. It holds the workflow that the closing message refers to.
"""
Steps for Kaggle:

1. Download dataset from kaggle.com
2. Place CSV file in project folder
3. Load using:

df = pd.read_csv("dataset_name.csv")

4. Immediately inspect:
   df.head()
   df.shape
   df.info()
   df.describe()

5. Check missing values:
   df.isnull().sum()

This workflow is STANDARD in industry.
"""

print(
    "\n===== TASK 20: KAGGLE DATASET PRACTICE =====",
    "Kaggle workflow instructions added above.",
    sep="\n",
)



===== TASK 20: KAGGLE DATASET PRACTICE =====
Kaggle workflow instructions added above.


In [27]:

# ===========================================================
# FINAL SUMMARY
# ===========================================================

"""
===========================================================
YOU NOW KNOW:

- read_csv()
- read_excel()
- read_json()
- to_csv()
- to_excel()
- usecols
- chunksize
- fillna()
- dtype specification
- compression handling

If you master this,
you are ready for REAL DATA.

Next Level:
- Data Cleaning Pipeline
- EDA (Exploratory Data Analysis)
- Feature Engineering
===========================================================
"""




In [28]:
"""
===========================================================
PANDAS DATA CLEANING & TRANSFORMATION PRACTICE
Tasks 21 → 30

Focus:
- Handling missing values
- Duplicates
- Type conversion
- String operations
- Lambda functions

===========================================================
"""




In [29]:

import pandas as pd
import numpy as np

# ===========================================================
# SETUP: Sample DataFrame with missing values and duplicates
# ===========================================================
#
# The frame is deliberately messy: stray spaces around two names,
# one missing name, one missing price, and a fully repeated last row.

data = {
    "id": [1, 2, 3, 4, 5, 5],
    "name": ["Apple ", "Banana", " Orange", None, "Grapes", "Grapes"],
    "price": [120, 60, np.nan, 150, 200, 200],
    "origin": ["USA", "India", "Brazil", "Bangladesh", "Italy", "Italy"],
}
df = pd.DataFrame(data=data)

print("\n===== SETUP =====", df, sep="\n")




===== SETUP =====
   id     name  price      origin
0   1   Apple   120.0         USA
1   2   Banana   60.0       India
2   3   Orange    NaN      Brazil
3   4      NaN  150.0  Bangladesh
4   5   Grapes  200.0       Italy
5   5   Grapes  200.0       Italy


In [30]:

# ===========================================================
# TASK 21 — Find Missing Values
# ===========================================================
#
# Key Idea:
# isnull() returns a Boolean mask (True where a value is missing).
# sum() on that mask counts the True values (NaN) per column.

print("\n===== TASK 21: FIND MISSING VALUES =====")

# Cell-by-cell mask of missing values
print(df.isnull())

# Count missing values per column
print("\nMissing count per column:", df.isnull().sum(), sep="\n")




===== TASK 21: FIND MISSING VALUES =====
      id   name  price  origin
0  False  False  False   False
1  False  False  False   False
2  False  False   True   False
3  False   True  False   False
4  False  False  False   False
5  False  False  False   False

Missing count per column:
id        0
name      1
price     1
origin    0
dtype: int64


In [31]:

# ===========================================================
# TASK 22 — Drop Rows & Columns with NaNs
# ===========================================================
#
# dropna() returns a new frame each time, so df itself keeps its
# NaN values for the next task.

print("\n===== TASK 22: DROP ROWS & COLUMNS WITH NaN =====")

# Row-wise: discard every row that holds at least one NaN.
df_drop_rows = df.dropna(axis="index")
print("\nRows dropped if NaN:", df_drop_rows, sep="\n")

# Column-wise: discard every column that holds at least one NaN.
df_drop_cols = df.dropna(axis="columns")
print("\nColumns dropped if NaN:", df_drop_cols, sep="\n")




===== TASK 22: DROP ROWS & COLUMNS WITH NaN =====

Rows dropped if NaN:
   id    name  price origin
0   1  Apple   120.0    USA
1   2  Banana   60.0  India
4   5  Grapes  200.0  Italy
5   5  Grapes  200.0  Italy

Columns dropped if NaN:
   id      origin
0   1         USA
1   2       India
2   3      Brazil
3   4  Bangladesh
4   5       Italy
5   5       Italy


In [32]:

# ===========================================================
# TASK 23 — Fill Missing Values (Mean/Median/Mode)
# ===========================================================

print("\n===== TASK 23: FILL MISSING VALUES =====")

# Per-column replacements in a single fillna call:
#   price → column mean (numeric)
#   name  → most frequent value; mode() returns a Series, [0] takes its first entry
df = df.fillna(
    {
        "price": df["price"].mean(),
        "name": df["name"].mode()[0],
    }
)

print(df)




===== TASK 23: FILL MISSING VALUES =====
   id     name  price      origin
0   1   Apple   120.0         USA
1   2   Banana   60.0       India
2   3   Orange  146.0      Brazil
3   4   Grapes  150.0  Bangladesh
4   5   Grapes  200.0       Italy
5   5   Grapes  200.0       Italy


In [33]:

# ===========================================================
# TASK 24 — Drop Duplicates
# ===========================================================
#
# Key Idea:
# drop_duplicates() compares all columns by default and keeps the
# first occurrence of each repeated row.
# Pass subset=["id"] to decide duplicates from specific columns only.

df_no_dup = df.drop_duplicates(keep="first")

print("\n===== TASK 24: DROP DUPLICATES =====", df_no_dup, sep="\n")




===== TASK 24: DROP DUPLICATES =====
   id     name  price      origin
0   1   Apple   120.0         USA
1   2   Banana   60.0       India
2   3   Orange  146.0      Brazil
3   4   Grapes  150.0  Bangladesh
4   5   Grapes  200.0       Italy


In [34]:

# ===========================================================
# TASK 25 — Convert Column Types
# ===========================================================
#
# astype() returns a converted copy of the column, so each result
# is assigned back to the frame.

print("\n===== TASK 25: COLUMN TYPE CONVERSION =====")

# Convert id to float
df["id"] = df["id"].astype(float)
# Convert price to int
df["price"] = df["price"].astype(int)
# Convert id to string. Gotcha: id is float at this point, so the
# resulting strings read "1.0", "2.0", ... rather than "1", "2", ...
df["id"] = df["id"].astype(str)

# info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()




===== TASK 25: COLUMN TYPE CONVERSION =====
<class 'pandas.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      6 non-null      str  
 1   name    6 non-null      str  
 2   price   6 non-null      int64
 3   origin  6 non-null      str  
dtypes: int64(1), str(3)
memory usage: 324.0 bytes
None


In [35]:

# ===========================================================
# TASK 26 — Strip Whitespace from Strings
# ===========================================================
#
# The .str accessor applies a string method to every value in the
# column; strip() removes leading and trailing whitespace.

df["name"] = df["name"].str.strip()

print("\n===== TASK 26: STRIP WHITESPACE =====", df["name"], sep="\n")




===== TASK 26: STRIP WHITESPACE =====
0     Apple
1    Banana
2    Orange
3    Grapes
4    Grapes
5    Grapes
Name: name, dtype: str


In [36]:

# ===========================================================
# TASK 27 — Lowercase / Uppercase Strings
# ===========================================================
#
# Two helper columns are appended: the name in all-lowercase and
# in all-uppercase form. The original name column is left as is.

print("\n===== TASK 27: LOWERCASE & UPPERCASE =====")

df = df.assign(
    name_lower=df["name"].str.lower(),
    name_upper=df["name"].str.upper(),
)
print(df.loc[:, ["name", "name_lower", "name_upper"]])




===== TASK 27: LOWERCASE & UPPERCASE =====
     name name_lower name_upper
0   Apple      apple      APPLE
1  Banana     banana     BANANA
2  Orange     orange     ORANGE
3  Grapes     grapes     GRAPES
4  Grapes     grapes     GRAPES
5  Grapes     grapes     GRAPES


In [37]:

# ===========================================================
# TASK 28 — Replace Values Conditionally
# ===========================================================

print("\n===== TASK 28: REPLACE VALUES CONDITIONALLY =====")

# Replace price < 100 → 100, leaving every other price untouched.
# Series.mask(cond, other) swaps in `other` wherever `cond` is True.
# It works on the whole column at once, instead of calling a Python
# lambda once per element through apply().
df["price"] = df["price"].mask(df["price"] < 100, 100)
print(df)




===== TASK 28: REPLACE VALUES CONDITIONALLY =====
    id    name  price      origin name_lower name_upper
0  1.0   Apple    120         USA      apple      APPLE
1  2.0  Banana    100       India     banana     BANANA
2  3.0  Orange    146      Brazil     orange     ORANGE
3  4.0  Grapes    150  Bangladesh     grapes     GRAPES
4  5.0  Grapes    200       Italy     grapes     GRAPES
5  5.0  Grapes    200       Italy     grapes     GRAPES


In [38]:

# ===========================================================
# TASK 29 — Extract Substring from Column
# ===========================================================

print("\n===== TASK 29: EXTRACT SUBSTRING =====")

# Keep the first 3 letters of each name;
# .str.slice(stop=3) is the method form of .str[:3].
df["name_prefix"] = df["name"].str.slice(stop=3)
print(df.loc[:, ["name", "name_prefix"]])




===== TASK 29: EXTRACT SUBSTRING =====
     name name_prefix
0   Apple         App
1  Banana         Ban
2  Orange         Ora
3  Grapes         Gra
4  Grapes         Gra
5  Grapes         Gra


In [39]:

# ===========================================================
# TASK 30 — Apply Lambda Function on Column
# ===========================================================
#
# apply() calls the lambda once for each value in the column.

# New column: price after 10% tax, rounded to 2 decimals.
df["price_after_tax"] = df["price"].apply(
    lambda amount: round(amount * 1.1, 2)
)

print("\n===== TASK 30: APPLY LAMBDA FUNCTION =====", df, sep="\n")




===== TASK 30: APPLY LAMBDA FUNCTION =====
    id    name  price      origin name_lower name_upper name_prefix  \
0  1.0   Apple    120         USA      apple      APPLE         App   
1  2.0  Banana    100       India     banana     BANANA         Ban   
2  3.0  Orange    146      Brazil     orange     ORANGE         Ora   
3  4.0  Grapes    150  Bangladesh     grapes     GRAPES         Gra   
4  5.0  Grapes    200       Italy     grapes     GRAPES         Gra   
5  5.0  Grapes    200       Italy     grapes     GRAPES         Gra   

   price_after_tax  
0            132.0  
1            110.0  
2            160.6  
3            165.0  
4            220.0  
5            220.0  


In [40]:

"""
===========================================================
FINAL NOTES:

- Missing values:
    isnull(), dropna(), fillna()

- Duplicates:
    drop_duplicates()

- Type conversion:
    astype()

- String cleaning:
    str.strip(), str.lower(), str.upper(), slicing

- Conditional replacement:
    apply(lambda x: ...)

- Substrings:
    str[:n]

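- Lambda functions:
    apply(lambda x: ...) runs the function once per element: flexible,
    but not vectorized. Prefer built-in column operations when one exists.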

Master these tasks → You can handle 80% of messy real datasets!
===========================================================
"""




In [41]:
"""
===========================================================
PANDAS EDA PRACTICE
Tasks 31 → 40

Focus:
- Summary statistics
- Grouping
- Correlation
- Outlier detection
- Ranking
- Pivot tables
- Crosstab

===========================================================
"""




In [42]:

import pandas as pd
import numpy as np


# ===========================================================
# SETUP: Sample Dataset
# ===========================================================
# Ten sales records used by every EDA task below. Each key of
# `data` becomes one DataFrame column, in insertion order.

print("\n===== SETUP DATA =====")

data = dict(
    product=[
        "Apple", "Banana", "Orange", "Mango", "Grapes",
        "Apple", "Mango", "Banana", "Apple", "Grapes",
    ],
    # Every record belongs to the same category.
    category=["Fruit"] * 10,
    price=[120, 60, 80, 150, 200, 130, 170, 65, 125, 210],
    quantity=[10, 20, 15, 8, 5, 12, 7, 18, 9, 4],
    origin=[
        "USA", "India", "Brazil", "Bangladesh", "Italy",
        "USA", "Bangladesh", "India", "USA", "Italy",
    ],
)

df = pd.DataFrame(data)
print(df)




===== SETUP DATA =====
  product category  price  quantity      origin
0   Apple    Fruit    120        10         USA
1  Banana    Fruit     60        20       India
2  Orange    Fruit     80        15      Brazil
3   Mango    Fruit    150         8  Bangladesh
4  Grapes    Fruit    200         5       Italy
5   Apple    Fruit    130        12         USA
6   Mango    Fruit    170         7  Bangladesh
7  Banana    Fruit     65        18       India
8   Apple    Fruit    125         9         USA
9  Grapes    Fruit    210         4       Italy


In [43]:

# ===========================================================
# TASK 31 — Summary Statistics
# ===========================================================
# describe() reports count, mean, std, min, percentiles and max.

print("\n===== TASK 31: SUMMARY STATISTICS =====")

# Default call: numeric columns only.
numeric_summary = df.describe()
print(numeric_summary)

# include="all" also covers the text columns (unique / top / freq).
print("\nIncluding categorical:")
full_summary = df.describe(include="all")
print(full_summary)




===== TASK 31: SUMMARY STATISTICS =====
            price   quantity
count   10.000000  10.000000
mean   131.000000  10.800000
std     52.746775   5.391351
min     60.000000   4.000000
25%     90.000000   7.250000
50%    127.500000   9.500000
75%    165.000000  14.250000
max    210.000000  20.000000

Including categorical:
       product category       price   quantity origin
count       10       10   10.000000  10.000000     10
unique       5        1         NaN        NaN      5
top      Apple    Fruit         NaN        NaN    USA
freq         3       10         NaN        NaN      3
mean       NaN      NaN  131.000000  10.800000    NaN
std        NaN      NaN   52.746775   5.391351    NaN
min        NaN      NaN   60.000000   4.000000    NaN
25%        NaN      NaN   90.000000   7.250000    NaN
50%        NaN      NaN  127.500000   9.500000    NaN
75%        NaN      NaN  165.000000  14.250000    NaN
max        NaN      NaN  210.000000  20.000000    NaN


In [44]:

# ===========================================================
# TASK 32 — Count Unique Values Per Column
# ===========================================================

print("\n===== TASK 32: UNIQUE VALUE COUNT =====")

# Number of distinct values in every column.
distinct_counts = df.nunique()
print(distinct_counts)

# The distinct values themselves, for one column.
print("\nUnique products:")
product_names = df["product"].unique()
print(product_names)




===== TASK 32: UNIQUE VALUE COUNT =====
product      5
category     1
price       10
quantity    10
origin       5
dtype: int64

Unique products:
<StringArray>
['Apple', 'Banana', 'Orange', 'Mango', 'Grapes']
Length: 5, dtype: str


In [45]:

# ===========================================================
# TASK 33 — Group By & Compute Mean
# ===========================================================
# Key Idea:
# groupby() splits data into groups → apply function → combine

print("\n===== TASK 33: GROUPBY MEAN =====")

# One grouping object, reused for both summaries below.
by_product = df.groupby("product")

# Average price per product.
grouped = by_product["price"].mean()
print(grouped)

# Multiple aggregations: a different function per column,
# written as output_column=(source_column, function).
grouped_multi = by_product.agg(
    price=("price", "mean"),
    quantity=("quantity", "sum"),
)
print("\nMultiple aggregation:")
print(grouped_multi)




===== TASK 33: GROUPBY MEAN =====
product
Apple     125.0
Banana     62.5
Grapes    205.0
Mango     160.0
Orange     80.0
Name: price, dtype: float64

Multiple aggregation:
         price  quantity
product                 
Apple    125.0        31
Banana    62.5        38
Grapes   205.0         9
Mango    160.0        15
Orange    80.0        15


In [46]:

# ===========================================================
# TASK 34 — Correlation Matrix
# ===========================================================
# Reading a correlation coefficient:
#   -1 → strong negative
#    0 → no relation
#   +1 → strong positive

print("\n===== TASK 34: CORRELATION MATRIX =====")

# numeric_only=True leaves the text columns out of the matrix;
# "pearson" is the default method, spelled out here for clarity.
correlation = df.corr(method="pearson", numeric_only=True)
print(correlation)




===== TASK 34: CORRELATION MATRIX =====
            price  quantity
price     1.00000  -0.96234
quantity -0.96234   1.00000


In [47]:

# ===========================================================
# TASK 35 — Top 5 Values of Column
# ===========================================================

print("\n===== TASK 35: TOP 5 VALUES =====")

# Approach 1: sort by price from highest to lowest, keep the first five rows.
price_descending = df.sort_values(by="price", ascending=False)
top5 = price_descending.head(5)
print(top5)

# Approach 2: nlargest() does the same selection in a single call.
print("\nUsing nlargest():")
print(df.nlargest(n=5, columns="price"))




===== TASK 35: TOP 5 VALUES =====
  product category  price  quantity      origin
9  Grapes    Fruit    210         4       Italy
4  Grapes    Fruit    200         5       Italy
6   Mango    Fruit    170         7  Bangladesh
3   Mango    Fruit    150         8  Bangladesh
5   Apple    Fruit    130        12         USA

Using nlargest():
  product category  price  quantity      origin
9  Grapes    Fruit    210         4       Italy
4  Grapes    Fruit    200         5       Italy
6   Mango    Fruit    170         7  Bangladesh
3   Mango    Fruit    150         8  Bangladesh
5   Apple    Fruit    130        12         USA


In [48]:

# ===========================================================
# TASK 36 — Identify Outliers Using IQR
# ===========================================================
# IQR method:
# a value is an outlier when it falls outside
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

print("\n===== TASK 36: OUTLIERS USING IQR =====")

prices = df["price"]

# Quartiles and the spread between them.
Q1 = prices.quantile(0.25)
Q3 = prices.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 IQRs beyond each quartile.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Flag rows strictly below the lower fence or strictly above the upper one.
below_fence = prices < lower_bound
above_fence = prices > upper_bound
outliers = df[below_fence | above_fence]

print("Outliers:")
print(outliers)




===== TASK 36: OUTLIERS USING IQR =====
Outliers:
Empty DataFrame
Columns: [product, category, price, quantity, origin]
Index: []


In [49]:

# ===========================================================
# TASK 37 — Rank Numerical Column
# ===========================================================
# rank() assigns each row a position based on its value.

print("\n===== TASK 37: RANK COLUMN =====")

# ascending=False → the highest price receives rank 1.
descending_rank = df["price"].rank(ascending=False)
df["price_rank"] = descending_rank

# Show only the columns relevant to the ranking.
rank_view = df[["product", "price", "price_rank"]]
print(rank_view)




===== TASK 37: RANK COLUMN =====
  product  price  price_rank
0   Apple    120         7.0
1  Banana     60        10.0
2  Orange     80         8.0
3   Mango    150         4.0
4  Grapes    200         2.0
5   Apple    130         5.0
6   Mango    170         3.0
7  Banana     65         9.0
8   Apple    125         6.0
9  Grapes    210         1.0


In [50]:

# ===========================================================
# TASK 38 — Filter Based on Multiple Conditions
# ===========================================================
# Combining boolean masks:
#   & → AND
#   | → OR
# When conditions are written inline with comparison operators,
# always wrap each one in parentheses.

print("\n===== TASK 38: MULTIPLE CONDITIONS =====")

# One boolean mask per condition; gt() is the method form of ">".
price_above_100 = df["price"].gt(100)
quantity_above_8 = df["quantity"].gt(8)

# Keep only the rows where both conditions hold.
filtered = df[price_above_100 & quantity_above_8]
print(filtered)




===== TASK 38: MULTIPLE CONDITIONS =====
  product category  price  quantity origin  price_rank
0   Apple    Fruit    120        10    USA         7.0
5   Apple    Fruit    130        12    USA         5.0
8   Apple    Fruit    125         9    USA         6.0


In [51]:

# ===========================================================
# TASK 39 — Pivot Table Basics
# ===========================================================
# Pivot table = advanced groupby with rows & columns

print("\n===== TASK 39: PIVOT TABLE =====")

# Mean price for each (product, origin) pair: products become the rows,
# origins become the columns.
pivot = df.pivot_table(
    values="price", index="product", columns="origin", aggfunc="mean"
)

print(pivot)




===== TASK 39: PIVOT TABLE =====
origin   Bangladesh  Brazil  India  Italy    USA
product                                         
Apple           NaN     NaN    NaN    NaN  125.0
Banana          NaN     NaN   62.5    NaN    NaN
Grapes          NaN     NaN    NaN  205.0    NaN
Mango         160.0     NaN    NaN    NaN    NaN
Orange          NaN    80.0    NaN    NaN    NaN


In [52]:

# ===========================================================
# TASK 40 — Cross Tabulation
# ===========================================================
# Crosstab counts frequency between two categorical columns

print("\n===== TASK 40: CROSS TABULATION =====")

# Rows are products, columns are origins, each cell is a row count.
crosstab = pd.crosstab(index=df["product"], columns=df["origin"])
print(crosstab)




===== TASK 40: CROSS TABULATION =====
origin   Bangladesh  Brazil  India  Italy  USA
product                                       
Apple             0       0      0      0    3
Banana            0       0      2      0    0
Grapes            0       0      0      2    0
Mango             2       0      0      0    0
Orange            0       1      0      0    0


In [53]:

"""
===========================================================
FINAL EDA CONCEPT SUMMARY

describe() → summary stats
nunique() → distinct counts
groupby() → split-apply-combine
corr() → correlation relationships
nlargest() → top values
IQR → outlier detection
rank() → ordering values
boolean filtering → multiple conditions
pivot_table() → reshaped grouped summary
crosstab() → frequency counts between two categorical columns
"""

