41. Merging two datasets(inner and outer join)

In [1]:
import pandas as pd

# Sample DataFrames
df1 = pd.DataFrame({'id':[1,2,3], 'value1':['A','B','C']})
df2 = pd.DataFrame({'id':[2,3,4], 'value2':['X','Y','Z']})

# Inner join
inner_merge = pd.merge(df1, df2, on='id', how='inner')

# Outer join
outer_merge = pd.merge(df1, df2, on='id', how='outer')

print(inner_merge)
print(outer_merge)

   id value1 value2
0   2      B      X
1   3      C      Y
   id value1 value2
0   1      A    NaN
1   2      B      X
2   3      C      Y
3   4    NaN      Z


42. Pivot table for total sales

In [2]:
# Sample dataset
data = pd.DataFrame({
    'Category':['Electronics','Electronics','Clothing','Clothing'],
    'Sales':[200,150,100,250]
})

pivot = pd.pivot_table(data, values='Sales', index='Category', aggfunc='sum')
print(pivot)

             Sales
Category          
Clothing       350
Electronics    350


43. Reshaping dataframe using melt()

In [3]:
wide = pd.DataFrame({
    'id':[1,2],
    'Math':[90,80],
    'Science':[85,75]
})

long = pd.melt(wide, id_vars=['id'], var_name='Subject', value_name='Score')
print(long)

   id  Subject  Score
0   1     Math     90
1   2     Math     80
2   1  Science     85
3   2  Science     75


44. Detecting and handling missing values

In [4]:
df = pd.DataFrame({
    'A':[1,2,None,4],
    'B':['x',None,'y','z']
})

# Detect
print(df.isnull().sum())

# Handle: fill with default values
df_filled = df.fillna({'A':df['A'].mean(), 'B':'missing'})
print(df_filled)

# Or drop rows with missing values
df_dropped = df.dropna()
print(df_dropped)

A    1
B    1
dtype: int64
          A        B
0  1.000000        x
1  2.000000  missing
2  2.333333        y
3  4.000000        z
     A  B
0  1.0  x
3  4.0  z


45. Encoding categorical variables using one-hot encoding

In [5]:
df = pd.DataFrame({'Color':['Red','Blue','Green','Red']})

encoded = pd.get_dummies(df, columns=['Color'])
print(encoded)

   Color_Blue  Color_Green  Color_Red
0       False        False       True
1        True        False      False
2       False         True      False
3       False        False       True


46. Converting column to datetime & time-based indexing

In [6]:
df = pd.DataFrame({
    'Date':['2025-01-01','2025-01-02','2025-01-03'],
    'Sales':[100,200,150]
})

# Convert to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Set as index
df = df.set_index('Date')

# Time-based indexing
print(df.loc['2025-01-02'])

Sales    200
Name: 2025-01-02 00:00:00, dtype: int64


47. Forward and backward fill on time-series data

In [7]:
import pandas as pd

# Sample time-series
ts = pd.DataFrame({
    'Date': pd.date_range('2025-01-01', periods=5),
    'Value':[10, None, None, 20, None]
}).set_index('Date')

# Forward fill
ffill = ts.ffill()

# Backward fill
bfill = ts.bfill()

print(ffill)
print(bfill)

            Value
Date             
2025-01-01   10.0
2025-01-02   10.0
2025-01-03   10.0
2025-01-04   20.0
2025-01-05   20.0
            Value
Date             
2025-01-01   10.0
2025-01-02   20.0
2025-01-03   20.0
2025-01-04   20.0
2025-01-05    NaN


48. Normalizing and standarizing numerical features

In [8]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = pd.DataFrame({'Feature':[10,20,30,40,50]})

# Normalize (0–1 scale)
scaler = MinMaxScaler()
df['Normalized'] = scaler.fit_transform(df[['Feature']])

# Standardize (mean=0, std=1)
std_scaler = StandardScaler()
df['Standardized'] = std_scaler.fit_transform(df[['Feature']])

print(df)

   Feature  Normalized  Standardized
0       10        0.00     -1.414214
1       20        0.25     -0.707107
2       30        0.50      0.000000
3       40        0.75      0.707107
4       50        1.00      1.414214


49. Reducing memory usage of dataframe

In [9]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'int_col': np.random.randint(0,1000, size=10000),
    'float_col': np.random.rand(10000),
    'cat_col': np.random.choice(['A','B','C'], size=10000)
})

# Convert int to smaller dtype
df['int_col'] = df['int_col'].astype('int16')

# Convert float to smaller dtype
df['float_col'] = df['float_col'].astype('float32')

# Convert object to category
df['cat_col'] = df['cat_col'].astype('category')

print(df.info(memory_usage='deep'))

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   int_col    10000 non-null  int16   
 1   float_col  10000 non-null  float32 
 2   cat_col    10000 non-null  category
dtypes: category(1), float32(1), int16(1)
memory usage: 68.6 KB
None


50. Processing large CSV file using chunking

In [11]:
import pandas as pd
import numpy as np

# Simulate a large dataset
rows = 10000
df_large = pd.DataFrame({
    "id": range(rows),
    "value": np.random.randint(1, 100, size=rows)
})

# Save to a temporary CSV
df_large.to_csv("simulated_large.csv", index=False)

# Read in chunks
chunk_iter = pd.read_csv("simulated_large.csv", chunksize=2000)

for i, chunk in enumerate(chunk_iter):
    print(f"Processing chunk {i+1} with {len(chunk)} rows")
    print("Chunk average:", chunk["value"].mean())

Processing chunk 1 with 2000 rows
Chunk average: 50.1175
Processing chunk 2 with 2000 rows
Chunk average: 50.434
Processing chunk 3 with 2000 rows
Chunk average: 50.9055
Processing chunk 4 with 2000 rows
Chunk average: 49.1885
Processing chunk 5 with 2000 rows
Chunk average: 50.4755


51. Reusable data cleaning function

In [13]:
def clean_data(df):
    # Drop duplicates
    df = df.drop_duplicates()
    # Fill missing numeric with mean
    for col in df.select_dtypes(include='number'):
        df[col] = df[col].fillna(df[col].mean())
    # Fill missing categorical with mode
    for col in df.select_dtypes(include='object'):
        df[col] = df[col].fillna(df[col].mode()[0])
    return df
print(df)

      int_col  float_col cat_col
0         953   0.466697       B
1         938   0.500477       A
2         794   0.122961       B
3         571   0.707239       A
4         238   0.082230       C
...       ...        ...     ...
9995      425   0.195040       A
9996      362   0.201111       A
9997      432   0.159121       A
9998      797   0.520129       A
9999      656   0.883197       A

[10000 rows x 3 columns]


52. Implementing simple data-cleaning pipeline

In [14]:
import pandas as pd

df = pd.DataFrame({
    'A':[1,2,None,4],
    'B':['x',None,'y','z']
})

# Pipeline steps
df = df.drop_duplicates()
df['A'] = df['A'].fillna(df['A'].mean())
df['B'] = df['B'].fillna('missing')

print(df)

          A        B
0  1.000000        x
1  2.000000  missing
2  2.333333        y
3  4.000000        z


53. Ingesting data from CSV,database,and API

In [26]:
import pandas as pd
import requests

# Example API (JSONPlaceholder)
response = requests.get("https://jsonplaceholder.typicode.com/posts")
api_df = pd.DataFrame(response.json())

print(api_df.head())

   userId  id                                              title  \
0       1   1  sunt aut facere repellat provident occaecati e...   
1       1   2                                       qui est esse   
2       1   3  ea molestias quasi exercitationem repellat qui...   
3       1   4                               eum et est occaecati   
4       1   5                                 nesciunt quas odio   

                                                body  
0  quia et suscipit\nsuscipit recusandae consequu...  
1  est rerum tempore vitae\nsequi sint nihil repr...  
2  et iusto sed quo iure\nvoluptatem occaecati om...  
3  ullam et saepe reiciendis voluptatem adipisci\...  
4  repudiandae veniam quaerat sunt sed\nalias aut...  


In [35]:
import pandas as pd
import sqlite3
import requests

# 1. CSV ingestion
csv_df = pd.DataFrame({'id':[1,2], 'name':['Honey','Bunny']})
csv_df.to_csv('data.csv', index=False)
csv_df = pd.read_csv('data.csv')

# 2. Database ingestion (SQLite)
conn = sqlite3.connect('example.db')
cursor = conn.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS sample (id INTEGER, name TEXT)")
cursor.execute("INSERT INTO sample VALUES (1,'Alice'), (2,'Bob')")
conn.commit()

db_df = pd.read_sql_query("SELECT * FROM sample", conn)

# 3. API ingestion (real endpoint)
response = requests.get("https://jsonplaceholder.typicode.com/posts")
api_df = pd.DataFrame(response.json())

print("CSV Data:\n", csv_df.head())
print("DB Data:\n", db_df.head())
print("API Data:\n", api_df.head())

CSV Data:
    id   name
0   1  Honey
1   2  Bunny
DB Data:
    id   name
0   1  Alice
1   2    Bob
2   1  Alice
3   2    Bob
4   1  Alice
API Data:
    userId  id                                              title  \
0       1   1  sunt aut facere repellat provident occaecati e...   
1       1   2                                       qui est esse   
2       1   3  ea molestias quasi exercitationem repellat qui...   
3       1   4                               eum et est occaecati   
4       1   5                                 nesciunt quas odio   

                                                body  
0  quia et suscipit\nsuscipit recusandae consequu...  
1  est rerum tempore vitae\nsequi sint nihil repr...  
2  et iusto sed quo iure\nvoluptatem occaecati om...  
3  ullam et saepe reiciendis voluptatem adipisci\...  
4  repudiandae veniam quaerat sunt sed\nalias aut...  


54. Merging multiple datasets into a unified table

In [36]:
import pandas as pd

df1 = pd.DataFrame({'id':[1,2], 'name':['Ross','Rachel']})
df2 = pd.DataFrame({'id':[2,3], 'age':[25,30]})
df3 = pd.DataFrame({'id':[1,3], 'city':['NY','LA']})

# Merge sequentially
merged = df1.merge(df2, on='id', how='outer').merge(df3, on='id', how='outer')
print(merged)

   id    name   age city
0   1    Ross   NaN   NY
1   2  Rachel  25.0  NaN
2   3     NaN  30.0   LA


55. Implementing RFM(Recency,Frequency,Monetary)

In [37]:
import pandas as pd
import datetime as dt

# Sample transactions
data = pd.DataFrame({
    'CustomerID':[1,1,2,2,3],
    'Date':[dt.date(2025,1,1), dt.date(2025,1,10),
            dt.date(2025,1,5), dt.date(2025,1,20),
            dt.date(2025,1,15)],
    'Amount':[100,200,150,300,400]
})

# Reference date
ref_date = dt.date(2025,1,31)

rfm = data.groupby('CustomerID').agg({
    'Date': lambda x: (ref_date - max(x)).days,   # Recency
    'CustomerID': 'count',                        # Frequency
    'Amount': 'sum'                               # Monetary
})

rfm.rename(columns={'Date':'Recency','CustomerID':'Frequency','Amount':'Monetary'}, inplace=True)
print(rfm)

            Recency  Frequency  Monetary
CustomerID                              
1                21          2       300
2                11          2       450
3                16          1       400


56. Computing CLV(Customer Lifetime Value)

In [38]:
# Simplified CLV: average purchase value × purchase frequency × customer lifespan
transactions = pd.DataFrame({
    'CustomerID':[1,1,2,2,2,3],
    'Amount':[100,200,150,300,250,400]
})

clv = transactions.groupby('CustomerID').agg({
    'Amount':'mean'
}).rename(columns={'Amount':'AvgPurchaseValue'})

clv['PurchaseFrequency'] = transactions.groupby('CustomerID').size()
clv['CustomerLifespan'] = 3  # assume 3 years for demo
clv['CLV'] = clv['AvgPurchaseValue'] * clv['PurchaseFrequency'] * clv['CustomerLifespan']

print(clv)

            AvgPurchaseValue  PurchaseFrequency  CustomerLifespan     CLV
CustomerID                                                               
1                 150.000000                  2                 3   900.0
2                 233.333333                  3                 3  2100.0
3                 400.000000                  1                 3  1200.0


57. Storing processed data into CSV or database

In [39]:
# Save to CSV
rfm.to_csv('rfm_results.csv', index=True)

# Save to SQLite database
import sqlite3
conn = sqlite3.connect('results.db')
rfm.to_sql('rfm_table', conn, if_exists='replace', index=True)
conn.close()

58. Designing simple end-to-end data pipeline

In [40]:
import pandas as pd
import sqlite3

def ingest_csv(path):
    return pd.read_csv(path)

def clean_data(df):
    df = df.drop_duplicates()
    df = df.fillna(0)
    return df

def transform(df):
    df['Total'] = df.select_dtypes(include='number').sum(axis=1)
    return df

def store(df, db_path, table_name):
    conn = sqlite3.connect(db_path)
    df.to_sql(table_name, conn, if_exists='replace', index=False)
    conn.close()

# Pipeline execution
raw = ingest_csv('data.csv')
clean = clean_data(raw)
transformed = transform(clean)
store(transformed, 'pipeline_results.db', 'final_table')