In [2]:
import numpy as np
import pandas as pd

In [6]:
# create a sample dataset2
data = {
    'old_name': ['Alice', 'Bob', np.nan, 'David'],
    'age': [25, np.nan, 30, 35],
    'score': [85, 90, np.nan, 88]
}
data2 = {
    'name': ['Eve', 'Frank', 'Grace', 'Heidi', 'Alice'],
    'age': [28, 22, 32, 29, 25],
    'score': [92, 85, 88, 91, 85]
}

data3 = {
    'name': ['Ivan', 'Judy', 'Charlie', 'Mallory', 'David'],
    'age': [24, 27, 30, 31, 35],
    'score': [87, 89, 90, 86, 88]
}

df = pd.DataFrame(data)
df1 = pd.DataFrame(data2)
df2 = pd.DataFrame(data3)

df.rename(columns={'old_name': 'name'}, inplace=True)

df["age_float"] = df["age"].astype("float")
# df["column_name"] = pd.to_datetime(df["column_name"])

df["age_doubled"] = df["age"] * 2
df["age_doubled_again"] = df["age_doubled"].apply(lambda x: x * 2)

df_backup = df.copy()
df = df.dropna()
df = df.dropna(axis=1)
print(f"Dropped:\n{df}")

df = df_backup.copy() 
df["age"] = df["age"].fillna(0)
print(f"Filled:\n{df}")

df = df_backup.copy()
df.fillna(method='ffill')
print(f"Filled with forward fill:\n{df}")
df = df_backup.copy()
df.fillna(method='bfill')
print(f"Filled with backward fill:\n{df}")

df = df_backup.copy()
df["score"] = df["score"].interpolate(method='linear')
print(f"Interpolated:\n{df}")
df = df_backup.copy()
df["score"] = df["score"].fillna(df["score"].mean())
print(f"Filled with mean:\n{df}")
df = df_backup.copy()
df["score"] = df["score"].fillna(df["score"].median())
print(f"Filled with median:\n{df}")
df = df_backup.copy()

Dropped:
    name   age  score  age_float  age_doubled  age_doubled_again
0  Alice  25.0   85.0       25.0         50.0              100.0
3  David  35.0   88.0       35.0         70.0              140.0
Filled:
    name   age  score  age_float  age_doubled  age_doubled_again
0  Alice  25.0   85.0       25.0         50.0              100.0
1    Bob   0.0   90.0        NaN          NaN                NaN
2    NaN  30.0    NaN       30.0         60.0              120.0
3  David  35.0   88.0       35.0         70.0              140.0
Filled with forward fill:
    name   age  score  age_float  age_doubled  age_doubled_again
0  Alice  25.0   85.0       25.0         50.0              100.0
1    Bob   NaN   90.0        NaN          NaN                NaN
2    NaN  30.0    NaN       30.0         60.0              120.0
3  David  35.0   88.0       35.0         70.0              140.0
Filled with backward fill:
    name   age  score  age_float  age_doubled  age_doubled_again
0  Alice  25.0   85.

  df.fillna(method='ffill')
  df.fillna(method='bfill')


In [9]:
# rows
combined = pd.concat([df1, df2], axis=0)
print(f"Combined:\n{combined}")
# columns
combined = pd.concat([df1, df2], axis=1)
print(f"Combined:\n{combined}")

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
merged = pd.merge(df, df1, on='name', how='inner')
print(f"Merged:\n{merged}")
merged = pd.merge(df, df1, on='name', how='outer')
print(f"Merged:\n{merged}")


Combined:
      name  age  score
0      Eve   28     92
1    Frank   22     85
2    Grace   32     88
3    Heidi   29     91
4    Alice   25     85
0     Ivan   24     87
1     Judy   27     89
2  Charlie   30     90
3  Mallory   31     86
4    David   35     88
Combined:
    name  age  score     name  age  score
0    Eve   28     92     Ivan   24     87
1  Frank   22     85     Judy   27     89
2  Grace   32     88  Charlie   30     90
3  Heidi   29     91  Mallory   31     86
4  Alice   25     85    David   35     88
Merged:
    name  age_x  score_x  age_float  age_doubled  age_doubled_again  age_y  \
0  Alice   25.0     85.0       25.0         50.0              100.0     25   

   score_y  
0       85  
Merged:
    name  age_x  score_x  age_float  age_doubled  age_doubled_again  age_y  \
0  Alice   25.0     85.0       25.0         50.0              100.0   25.0   
1    Bob    NaN     90.0        NaN          NaN                NaN    NaN   
2    NaN   30.0      NaN       30.0       

In [11]:
# Excercize 1

# create a sample dataset
data = {
    'name': ['Alice', 'Bob', np.nan, 'David'],
    'age': [25, np.nan, 30, 35],
    'score': [85, 90, np.nan, 88]
}
df = pd.DataFrame(data)
print(f"Original data:\n{df}")

df["age"] = df["age"].fillna(df["age"].mean())
df["score"] = df["score"].interpolate()
df = df.rename(columns={'name': 'student_name', 'score': 'exam_score'})
print(f"Dataset:\n{df}")


Original data:
    name   age  score
0  Alice  25.0   85.0
1    Bob   NaN   90.0
2    NaN  30.0    NaN
3  David  35.0   88.0
Dataset:
  student_name   age  exam_score
0        Alice  25.0        85.0
1          Bob  30.0        90.0
2          NaN  30.0        89.0
3        David  35.0        88.0


In [14]:
# excercize 2: Merge Two Datasets and Perform Data Transformation

# create a sample dataframes
df1 = pd.DataFrame(
    {
        "id": [1, 2, 3 ],
        "name": ['Alice', 'Bob', 'Charlie'],
        "age": [25, 30, 35]
    }
)

df2 = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "score": [85, 90, 88]
    }
)

print(f"Dataset 1:\n{df1}")
print(f"Dataset 2:\n{df2}")

merged = pd.merge(df1, df2, on='id', how='inner')
print(f"Merged:\n{merged}")

merged['score_percentage'] = (merged['score'] / 100) * 100
print(f"Transformed:\n{merged}")


Dataset 1:
   id     name  age
0   1    Alice   25
1   2      Bob   30
2   3  Charlie   35
Dataset 2:
   id  score
0   1     85
1   2     90
2   3     88
Merged:
   id     name  age  score
0   1    Alice   25     85
1   2      Bob   30     90
2   3  Charlie   35     88
Transformed:
   id     name  age  score  score_percentage
0   1    Alice   25     85              85.0
1   2      Bob   30     90              90.0
2   3  Charlie   35     88              88.0
