In [17]:
import pandas as pd

# Creating a sample dataset of an imaginary AI Research Team
data = {
    'Employee_ID': [101, 102, 103, 104, 105, 106, 107, 108],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Henry'],
    'Department': ['Engineering', 'Data Science', 'Engineering', 'Data Science', 'HR', 'Engineering', 'Data Science', 'HR'],
    'Salary': [85000, 92000, 88000, 95000, 60000, 105000, 91000, 62000],
    'Years_Exp': [3, 5, 4, 7, 2, 10, 5, 3],
    'Remote': [True, False, True, True, False, False, True, False]
}

df = pd.DataFrame(data)

# Save it to your computer as a CSV
df.to_csv('practice_data.csv', index=False)

print("File 'practice_data.csv' has been created!")

File 'practice_data.csv' has been created!


In [19]:
import numpy as np

# Let's intentionally break some data
df.loc[1, 'Salary'] = np.nan
df.loc[3, 'Years_Exp'] = np.nan
df.loc[5, 'Department'] = np.nan

print(df)

   Employee_ID     Name    Department    Salary  Years_Exp  Remote
0          101    Alice   Engineering   85000.0        3.0    True
1          102      Bob  Data Science       NaN        5.0   False
2          103  Charlie   Engineering   88000.0        4.0    True
3          104    David  Data Science   95000.0        NaN    True
4          105      Eve            HR   60000.0        2.0   False
5          106    Frank           NaN  105000.0       10.0   False
6          107    Grace  Data Science   91000.0        5.0    True
7          108    Henry            HR   62000.0        3.0   False


In [3]:
# 1. Total count of missing values per column
print(df.isnull().sum())

# 2. The percentage of missing data (Expert move)
# This helps you decide if you should drop the column or fix it
print(df.isnull().mean() * 100)

Employee_ID    0
Name           0
Department     1
Salary         1
Years_Exp      1
Remote         0
dtype: int64
Employee_ID     0.0
Name            0.0
Department     12.5
Salary         12.5
Years_Exp      12.5
Remote          0.0
dtype: float64


In [21]:
# Clean Salary: Fill the missing Salary with the Median salary of the team.

median_val = df['Salary'].median()
df['Salary'] = df['Salary'].fillna(median_val)

In [23]:
# Clean Experience: Fill the missing Years_Exp with the number 0

df['Years_Exp'] = df['Years_Exp'].fillna(0)

In [25]:
# Clean Department: Since we don't know the department, fill it with the string "Bench".

df['Department'] = df['Department'].fillna("Bench")

In [26]:
print(df)

   Employee_ID     Name    Department    Salary  Years_Exp  Remote
0          101    Alice   Engineering   85000.0        3.0    True
1          102      Bob  Data Science   88000.0        5.0   False
2          103  Charlie   Engineering   88000.0        4.0    True
3          104    David  Data Science   95000.0        0.0    True
4          105      Eve            HR   60000.0        2.0   False
5          106    Frank         Bench  105000.0       10.0   False
6          107    Grace  Data Science   91000.0        5.0    True
7          108    Henry            HR   62000.0        3.0   False


In [27]:
# Hint for the code:
encoded_df = pd.get_dummies(df, columns=['Department'])
print(encoded_df.head())

   Employee_ID     Name   Salary  Years_Exp  Remote  Department_Bench  \
0          101    Alice  85000.0        3.0    True             False   
1          102      Bob  88000.0        5.0   False             False   
2          103  Charlie  88000.0        4.0    True             False   
3          104    David  95000.0        0.0    True             False   
4          105      Eve  60000.0        2.0   False             False   

   Department_Data Science  Department_Engineering  Department_HR  
0                    False                    True          False  
1                     True                   False          False  
2                    False                    True          False  
3                     True                   False          False  
4                    False                   False           True  


In [28]:
# 1. Find the boundaries
min_val = df['Salary'].min()
max_val = df['Salary'].max()

# 2. Apply the formula to the whole column at once
df['Salary_Scaled'] = (df['Salary'] - min_val) / (max_val - min_val)

print(df[['Name', 'Salary', 'Salary_Scaled']])

      Name    Salary  Salary_Scaled
0    Alice   85000.0       0.555556
1      Bob   88000.0       0.622222
2  Charlie   88000.0       0.622222
3    David   95000.0       0.777778
4      Eve   60000.0       0.000000
5    Frank  105000.0       1.000000
6    Grace   91000.0       0.688889
7    Henry   62000.0       0.044444


In [29]:
# 1. Find the boundaries
min_val = df['Years_Exp'].min()
max_val = df['Years_Exp'].max()

# 2. Apply the formula to the whole column at once
df['Years_Exp_Scaled'] = (df['Years_Exp'] - min_val) / (max_val - min_val)

print(df[['Name', 'Years_Exp', 'Years_Exp_Scaled']])

      Name  Years_Exp  Years_Exp_Scaled
0    Alice        3.0               0.3
1      Bob        5.0               0.5
2  Charlie        4.0               0.4
3    David        0.0               0.0
4      Eve        2.0               0.2
5    Frank       10.0               1.0
6    Grace        5.0               0.5
7    Henry        3.0               0.3


In [30]:
import pandas as pd

# Load the real Titanic dataset from a public URL
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Let's see what we are dealing with
print("Dataset Shape:", df.shape)
print("\nMissing Values:")
print(df.isnull().sum())
print("\nFirst 5 rows:")
print(df.head())

Dataset Shape: (891, 12)

Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ti

In [31]:
median_val = df['Age'].median()
df['Age'] = df['Age'].fillna(median_val)

In [35]:
df['Sex'].map({'male': 0, 'female': 1})

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64

In [33]:
min_val = df['Fare'].min()
max_val = df['Fare'].max()

# 2. Apply the formula to the whole column at once
df['Fare_Scaled'] = (df['Fare'] - min_val) / (max_val - min_val)

print(df[['Name', 'Fare', 'Fare_Scaled']])

                                                  Name     Fare  Fare_Scaled
0                              Braund, Mr. Owen Harris   7.2500     0.014151
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  71.2833     0.139136
2                               Heikkinen, Miss. Laina   7.9250     0.015469
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  53.1000     0.103644
4                             Allen, Mr. William Henry   8.0500     0.015713
..                                                 ...      ...          ...
886                              Montvila, Rev. Juozas  13.0000     0.025374
887                       Graham, Miss. Margaret Edith  30.0000     0.058556
888           Johnston, Miss. Catherine Helen "Carrie"  23.4500     0.045771
889                              Behr, Mr. Karl Howell  30.0000     0.058556
890                                Dooley, Mr. Patrick   7.7500     0.015127

[891 rows x 3 columns]


In [36]:
# Assuming you've run your cleaning code:
print("--- Final Cleanliness Check ---")
print(df[['Age', 'Sex', 'Fare_Scaled']].head())

print("\n--- Any Remaining NaNs? ---")
print(df[['Age', 'Sex', 'Fare_Scaled']].isnull().sum())

--- Final Cleanliness Check ---
    Age     Sex  Fare_Scaled
0  22.0    male     0.014151
1  38.0  female     0.139136
2  26.0  female     0.015469
3  35.0  female     0.103644
4  35.0    male     0.015713

--- Any Remaining NaNs? ---
Age            0
Sex            0
Fare_Scaled    0
dtype: int64
