In [None]:
#1

In [1]:
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [2]:
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


In [None]:
#2

In [3]:
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [None]:
TASK

In [None]:
#1

In [2]:
import kagglehub
import pandas as pd
import os

path = kagglehub.dataset_download("yasserh/titanic-dataset")

print("Path to dataset files:", path)
print("Files in dataset folder:", os.listdir(path))

df = pd.read_csv(path + "/Titanic-Dataset.csv")

print("First rows of dataset:\n", df.head())
print("\nMissing values in each column:\n", df.isnull().sum())

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/vboxuser/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1
Files in dataset folder: ['Titanic-Dataset.csv']
First rows of dataset:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85 

In [None]:
#2

In [3]:
from sklearn.preprocessing import MinMaxScaler

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("Numerical columns:", list(numeric_cols))

scaler = MinMaxScaler()

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("\nNormalized numerical columns (first rows):\n")
print(df[numeric_cols].head())


Numerical columns: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Normalized numerical columns (first rows):

   PassengerId  Survived  Pclass       Age  SibSp  Parch      Fare
0     0.000000       0.0     1.0  0.271174  0.125    0.0  0.014151
1     0.001124       1.0     0.0  0.472229  0.125    0.0  0.139136
2     0.002247       1.0     1.0  0.321438  0.000    0.0  0.015469
3     0.003371       1.0     0.0  0.434531  0.125    0.0  0.103644
4     0.004494       0.0     1.0  0.434531  0.000    0.0  0.015713


In [None]:
#3

In [4]:
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", list(categorical_cols))

for col in categorical_cols:
    df[col] = (df[col].astype(str).str.strip().str.lower().str.title())

print("\nStandardized categorical columns (first rows):\n")
print(df[categorical_cols].head())

before = len(df)
df = df.drop_duplicates()
after = len(df)

print(f"\nDuplicates removed: {before - after}")
print("\nFinal dataset shape:", df.shape)

Categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

Standardized categorical columns (first rows):

                                                Name     Sex  \
0                            Braund, Mr. Owen Harris    Male   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  Female   
2                             Heikkinen, Miss. Laina  Female   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  Female   
4                           Allen, Mr. William Henry    Male   

             Ticket Cabin Embarked  
0         A/5 21171   Nan        S  
1          Pc 17599   C85        C  
2  Ston/O2. 3101282   Nan        S  
3            113803  C123        S  
4            373450   Nan        S  

Duplicates removed: 0

Final dataset shape: (891, 12)


In [None]:
#4

In [5]:
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[(df[column] >= lower) & (df[column] <= upper)]

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in numeric_cols:
    df = handle_outliers(df, col)

print("After handling outliers, dataset shape:", df.shape)

After handling outliers, dataset shape: (436, 12)


In [None]:
#5

In [6]:
print("\nFinal dataset ready for analysis:\n")
print(df.head())
print("\nShape:", df.shape)


Final dataset ready for analysis:

   PassengerId  Survived  Pclass  \
0     0.000000       0.0     1.0   
2     0.002247       1.0     1.0   
3     0.003371       1.0     0.0   
4     0.004494       0.0     1.0   
6     0.006742       0.0     0.0   

                                           Name     Sex       Age  SibSp  \
0                       Braund, Mr. Owen Harris    Male  0.271174  0.125   
2                        Heikkinen, Miss. Laina  Female  0.321438  0.000   
3  Futrelle, Mrs. Jacques Heath (Lily May Peel)  Female  0.434531  0.125   
4                      Allen, Mr. William Henry    Male  0.434531  0.000   
6                       Mccarthy, Mr. Timothy J    Male  0.673285  0.000   

   Parch            Ticket      Fare Cabin Embarked  
0    0.0         A/5 21171  0.014151   Nan        S  
2    0.0  Ston/O2. 3101282  0.015469   Nan        S  
3    0.0            113803  0.103644  C123        S  
4    0.0            373450  0.015713   Nan        S  
6    0.0            