Problem statement 3:
Perform the following operations using Python
1. Scan all variables for missing values and inconsistencies. If there are missing values and/or
inconsistencies, use the following techniques to deal with them.
a) Delete rows or columns b) Replace missing values with the mean c) Replace missing values with
the mode d) Replace missing values with the median
2. Apply data transformations on at least one of the variables. The purpose of this transformation
should be one of the following: to change the scale for better understanding of the
variable, to convert a non-linear relation into a linear one, or to decrease the skewness and
convert the distribution into a normal distribution.


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

# Fetch the Titanic dataset through seaborn; every later cell works on `df`.
df = sns.load_dataset('titanic')

# Column count before any column is removed.
print(df.shape[1])

# A column is discarded when its null count exceeds half the number of rows
# (integer division, so the cut-off is floor(len(df) / 2)).
threshold = len(df) // 2

# Per-column null counts, then keep only the labels above the cut-off.
null_counts = df.isnull().sum()
columns_to_delete = null_counts[null_counts > threshold].index

# Remove those columns from the DataFrame in place.
df.drop(columns=columns_to_delete, inplace=True)

# Column count after the drop, for comparison with the first print.
print(df.shape[1])


In [18]:
# Bare expression: display the DataFrame as it stands after the column drop above.
df


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [None]:

# Only int64/float64 columns are scanned; mean and median are undefined for the
# text/boolean/categorical columns.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Imputation technique used for every numeric column: 'mean', 'mode' or 'median'.
# Filling with one technique leaves no NaN for a second one to act on, so exactly
# one is applied. 'mean' reproduces the result of the earlier mean -> mode ->
# median sequence, in which only the mean fill ever had an effect.
fill_strategy = 'mean'

# Scan variables for missing values and inconsistencies
for column in numeric_columns:
    values = df[column]
    missing_values = values.isnull().sum()
    # An "inconsistency" here is a NaN or +/-inf entry; counted in one
    # vectorized pass instead of a Python-level lambda per element.
    inconsistencies = (values.isnull() | np.isinf(values)).sum()
    print(f"Variable: {column}")
    print(f"Missing Values: {missing_values}")
    print(f"Inconsistencies: {inconsistencies}")
    print()

    # Deal with missing values and inconsistencies
    # a) Delete rows or columns (alternative to imputation, left disabled):
    #    df.dropna(subset=[column], inplace=True)  # drop rows missing this column
    #    df.drop(column, axis=1, inplace=True)     # drop the column itself

    if missing_values == 0:
        # Nothing to impute; also avoids computing statistics that go unused.
        continue

    # b) mean / c) mode / d) median, selected by `fill_strategy`
    if fill_strategy == 'mean':
        fill_value = values.mean()
    elif fill_strategy == 'mode':
        # mode() may return several values; take the first one.
        fill_value = values.mode().iloc[0]
    elif fill_strategy == 'median':
        fill_value = values.median()
    else:
        raise ValueError(f"Unknown fill_strategy: {fill_strategy!r}")

    # Assign the filled column back. `df[column].fillna(..., inplace=True)`
    # acts on a temporary object: pandas 2.x warns about it and under
    # Copy-on-Write it leaves `df` unchanged.
    df[column] = values.fillna(fill_value)

# Verify the updated DataFrame after dealing with missing values and inconsistencies
print("Updated DataFrame:")
print(df)
  

In [None]:
# Per-column count of values that are still null.
df.isna().sum()

In [21]:
# Bare expression: display the DataFrame after the missing-value handling above.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,29.699118,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [None]:
# 2. Apply data transformations on at least one variable
# Choose a variable on which to apply a data transformation

variable_to_transform = 'fare'
source_values = df[variable_to_transform]

# 1. Change the scale for better understanding of the variable
#    (values expressed in thousands of the original unit).
df[f'{variable_to_transform}_scaled_variable'] = source_values / 1000

# 2. Convert a non-linear relation into a linear one (square-root transform)
df[f'{variable_to_transform}_linear_variable'] = np.sqrt(source_values)

# 3. Decrease skewness and convert the distribution into a normal distribution.
#    The natural log is undefined at 0: np.log(0) yields -inf (with a
#    RuntimeWarning), which then corrupts mean/std of the new column. Values
#    <= 0 are therefore masked to NaN first; positive values give exactly the
#    same result as a plain np.log. (np.log1p is an alternative if zero values
#    must map to a finite number.)
positive_values = source_values.where(source_values > 0)
df[f'{variable_to_transform}_log_transformed_variable'] = np.log(positive_values)


In [23]:
# Bare expression: display the DataFrame including the three derived fare columns.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_scaled_variable,fare_linear_variable,fare_log_transformed_variable
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,Southampton,no,False,0.007250,2.692582,1.981001
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,0.071283,8.442944,4.266662
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,0.007925,2.815138,2.070022
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,Southampton,yes,False,0.053100,7.286975,3.972177
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,Southampton,no,True,0.008050,2.837252,2.085672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,Southampton,no,True,0.013000,3.605551,2.564949
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,Southampton,yes,True,0.030000,5.477226,3.401197
888,0,3,female,29.699118,1,2,23.4500,S,Third,woman,False,Southampton,no,False,0.023450,4.842520,3.154870
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,0.030000,5.477226,3.401197
