In [None]:
import pandas as pd
# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='tested.csv')
# Leave the frame as the cell's last expression so the notebook renders it
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# **First, clean the dataset by removing the unwanted columns that have no use for machine learning**

In [None]:
# Drop the 'Name', 'PassengerId' and 'Ticket' columns, which are not used for machine learning
data = df.drop(['Name', 'PassengerId', 'Ticket'], axis='columns')
print(data)

     Survived  Pclass     Sex   Age  SibSp  Parch      Fare Cabin Embarked
0           0       3    male  34.5      0      0    7.8292   NaN        Q
1           1       3  female  47.0      1      0    7.0000   NaN        S
2           0       2    male  62.0      0      0    9.6875   NaN        Q
3           0       3    male  27.0      0      0    8.6625   NaN        S
4           1       3  female  22.0      1      1   12.2875   NaN        S
..        ...     ...     ...   ...    ...    ...       ...   ...      ...
413         0       3    male   NaN      0      0    8.0500   NaN        S
414         1       1  female  39.0      0      0  108.9000  C105        C
415         0       3    male  38.5      0      0    7.2500   NaN        S
416         0       3    male   NaN      0      0    8.0500   NaN        S
417         0       3    male   NaN      1      1   22.3583   NaN        C

[418 rows x 9 columns]


# **Now calculate the percentage of missing values in each column and remove the columns that have more than 75% missing values**

In [None]:
# Share of missing entries per column, expressed as a percentage
null_percentage = data.isna().mean().mul(100)
print(null_percentage)

# Discard every column whose missing share is strictly above 75%
data_cleaned = data.drop(
    columns=null_percentage.loc[null_percentage.gt(75)].index
)
print(data_cleaned)

Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age         20.574163
SibSp        0.000000
Parch        0.000000
Fare         0.239234
Cabin       78.229665
Embarked     0.000000
dtype: float64
     Survived  Pclass     Sex   Age  SibSp  Parch      Fare Embarked
0           0       3    male  34.5      0      0    7.8292        Q
1           1       3  female  47.0      1      0    7.0000        S
2           0       2    male  62.0      0      0    9.6875        Q
3           0       3    male  27.0      0      0    8.6625        S
4           1       3  female  22.0      1      1   12.2875        S
..        ...     ...     ...   ...    ...    ...       ...      ...
413         0       3    male   NaN      0      0    8.0500        S
414         1       1  female  39.0      0      0  108.9000        C
415         0       3    male  38.5      0      0    7.2500        S
416         0       3    male   NaN      0      0    8.0500        S
417         0       3    mal

# **Next, perform Advanced Data Cleaning with Fuzzy String Matching**


#### Installing required libraries

In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

#### Implementing Fuzzy String Matching

In [None]:
from fuzzywuzzy import fuzz

# Fuzzy matching on 'Embarked'
def fuzzy_match(x, choices, threshold=80):
    """Return the entry of ``choices`` that most closely resembles ``x``.

    Similarity is ``fuzz.ratio`` computed on the lower-cased string forms of
    both values, so the comparison is case-insensitive.

    Args:
        x: Value to look up; converted with ``str`` before comparison.
        choices: Iterable of candidate values.
        threshold: Minimum score a candidate must reach to be accepted.

    Returns:
        The highest-scoring candidate whose score is positive and at least
        ``threshold`` (the earliest one on ties), or ``None`` when ``x`` is
        missing (``None`` / NaN) or no candidate qualifies.
    """
    # A missing value must not be compared as the literal text "none"/"nan",
    # which could spuriously match a similarly spelled choice.
    # ``x != x`` is true only for NaN.
    if x is None or (isinstance(x, float) and x != x):
        return None

    needle = str(x).lower()  # loop-invariant, so normalise it once
    best_match = None
    best_score = 0
    for choice in choices:
        score = fuzz.ratio(needle, str(choice).lower())
        # Strict '>' keeps the first candidate when several share the top score.
        if score > best_score and score >= threshold:
            best_score = score
            best_match = choice
    return best_match

# Candidate spellings: every distinct non-null value currently present in 'Embarked'
unique_embarked = data_cleaned['Embarked'].dropna().unique()
# Replace each entry with its best fuzzy match among those candidates
# (entries without a qualifying match become None)
data_cleaned['Embarked'] = data_cleaned['Embarked'].apply(fuzzy_match, args=(unique_embarked,))


# Fill the gaps that remain in numeric columns with that column's median
for col in data_cleaned.select_dtypes(include=['number']).columns:
    if not data_cleaned[col].isnull().any():
        continue
    data_cleaned[col] = data_cleaned[col].fillna(data_cleaned[col].median())

data_cleaned

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0000,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...,...
413,0,3,male,27.0,0,0,8.0500,S
414,1,1,female,39.0,0,0,108.9000,C
415,0,3,male,38.5,0,0,7.2500,S
416,0,3,male,27.0,0,0,8.0500,S
