In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

### Cleaning Train Data

In [None]:

# Load the raw training set from disk.
data = pd.read_csv("data/train.csv")
# Per-column count of missing values (shown as the cell output).
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Encode the 'Sex' column as integer labels in a new 'Sex_encoded' column.
# The fitted encoder is kept in `le` so the label mapping stays available.
le = LabelEncoder().fit(data['Sex'])
data['Sex_encoded'] = le.transform(data['Sex'])
print("Available columns:", list(data.columns))

Available columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_encoded']


In [8]:
# Clean the Name column: strip honorifics (Mr., Mrs., etc.) and stray symbols; target format is "FirstName LastName".
def clean_name(df, colname='Name'):
    """Split a "LastName, Honorific First ..." name column into parts.

    Works on a copy of ``df`` (the input frame is not modified) and adds
    four columns:

    * ``LastName``  - text before the first comma, stripped.
    * ``Honorific`` - first word ending in a dot after the comma (e.g. "Mr.",
      "Countess."); falls back to the first word when no dotted word exists.
    * ``FirstName`` - text inside the first pair of parentheses if present,
      otherwise the second word after the comma.
    * ``Title``     - "FirstName LastName" with whitespace collapsed. Missing
      parts are left out instead of appearing as the literal text "nan".

    NOTE(review): a later cell in this notebook defines another
    ``clean_name(name)`` that takes a single string; whichever definition
    ran last is the one bound in the kernel.

    Args:
        df: DataFrame containing the name column.
        colname: Name of the column holding the raw names.

    Returns:
        A new DataFrame with the four columns above added.
    """
    df = df.copy()

    # Split on commas once and reuse the pieces for both halves of the name.
    pieces = df[colname].str.split(',')

    # Last name: the segment before the first comma.
    df['LastName'] = pieces.str[0].str.strip()

    # The segment following the first comma (honorific + given names).
    rest = pieces.str[1].str.strip()
    words = rest.str.split()

    # Honorific: prefer the first dotted word so "the Countess. of ..." gives
    # "Countess." rather than "the"; otherwise keep the first word.
    dotted = rest.str.extract(r'([A-Za-z]+\.)', expand=False)
    df['Honorific'] = dotted.fillna(words.str[0])

    # First name: text in parentheses when present, else the word after the
    # leading (title) word.
    in_parens = rest.str.extract(r'\((.*?)\)', expand=False)
    df['FirstName'] = in_parens.fillna(words.str[1])

    def _squash(value):
        # Missing parts contribute nothing; str(NaN) would otherwise leak "nan".
        if pd.isna(value):
            return ''
        return ' '.join(str(value).split())

    # Title: "FirstName LastName", trimmed so a missing half leaves no stray space.
    df['Title'] = (
        df['FirstName'].map(_squash) + ' ' + df['LastName'].map(_squash)
    ).str.strip()

    return df
# data = clean_name(data, 'Name')
# print(data['Title'].head())
# data.to_csv('cleaned_data.csv', index=False)


In [None]:
# Impute missing ages with the median age of passengers sharing the same sex and class.
data['Age'] = (
    data.groupby(['Sex', 'Pclass'])['Age']
        .transform(lambda ages: ages.fillna(ages.median()))
)

# Family size: parents/children (Parch) plus siblings/spouses (SibSp).
data['Family'] = data['Parch'].add(data['SibSp'])

# Missing embarkation ports are filled with 'S'.
data['Embarked'] = data['Embarked'].fillna('S')

# Binary flag: 1 when a cabin value is recorded, 0 when it is missing.
data['Has_Cabin'] = data['Cabin'].notnull().astype(int)

# Ticket Coln
# First checking how many rows are alnum and num
import re
def count_patterns(data):
    """Count values that mix letters and digits, and values that are digits only.

    Each value is converted with ``str()`` before matching. Values that fit
    neither category (for example, letters only) are not counted at all.
    The input is traversed exactly once, so one-shot iterators and generators
    are counted correctly as well as lists and Series.

    Args:
        data: Iterable of values to classify.

    Returns:
        Tuple ``(char_int, int_only)``: the number of values containing at
        least one ASCII letter and at least one digit, and the number of
        values consisting solely of digits.
    """
    has_letter_and_digit = re.compile(r'(?=.*[a-zA-Z])(?=.*\d)')
    digits_only = re.compile(r'\d+')

    char_int = 0
    int_only = 0
    for value in data:
        text = str(value)
        if has_letter_and_digit.search(text):
            char_int += 1
        if digits_only.fullmatch(text):
            int_only += 1
    return char_int, int_only
# Tally ticket formats: values mixing letters and digits vs. digits-only values.
char_int, int_only = count_patterns(data['Ticket'])
# print(int_only) # 661
# print(char_int) # 226

# Numeric ticket number: take the LAST run of digits, so a prefixed ticket such
# as "A/5 21171" yields 21171 rather than the 5 that belongs to its prefix.
# Tickets with no digits at all produce NaN here and are filled below.
data['Ticket_Number'] = (
    data['Ticket'].str.extract(r'(\d+)\D*$', expand=False).astype(float)
)

# Drop columns
data = data.drop(columns=['Cabin'])

# Tickets without any digits get 0 as their number.
data['Ticket_Number'] = data['Ticket_Number'].fillna(0)


In [None]:
# Saving cleaned data
#data.to_csv('titanic_cleaned.csv', index=False)  # Excludes row numbers

### Cleaning Test Data

In [6]:

# Importing Libraries
import pandas as pd
import numpy as np
import math
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [14]:
# Read the raw test set from disk; the cleaning steps below modify `df`.
df = pd.read_csv("data/test.csv")

### 1️⃣ Clean `Name`
def clean_name(name):
    """Reduce a raw passenger name to its first and last remaining words.

    Steps:
      1. Remove every word that is immediately followed by a dot (honorifics
         such as "Mr." / "Mrs.", but also initials like "J.").
      2. Remove commas, brackets, braces, quotes and backticks.
      3. Keep only the first and the last whitespace-separated word.

    For input shaped like "Kelly, Mr. James" the result is "Kelly James",
    i.e. the word before the comma comes first.

    NOTE(review): an earlier cell defines ``clean_name(df, colname)`` with a
    different signature; running this cell rebinds the name to this version.

    Args:
        name: Raw name string, or a missing value (None / NaN).

    Returns:
        The cleaned name. Missing values are returned unchanged. When fewer
        than two words remain, the remaining text is returned with
        surrounding whitespace removed.
    """
    if pd.isnull(name):
        return name
    # Remove honorifics: remove words with dot (like Mr., Mrs., etc.)
    name = re.sub(r'\b\w+\.\s*', '', name)
    # Remove symbols: commas, brackets, quotes, etc.
    name = re.sub(r'[,()\[\]{}"\'`]', '', name)
    # Keep only first and last word
    parts = name.split()
    if len(parts) >= 2:
        return parts[0] + ' ' + parts[-1]
    # Zero or one word left: return it trimmed rather than with stray spaces.
    return ' '.join(parts)

# Run the name cleaner over every passenger name.
df['Name'] = df['Name'].apply(clean_name)

# Encode gender numerically: male -> 1, female -> 0 (other values become NaN).
df['Sex'] = df['Sex'].map(dict(male=1, female=0))

### 3️⃣ Clean `Age`
def clean_age(x):
    """Round an age to a whole number, with fractions of 0.5 or more going up.

    Args:
        x: Age value, possibly missing.

    Returns:
        ``np.nan`` when ``x`` is missing; otherwise ``math.ceil(x)`` if the
        part of ``x`` beyond ``int(x)`` is at least 0.5, else ``math.floor(x)``.
    """
    if pd.isnull(x):
        return np.nan
    fractional_part = x - int(x)
    if fractional_part >= 0.5:
        return math.ceil(x)
    return math.floor(x)

# Round every known age to a whole number (missing ages stay NaN for now).
df['Age'] = df['Age'].apply(clean_age)

# Fill the remaining gaps with the rounded mean of the known ages, then store as int.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(round(mean_age)).astype(int)

# Family size: siblings/spouses plus parents/children.
df['Family'] = df['SibSp'].add(df['Parch'])

# The two source columns are dropped once Family has been computed.
df = df.drop(['SibSp', 'Parch'], axis=1)

### 5️⃣ Ticket: keep fully numeric only
def clean_ticket(ticket):
    """Return the numeric part of a ticket as a string of digits.

    Only the last run of digits is kept, so digits that belong to a prefix
    are not merged into the number: "A/5 21171" gives "21171", not "521171".

    Args:
        ticket: Raw ticket value, or a missing value (None / NaN).

    Returns:
        The last digit run of ``str(ticket)``, or ``''`` when the value is
        missing or contains no digits.
    """
    if pd.isnull(ticket):
        return ''
    digit_runs = re.findall(r'\d+', str(ticket))
    # Prefixes such as "A/5" or "STON/O2." carry their own digits; the ticket
    # number itself is the final run.
    return digit_runs[-1] if digit_runs else ''

# Reduce each ticket to its digits.
df['Ticket'] = df['Ticket'].apply(clean_ticket)

# Cabin -> has_cabin: 1 when a cabin value is present, 0 when missing.
# pop() removes the Cabin column from the frame in the same step.
df['has_cabin'] = df.pop('Cabin').notna().astype(int)

# Fare: missing values are filled with 0.
df['Fare'] = df['Fare'].fillna(0)

# Encode Embarked as S -> 0, C -> 1, Q -> 2 (other or missing values become NaN).
df['Embarked'] = df['Embarked'].map(dict(S=0, C=1, Q=2))

# Save the cleaned dataset without the index column, then preview the first rows.
df.to_csv("test_cleaned.csv", index=False)
df.head()
#print("🎯 Cleaning completed. Saved as: test_cleaned.csv")


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,Family,has_cabin
0,892,3,Kelly James,1,35,330911,7.8292,2,0,0
1,893,3,Wilkes Needs,0,47,363272,7.0,0,1,0
2,894,2,Myles Francis,1,62,240276,9.6875,2,0,0
3,895,3,Wirz Albert,1,27,315154,8.6625,0,0,0
4,896,3,Hirvonen Lindqvist,0,22,3101298,12.2875,0,2,0
