In [54]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit,KFold
from sklearn.metrics import accuracy_score,fbeta_score,f1_score
import numpy as np
import matplotlib.pyplot as plt
import re

ADULT INCOME

In [46]:
column_names = [
    "age", # continuous
    "workclass", # Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
    "fnlwgt", # continuous 
    "education", # Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
    "education-num", # continuous. 
    "marital-status", # Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
    "occupation", # Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
    "relationship", # Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried 
    "race", # White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black, 
    "sex", # Female, Male, 
    "capital-gain", # continuous, 
    "capital-loss", # continuous, 
    "hours-per-week", # continuous, 
    "native-country", # United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands 
    "target" # >50K, <=50K
]
df = pd.read_csv(filepath_or_buffer="adult.data.txt", sep=',', names=column_names, skipinitialspace=True)

In [47]:
print(f"Shape: {df.shape}")
print("")
print("Missing values")
print("--------------")
for col in column_names:
    if df[col].dtype == object:
        missing_count = (df[col] == "?").sum()
        print(f"{col}:{missing_count}")

Shape: (32561, 15)

Missing values
--------------
workclass:1836
education:0
marital-status:0
occupation:1843
relationship:0
race:0
sex:0
native-country:583
target:0


In [48]:
df["target"] = df["target"].map({"<=50K": 0, ">50K": 1})
df["workclass"] = df["workclass"].replace(to_replace='?', value=np.nan)
df["occupation"] = df["occupation"].replace(to_replace='?', value=np.nan)
df["native-country"] = df["native-country"].replace(to_replace='?', value=np.nan)
df = df.dropna()

In [49]:
df.to_csv(path_or_buf='adult_cleaned.csv', index=False)

TITANIC CLEANING

In [51]:
train_df = pd.read_csv(filepath_or_buffer="titanic_train.csv")
test_df = pd.read_csv(filepath_or_buffer="titanic_test.csv")

In [66]:
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("")
print("info")
print(train_df.info())

Train shape: (891, 16)
Test shape: (418, 15)

info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
Has_Cabin      891 non-null int64
FamilySize     891 non-null int64
IsAlone        891 non-null int64
Title          891 non-null object
dtypes: float64(1), int64(10), object(5)
memory usage: 111.5+ KB
None


In [52]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

def clean(dataset):
    original_dataset = dataset.copy()
    # Cabin is NaN
    dataset['Has_Cabin'] = dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
    dataset['FamilySize'] = dataset['SibSp'] + train_df['Parch'] + 1
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Fare'] = dataset['Fare'].fillna(train_df['Fare'].median())
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

    dataset['Title'] = dataset['Name'].apply(get_title)
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    
    drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    return dataset.drop(drop_columns, axis = 1)

In [55]:
cleaned_train = clean(train_df)
cleaned_test = clean(test_df)

In [56]:
cleaned_train.to_csv(path_or_buf='titanic_train_cleaned.csv', index=False)
cleaned_test.to_csv(path_or_buf='titanic_test_cleaned.csv', index=False)