## Titanic

In [None]:
import pandas as pd

# Author's note on the intended split: train 70% / test 30%.
# NOTE(review): the train_test_split call later in this notebook uses test_size=0.33 — confirm which is intended.

# Load the CSV from a path relative to the notebook's working directory.
data = pd.read_csv('./04/train.csv')
# Preview the first rows.
data.head()

In [None]:
# Print the (rows, columns) shape, then display summary statistics for the frame.
print(data.shape)
data.describe()

In [None]:
# Preview the first rows again.
data.head()

In [None]:
# Missing ages are replaced with the median of the recorded ages;
# info() then lists each column's non-null count and dtype.
median_age = data['Age'].median()
data['Age'] = data['Age'].fillna(median_age)
data.info()

In [None]:
# Number of survivors per sex (single .loc lookup instead of chained indexing).
data.loc[data["Survived"] == 1, "Sex"].value_counts()

In [None]:
# Passenger-class counts, computed separately for survivors and non-survivors.
survived_mask = data["Survived"] == 1
died_mask = data["Survived"] == 0
survived_by_class = data.loc[survived_mask, "Pclass"].value_counts()
dead_by_class = data.loc[died_mask, "Pclass"].value_counts()

# One row per outcome, one column per class, drawn as a stacked bar chart.
df_by_class = pd.DataFrame([survived_by_class, dead_by_class])
df_by_class.index = ["Survived", "Died"]
df_by_class.plot.bar(stacked=True, title="Survived/Died by Class")

In [None]:
# Sex counts, computed separately for survivors and non-survivors.
survived_mask = data["Survived"] == 1
died_mask = data["Survived"] == 0
survived_by_sex = data.loc[survived_mask, "Sex"].value_counts()
dead_by_sex = data.loc[died_mask, "Sex"].value_counts()

# One row per outcome, one column per sex, drawn as a stacked bar chart.
df_by_sex = pd.DataFrame([survived_by_sex, dead_by_sex])
df_by_sex.index = ["Survived", "Died"]
df_by_sex.plot.bar(stacked=True, title="Survived/Died by Sex")

In [None]:
# Survival rate per sex. "Survived" is compared against 0 and 1 elsewhere in
# this notebook, so its mean is the fraction of survivors in each group.
# rate_women / rate_men remain fractions in [0, 1]; only the printed value is
# scaled to a percentage so that it matches the "%" in the label.
women = data.loc[data.Sex == 'female', "Survived"]
rate_women = women.mean()

print("% of women who survived:", round(100 * rate_women, 1))

men = data.loc[data.Sex == 'male', "Survived"]
rate_men = men.mean()

print("% of men who survived:", round(100 * rate_men, 1))

In [None]:
# Embarkation-port counts, computed separately for survivors and non-survivors.
survived_by_embarked = data.loc[data["Survived"] == 1, "Embarked"].value_counts()
dead_by_embarked = data.loc[data["Survived"] == 0, "Embarked"].value_counts()

# Stored under its own name so the per-class table (df_by_class) is not
# overwritten by the embarkation table.
df_by_embarked = pd.DataFrame([survived_by_embarked, dead_by_embarked])
df_by_embarked.index = ["Survived", "Died"]
df_by_embarked.plot(kind="bar", stacked=True, title="Survived/Died by Embarked")

In [None]:
# How often each age value occurs among survivors; info() prints a summary of that Series.
survived_by_age = data.loc[data["Survived"] == 1, "Age"].value_counts()
survived_by_age.info()

# Same tally for non-survivors; the cell displays its first rows.
dead_by_age = data.loc[data["Survived"] == 0, "Age"].value_counts()
dead_by_age.head()

In [None]:
# Not the last expression of the cell, so this preview is not displayed.
data.head()

# Survived => target (the value to predict)
# Input => Sex, Age, Pclass, Embarked (Survived is the target, per the line above)
# Columns to delete: PassengerId, Cabin, Name, SibSp, Parch, Fare
# NOTE(review): the drop call in the cleaning cell also removes Ticket and Age — confirm the intended feature set.

# Author's note: 891 values, Age => 88 distinct values
data['Age'].nunique()
# Author's note: 891 values, Pclass => 3 distinct values
data['Pclass'].nunique()
# Only this final expression is displayed by the cell.
data['Fare'].describe()


# cleaning

In [None]:
# Columns removed before modelling:
# PassengerId, Cabin, Name, SibSp, Parch, Fare, Ticket, Age
unused_columns = ['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Fare', 'Ticket', 'Age']
data.drop(columns=unused_columns, inplace=True)
data

In [None]:
# data = data.dropna(subset=["Age"])

# Target (Y) and features (X)

In [None]:
# Separate the label column from the predictor columns.
Y = data["Survived"]
X = data.drop(columns=["Survived"])

In [None]:
# Preview the first target values.
Y.head()

In [None]:
# Preview the first feature rows.
X.head()

In [None]:
!pip install scikit-learn

In [None]:
# Replace the "Sex" strings with integer labels.

from sklearn.preprocessing import LabelEncoder

label_encoder_x = LabelEncoder()
X["Sex"] = label_encoder_x.fit_transform(X["Sex"])
X.head()

In [None]:
# Count the rows whose Embarked value is missing.
X["Embarked"].isna().sum()

In [None]:
# Rows with no Embarked value are assigned "S".
row_with_embarked_null = X["Embarked"].isna()
X["Embarked"] = X["Embarked"].mask(row_with_embarked_null, "S")

In [None]:
# Replace the "Embarked" strings with integer labels.
# A fresh encoder is bound to label_encoder_x, replacing whatever it held before.

from sklearn.preprocessing import LabelEncoder

label_encoder_x = LabelEncoder()
X["Embarked"] = label_encoder_x.fit_transform(X["Embarked"])
X.head()

# Model 1

In [None]:
# Logistic regression on X / Y, scored with k-fold cross-validation.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

classifier = LogisticRegression(penalty="l2", random_state=0)

# cv=10 yields one score per fold; the mean and spread are reported below.
accuracies = cross_val_score(classifier, X, Y, cv=10)

print("Accuracy:", accuracies.mean())
print("+/-", accuracies.std())

## Model 2

In [None]:
import pandas as pd

# Re-read the CSV so the second model starts from the unmodified file;
# this rebinds `data`, discarding the frame cleaned for the first model.
data = pd.read_csv('./04/train.csv')
# Preview the first rows.
data.head()

In [None]:
# List each column's non-null count and dtype.
data.info()

In [None]:
# Keep only the rows that have an Age value, then re-check the column summary.
has_age = data["Age"].notna()
data = data[has_age]
data.info()

In [None]:
def get_age_range(age):
    """Return the age-band label for a numeric age.

    Bands (upper bounds are exclusive):
        age < 3   -> "baby"
        age < 12  -> "child"
        age < 30  -> "young"
        age < 60  -> "adult"
        otherwise -> "old"

    A value for which every ``<`` comparison is false (for example NaN)
    ends up in the final band, "old".
    """
    upper_bounds = (
        (3, "baby"),
        (12, "child"),
        (30, "young"),
        (60, "adult"),
    )
    # Bounds are ascending, so the first one the age falls under decides the label.
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return "old"

# Label every row with its age band.
# assign() returns a new frame, so nothing is written into the filtered result
# of the earlier dropna(), which pandas may flag with SettingWithCopyWarning.
data = data.assign(age_range=data["Age"].apply(get_age_range))
data.info()

In [None]:
# Preview the first rows, now including age_range.
data.head()

In [None]:
# Keep only rows with a recorded Cabin. The cabin_type step takes the first
# character of str(Cabin), so a missing value would otherwise yield "n" (from "nan").
# NOTE(review): this removes every passenger without a Cabin value — check the
# remaining row count in the info() output before trusting the model.
data = data.dropna(subset=["Cabin"])
data.info()

In [None]:
# The first character of the Cabin value becomes the cabin type.
# assign() returns a new frame, so nothing is written into the filtered result
# of the earlier dropna(), which pandas may flag with SettingWithCopyWarning.
data = data.assign(cabin_type=data["Cabin"].astype(str).str[0])
data.info()

In [None]:
# Preview the first rows, now including cabin_type.
data.head()

In [None]:
# Columns not used by the second model; Age and Cabin are represented by the
# derived age_range and cabin_type columns.
unused_columns = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'SibSp', 'Parch']
data.drop(columns=unused_columns, inplace=True)


In [None]:
# Preview the remaining columns.
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Same imputation as the first model: rows with no Embarked value get "S", so a
# missing value is not passed to the encoder.
# assign() also yields a fresh frame, so the column writes below do not touch a
# filtered view.
data = data.assign(Embarked=data["Embarked"].fillna("S"))

# Every remaining text column is turned into integer labels. cabin_type is
# included: left as strings it would reach LogisticRegression.fit as a
# non-numeric feature.
label_encoder_x = LabelEncoder()
for column in ["Sex", "Embarked", "age_range", "cabin_type"]:
    # fit_transform refits the encoder on each column in turn.
    data[column] = label_encoder_x.fit_transform(data[column])
data.head()


In [None]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the label.
features = data.drop(columns=["Survived"])
target = data["Survived"]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Fit on the training split (fit() returns the estimator itself), then predict
# the held-out split.
reg_log_model = LogisticRegression().fit(x_train, y_train)
reg_log_model_predictions = reg_log_model.predict(x_test)

# Rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_test, reg_log_model_predictions))

In [None]:
# Print the text classification report for the held-out predictions.
print(classification_report(y_test, reg_log_model_predictions))