In [None]:
# SQL interface | Advantage: the ETL can be written in SQL instead of pandas
# https://colab.research.google.com/drive/1BNRu6ZS0HAlg4c2ls-UfGGkKD8bLAZ0T?usp=sharing

In [None]:
import openml
import pandas as pd
import sqlite3

In [None]:
# Fetch the Titanic dataset from OpenML (dataset id 40945).
titanic_data = openml.datasets.get_dataset(40945)

# Separate the feature table (X) from the target (y); the two remaining
# return values are not needed here and are discarded.
X, y, _, _ = titanic_data.get_data(
    target=titanic_data.default_target_attribute
)

# Collect every feature name listed in the dataset metadata except the target's.
attribute_names = titanic_data.features.values()
feature_names = [
    feature.name
    for feature in attribute_names
    if feature.name != titanic_data.default_target_attribute
]

# Assemble one frame holding the features plus the target as 'Survived'.
titanic_df = pd.DataFrame(X, columns=feature_names)
titanic_df['Survived'] = y

In [None]:
# Preview the first three rows of the assembled frame.
titanic_df.head(n=3)

In [None]:
# Open a SQLite database that lives only in memory.
conn = sqlite3.connect(':memory:')

# Load the frame into a table named 'titanic', replacing any existing table
# of that name and leaving the DataFrame index out of the table.
titanic_df.to_sql(name='titanic', con=conn, if_exists='replace', index=False)

In [None]:
# Read the first ten rows back from the SQLite table; the result is a pandas DataFrame.
query = 'SELECT * FROM titanic LIMIT 10;'
pd.read_sql_query(sql=query, con=conn)

In [None]:
# Show the column labels of the assembled frame (the feature columns plus 'Survived').
titanic_df.columns

In [None]:
# PIPELINE (automated ETL) | NOTE: numerical and categorical columns go through different steps
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical columns: fill missing values with the column median, then standardise.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the constant 'missing', then
# one-hot encode; categories not seen while fitting are ignored at transform time.
categorical_features = ['pclass', 'sex', 'embarked']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing steps and transform the features in one call.
# NOTE(review): this fits the imputers, scaler and encoder on every row of X,
# i.e. before any train/test split has been made.
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the RAW features first. Splitting data that was already preprocessed
# on the full dataset would let test-set statistics (imputation medians,
# scaling mean/std, one-hot categories) leak into training and make the
# reported test accuracy optimistic.
# test_size and random_state are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Learn the preprocessing parameters from the training rows only, then apply
# those same parameters to the held-out rows (transform, not fit_transform).
# This refits `preprocessor`, so later calls to preprocessor.transform(...)
# also use training-only statistics.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Logistic-regression classifier trained on the preprocessed training rows.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out rows with the fitted classifier.
y_pred = model.predict(X_test)

# Accuracy is the fraction of held-out passengers whose survived/perished label
# was predicted correctly (for example, 0.77 means 77% of them).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy: {accuracy:.2f}")

In [None]:
# Disabled (commented-out) interactive demo: ipywidgets controls feed a single
# passenger's pclass/sex/age/fare/embarked through `preprocessor` and `model`
# and print the predicted outcome. Uncomment the whole cell to try it.
# NOTE(review): the `prediction[0] == 1` check assumes integer class labels;
# confirm the dtype of `y` before enabling.
# import ipywidgets as widgets
# from IPython.display import display

# def predict_survival(pclass, sex, age, fare, embarked):
#     # Create a DataFrame for the input
#     input_data = pd.DataFrame(
#         [[pclass, sex, age, fare, embarked]],
#         columns=['pclass', 'sex', 'age', 'fare', 'embarked']
#     )
    
#     # Preprocess the input
#     input_preprocessed = preprocessor.transform(input_data)
    
#     # Make a prediction
#     prediction = model.predict(input_preprocessed)
    
#     if prediction[0] == 1:
#         print("The passenger is predicted to survive.")
#     else:
#         print("The passenger is predicted to perish.")

# # Create widgets for user input
# pclass_widget = widgets.IntSlider(min=1, max=3, step=1, value=1, description='Pclass:')
# sex_widget = widgets.Dropdown(options=['male', 'female'], value='male', description='Sex:')
# age_widget = widgets.FloatSlider(min=0, max=100, step=1, value=30, description='Age:')
# fare_widget = widgets.FloatSlider(min=0, max=600, step=1, value=50, description='Fare:')
# embarked_widget = widgets.Dropdown(options=['C', 'Q', 'S'], value='S', description='Embarked:')

# # Display the widgets and bind them to the predict_survival function
# widgets.interact(predict_survival, pclass=pclass_widget, sex=sex_widget, age=age_widget, fare=fare_widget, embarked=embarked_widget)

In [None]:
# https://colab.research.google.com/drive/1ONMhTMJU2M2FaHWtppSWwqPAqQJNMTdw#scrollTo=o9uPme0QdZGk

# What is the difference between fit() and transform()?

In scikit-learn, fit() and transform() are two separate methods of a transformer, for example a scaler that centers and scales the features of a dataset [0]. The fit() method calculates the required parameters (e.g. mean and standard deviation) from the dataset and stores them on the transformer object [2]. The transform() method, on the other hand, applies these stored parameters to a dataset to produce the transformed data [0].

The fit_transform() method combines fit() and transform() on the same dataset. It is used for the initial fitting of parameters on the training set, while also returning the transformed training data. Internally, the transformer object first calls fit() and then transform() on the same data [2]. For a scaler, for example, this learns the scaling parameters and returns the scaled features in a single call [0].

The fit() method is used for generating learning model parameters from training data [0]. It is used to compute the mean and standard deviation for a given feature to be used further for scaling [3]. When applying the fit() method to a dataset, it learns from the data and calculates the required parameters [1].

The transform() method, on the other hand, applies the learned parameters to a dataset to generate the transformed dataset [0]. For a scaler, it performs the scaling using the mean and standard deviation calculated by the fit() method [3]. The transform() method is normally used on the test data and on unseen data in general [1].

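It is important to note that fit_transform() should only be applied to the training data, never to the test data. This is because the test data must be transformed with the same parameters (e.g. mean and standard deviation) that were calculated from the training set [0]. Using fit_transform() on the test data is a common beginner mistake [1]. For the same reason, split the data before fitting: fitting a transformer on the full dataset and splitting afterwards leaks test-set statistics into training.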

In summary, the main difference between fit() and transform() in scikit-learn is that fit() learns parameters from the training data, while transform() applies those learned parameters to a dataset to generate the transformed dataset. fit_transform() combines both steps on the same dataset; it is a convenience for the training data, for example to fit a scaler and scale the features in a single call.