**Step 1) Importing and Instantiating Classes**

In [1]:
# pip install sweetviz

In [2]:
from utils import open_csv_file, analyze_dataframe, create_report

from correlation_matrix import Correlation
from data import DataFrame
from machine_learning import MachineLearning
from sklearn import metrics

import numpy as np

from graph import Graph
import seaborn as sns

# Read the training CSV and wrap it in the project's DataFrame helper.
# `data_frame_instance` is the object every later cell operates on.
training_data = open_csv_file('train.csv')
data_frame_instance = DataFrame(training_data)

# Summarise the frame (the recorded output is a dict of categorical/numerical
# column counts and names) and print it under its heading, one item per line.
analysis_result = analyze_dataframe(data_frame_instance)
print("Analysis Result:", analysis_result, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


Analysis Result:
{'total_categorical_columns': 3, 'total_numerical_columns': 7, 'categorical_column_names': ['Name', 'Sex', 'Ticket'], 'numerical_column_names': ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']}


**Step 2) Inspecting and Cleaning Data**

In [3]:
# Optional inspection helpers (currently disabled):
# Check missing data
# print(data_frame_instance.check_missing_data())
# Check data types
# print(data_frame_instance.check_data_type())

# Replace missing values in the 'Age' column with the median.
# NOTE(review): the method is defined in data.py (not visible here); its return
# value is ignored, so it presumably updates data_frame_instance in place - confirm.
data_frame_instance.replace_missing_value_with_median('Age')
# print(data_frame_instance.get_df())

**Step 3) Transforming Data**

In [4]:
# Create column for last name, derived from the 'Name' column.
# The label-encoding and column-removal cells refer to the new column as
# 'Last_name' (capital L), so the debug line below uses that spelling too.
data_frame_instance.create_last_name_column('Name')
# print(data_frame_instance.get_df()['Last_name'])

In [5]:
# Label encoding of the last-name column; later cells read the result from
# 'Last_name_encoded'.
data_frame_instance.label_encoding('Last_name')
# print(data_frame_instance.get_df())

# One-hot encoding, applied to 'Embarked' first and then 'Sex'.
for categorical_column in ('Embarked', 'Sex'):
    data_frame_instance.hot_encoding(categorical_column)
# Columns expected afterwards, per the debug lines kept below:
# print(data_frame_instance.get_df()['Embarked_Q'])
# print(data_frame_instance.get_df()['Embarked_S'])
# print(data_frame_instance.get_df()['Embarked_C'])
# print(data_frame_instance.get_df()['Sex_male'])
# print(data_frame_instance.get_df()['Sex_female'])

In [6]:
# graph_instance = Graph(sns)
# graph_instance.bar_graph(data_frame_instance.get_df()['Last_name_encoded'])

In [7]:
# print(data_frame_instance.get_df()['Last_name_encoded'])

In [8]:
# Min-max scale the label-encoded last name.
# The debug line below indicates the scaled values are stored in a separate
# 'Last_name_encoded_scaled' column - presumably leaving 'Last_name_encoded'
# untouched (confirm in data.py).
data_frame_instance.min_max_scaler('Last_name_encoded')
# print(data_frame_instance.get_df()['Last_name_encoded_scaled'])

In [9]:
# Show the bin edges for 'Fare' under the 'quantile' strategy. This must stay
# the last expression of the cell so the notebook displays the returned array;
# the recorded output lists six edges (five bins) from 0 to 512.3292.
data_frame_instance.get_bin_edges('Fare','quantile')

array([array([  0.    ,   7.8542,  10.5   ,  21.6792,  39.6875, 512.3292])],
      dtype=object)

In [10]:
# Bucket the two numerical columns by quantile: 'Age' first, then 'Fare'.
# The raw 'Age' and 'Fare' columns are removed in the next cell.
# NOTE(review): the second call is the cell's last expression, so whatever it
# returns is what the notebook would display - keep it as a bare expression.
data_frame_instance.bucket_quantile('Age')
data_frame_instance.bucket_quantile('Fare')



In [11]:
# Columns dropped before modelling:
#   - 'Age', 'Fare'        : bucketed by quantile in the previous cell
#   - 'Name', 'Last_name'  : the last name was label-encoded earlier
#   - 'PassengerId', 'Cabin', 'Ticket' : not used by any later cell shown here
columns_to_drop = ['PassengerId', 'Name', 'Cabin', 'Age', 'Fare', 'Ticket', 'Last_name']
data_frame_instance.remove_columns(columns_to_drop)

In [12]:
# print(data_frame_instance.get_df())

**Step 4) Feature Selection**

In [13]:
# ## feature selection
# corr_mat_type_two = data_frame_instance.get_df().corr().abs()
# ## build a boolean matrix of ones with the same shape as the correlation matrix and keep its upper triangle as the mask
# mask = np.triu(np.ones_like(corr_mat_type_two, dtype=bool))
# mask_df = corr_mat_type_two.mask(mask)
# mask_df

In [14]:
# correlation_instance = Correlation(data_frame_instance.get_df())
# correlation_instance.create_correlation_matrix()

In [15]:
# correlation_instance.filter_most_correlated_features()

**Step 5) Machine Learning and Evaluation**

In [16]:
# --- Assemble model inputs ---------------------------------------------------
# The split is positional, so it depends on the column order left by the
# cleaning/transforming cells.
# NOTE(review): column 0 is expected to be 'Survived' now that 'PassengerId'
# has been removed, and columns 1..12 are expected to be all remaining
# features - confirm; selecting by column name would be more robust.
TARGET_POSITION = 0
FEATURE_POSITIONS = slice(1, 13)  # same columns as the literal slice 1:13

# Fetch the frame once and slice it for both features and target.
model_frame = data_frame_instance.get_df()
data_to_use = model_frame.iloc[:, FEATURE_POSITIONS]
data_to_target = model_frame.iloc[:, TARGET_POSITION]

ml = MachineLearning(data_to_use, data_to_target)

# Split the data
# NOTE(review): split_data() is defined in machine_learning.py; if it shuffles
# without a fixed seed, the figures printed below will change between runs.
ml.split_data()

# Fit the KNN classifier
ml.fit_knn_classification()

# Not used in this cell; kept as a notebook-level name in case other cells
# rely on it.
X_test = ml.X_test

# Predictions from the fitted KNN model; they are compared with ml.y_test
# below, so they correspond to the test split.
y_pred_knn = ml.predict_model(ml.knn_model)

# Populates the ml.*_train attributes printed below.
ml.evaluate_model(y_pred_knn)

# MAE/MSE/RMSE/R2 are regression-style metrics applied to class labels. In the
# recorded run MAE == MSE == 1 - accuracy, so despite the `_train` suffix these
# values appear to describe the test-split predictions.
print("Mean Absolute Error (MAE):", ml.mae_train)
print("Root Mean Squared Error (RMSE):", ml.rmse_train)
print("R2 Score:", ml.r2_train)
print("Mean Squared Error (MSE):", ml.mse_train)
print('Accuracy', metrics.accuracy_score(ml.y_test, y_pred_knn))



Mean Absolute Error (MAE): 0.208955223880597
Root Mean Squared Error (RMSE): 0.45711620391383745
R2 Score: 0.12007504690431525
Mean Squared Error (MSE): 0.208955223880597
Accuracy 0.7910447761194029


