In [2]:
import pandas as pd

# Load the data from the provided CSV files
train_data = pd.read_csv('train.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

# Display the first few rows of each dataset
train_data.head() 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,29,1,2,130,204,0,2,202,0,0.0,1,0.0,3.0,0
1,57,1,3,150,126,1,0,173,0,0.2,1,1.0,7.0,0
2,69,1,1,160,234,1,2,131,0,0.1,2,1.0,3.0,0
3,65,0,3,160,360,0,2,151,0,0.8,1,0.0,3.0,0
4,52,1,4,108,233,1,0,147,0,0.1,1,3.0,7.0,0


In [3]:
validation_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,60,1,4,125,258,0,2,141,1,2.8,2,1.0,7.0,1
1,43,1,4,115,303,0,0,181,0,1.2,2,0.0,3.0,0
2,57,1,4,165,289,1,2,124,0,1.0,2,3.0,7.0,1
3,58,1,3,140,211,1,2,165,0,0.0,1,0.0,3.0,0
4,34,1,1,118,182,0,2,174,0,0.0,1,0.0,3.0,0


In [4]:
test_data.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,179,53,1,3,130,246,1,2,173,0,0.0,1,3.0,3.0
1,9,53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0
2,281,47,1,3,130,253,0,0,179,0,0.0,1,0.0,3.0
3,284,61,1,4,148,203,0,0,161,0,0.0,1,1.0,7.0
4,175,57,1,4,152,274,0,0,88,1,1.2,2,1.0,7.0


In [5]:
import sklearn

In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [23]:
X_train = train_data.drop(columns='label')
y_train = train_data['label']
X_val = validation_data.drop(columns='label')
y_val = validation_data['label']

from sklearn.impute import SimpleImputer

# Define a pipeline with imputation, scaling, and SVM
svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),  # Impute missing values with the mean
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='poly', degree=3, gamma='scale', C=1.0))
])

# Train the SVM model
svm_pipeline.fit(X_train, y_train)

# Predict on the validation dataset
val_predictions = svm_pipeline.predict(X_val)

# Calculate the accuracy on the validation set
validation_accuracy = accuracy_score(y_val, val_predictions)
print("accuracy", validation_accuracy)


accuracy 0.9666666666666667


In [24]:
# Prepare test data by dropping the 'id' column which is not needed for prediction
X_test = test_data.drop(columns='id')

# Predict on the test dataset using the trained SVM pipeline
test_predictions = svm_pipeline.predict(X_test)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],
    'label': test_predictions
})

# Save the submission file
submission_file_path = 'submission.csv'
submission.to_csv(submission_file_path, index=False)

submission_file_path, submission.head()

('submission.csv',
     id  label
 0  179      0
 1    9      1
 2  281      0
 3  284      1
 4  175      1)

In [25]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for C and gamma
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto']
}

# Setup GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Best parameters and best score from grid search
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params, best_score

Fitting 5 folds for each of 24 candidates, totalling 120 fits


({'svm__C': 1, 'svm__gamma': 0.1}, 0.8305272108843539)

In [26]:
# Update the SVM pipeline with the best parameters found from grid search
optimized_svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', gamma=best_params['svm__gamma'], C=best_params['svm__C']))
])

# Retrain the model with the optimized parameters
optimized_svm_pipeline.fit(X_train, y_train)

# Predict on the validation dataset
optimized_val_predictions = optimized_svm_pipeline.predict(X_val)

# Calculate the accuracy on the validation set
optimized_validation_accuracy = accuracy_score(y_val, optimized_val_predictions)

# Predict on the test dataset using the optimized SVM pipeline
optimized_test_predictions = optimized_svm_pipeline.predict(X_test)

# Prepare the optimized submission DataFrame
optimized_submission = pd.DataFrame({
    'id': test_data['id'],
    'label': optimized_test_predictions
})

# Save the optimized submission file
optimized_submission_file_path = 'hyper_submission.csv'
optimized_submission.to_csv(optimized_submission_file_path, index=False)

optimized_validation_accuracy, optimized_submission_file_path, optimized_submission.head()


(0.9333333333333333,
 'hyper_submission.csv',
     id  label
 0  179      0
 1    9      1
 2  281      0
 3  284      1
 4  175      1)

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Imputers and scalers setup
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# Columns for imputation and scaling
columns_to_impute = ['ca', 'thal']
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Impute missing values in train, validation, and test datasets
train_data[columns_to_impute] = imputer.fit_transform(train_data[columns_to_impute])
validation_data[columns_to_impute] = imputer.transform(validation_data[columns_to_impute])
test_data[columns_to_impute] = imputer.transform(test_data[columns_to_impute])

# Scale specified columns in train, validation, and test datasets
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])
validation_data[columns_to_scale] = scaler.transform(validation_data[columns_to_scale])
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Check if the transformations were applied correctly
train_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,-2.838504,1,2,-0.125982,-0.864142,0,2,2.31447,0,-0.873573,1,0.0,3.0,0
1,0.241352,1,3,0.974653,-2.483637,1,0,1.021242,0,-0.704854,1,1.0,7.0,0
2,1.561291,1,1,1.52497,-0.24126,1,2,-0.85171,0,-0.789214,2,1.0,3.0,0
3,1.121311,0,3,1.52497,2.374848,0,2,0.040172,0,-0.198698,1,0.0,3.0,0
4,-0.308622,1,4,-1.33668,-0.262023,1,0,-0.138205,0,-0.789214,1,3.0,7.0,0


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [29]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = rf_classifier.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)

validation_accuracy

0.8666666666666667

In [30]:
#Implement Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Impute missing values
train_data.fillna(train_data.median(), inplace=True)
validation_data.fillna(validation_data.median(), inplace=True)
test_data.fillna(test_data.median(), inplace=True)

# Scale numerical features
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])
validation_data[columns_to_scale] = scaler.transform(validation_data[columns_to_scale])
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

In [33]:
# Define logistic regression model
logistic_regression = LogisticRegression()

# Extract features and target variable
X_train = train_data.drop(columns='label')
y_train = train_data['label']
X_val = validation_data.drop(columns='label')
y_val = validation_data['label']

# Train logistic regression model
logistic_regression.fit(X_train, y_train)

# Predict on the validation set
val_predictions_lr = logistic_regression.predict(X_val)

# Calculate accuracy on the validation set
validation_accuracy_lr = accuracy_score(y_val, val_predictions_lr)
print("Validation Accuracy (Logistic Regression):", validation_accuracy_lr)


Validation Accuracy (Logistic Regression): 0.9


In [19]:
# Prepare the test data by dropping the 'id' column
X_test_lr = test_data.drop(columns='id')

# Predict on the test dataset using the trained logistic regression model
test_predictions_lr = logistic_regression.predict(X_test_lr)

# Prepare the submission DataFrame
submission_lr = pd.DataFrame({
    'id': test_data['id'],
    'label': test_predictions_lr
})

# Save the submission file
submission_file_path_lr = 'submission_lr.csv'
submission_lr.to_csv(submission_file_path_lr, index=False)

print("Submission file saved successfully:", submission_file_path_lr)


Submission file saved successfully: submission_lr.csv


In [34]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import accuracy_score

# Define Kernel Ridge Regression model
krr = KernelRidge(kernel='rbf', alpha=0.1)  # You can specify the desired kernel and regularization parameter alpha

# Train the model on the training data
krr.fit(X_train, y_train)

# Predict on the validation set
val_predictions_krr = krr.predict(X_val)
val_predictions_krr_binary = (val_predictions_krr > 0.5).astype(int)  # Convert to binary predictions (0 or 1)

# Calculate accuracy on the validation set
validation_accuracy_krr = accuracy_score(y_val, val_predictions_krr_binary)
print("Validation Accuracy (Kernel Ridge Regression):", validation_accuracy_krr)

# Predict on the test dataset using Kernel Ridge Regression
test_predictions_krr = krr.predict(X_test)
test_predictions_krr_binary = (test_predictions_krr > 0.5).astype(int)  # Convert to binary predictions (0 or 1)

# Prepare the submission DataFrame
submission_krr = pd.DataFrame({
    'id': test_data['id'],
    'label': test_predictions_krr_binary
})

# Save the submission file
submission_file_path_krr = 'submission_krr.csv'
submission_krr.to_csv(submission_file_path_krr, index=False)

print("Submission file saved successfully:", submission_file_path_krr)


Validation Accuracy (Kernel Ridge Regression): 0.8666666666666667
Submission file saved successfully: submission_krr.csv


In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical

# Load the data from the provided CSV files
train_data = pd.read_csv('train.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

# Combine train and validation data for preprocessing
combined_data = pd.concat([train_data, validation_data], axis=0)

# Encode categorical variables
encoder = LabelEncoder()
combined_data['sex'] = encoder.fit_transform(combined_data['sex'])
combined_data['cp'] = encoder.fit_transform(combined_data['cp'])
combined_data['fbs'] = encoder.fit_transform(combined_data['fbs'])
combined_data['restecg'] = encoder.fit_transform(combined_data['restecg'])
combined_data['exang'] = encoder.fit_transform(combined_data['exang'])
combined_data['slope'] = encoder.fit_transform(combined_data['slope'])
combined_data['thal'] = encoder.fit_transform(combined_data['thal'])

# Split features and target variable
X = combined_data.drop(columns=['label'])
y = combined_data['label']

# Normalize numerical features
X = (X - X.mean()) / X.std()

# Reshape the data for CNN input
X_cnn = X.values.reshape(X.shape[0], X.shape[1], 1)

# One-hot encode the target variable
y_categorized = to_categorical(y)

# Split the data into training and validation sets
X_train_cnn, X_val_cnn, y_train_cnn, y_val_cnn = train_test_split(X_cnn, y_categorized, test_size=0.2, random_state=42)

# Define the CNN architecture
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], X_train_cnn.shape[2])),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

optimizer = Adam(learning_rate=0.00000001)  # Try different learning rates, e.g., 0.001, 0.0001, etc.

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_cnn, y_train_cnn, epochs=30, batch_size=64, validation_data=(X_val_cnn, y_val_cnn))

# Evaluate the model on the validation set
val_loss, val_accuracy = model.evaluate(X_val_cnn, y_val_cnn)
print("Validation Accuracy:", val_accuracy)

# Prepare test data for prediction
test_data_cnn = test_data.drop(columns=['id'])
test_data_cnn = (test_data_cnn - X.mean()) / X.std()  # Normalize test data
X_test_cnn = test_data_cnn.values.reshape(test_data_cnn.shape[0], test_data_cnn.shape[1], 1)

# Make predictions on test data
test_predictions = model.predict(X_test_cnn)
predicted_labels = np.argmax(test_predictions, axis=1)

# Prepare submission DataFrame
submission = pd.DataFrame({'id': test_data['id'], 'label': predicted_labels})

# Save submission to CSV
submission.to_csv('cnn_submission.csv', index=False)

submission.head()


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5272 - loss: nan - val_accuracy: 0.4727 - val_loss: nan
Epoch 2/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5324 - loss: nan - val_accuracy: 0.4727 - val_loss: nan
Epoch 3/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5850 - loss: nan - val_accuracy: 0.4727 - val_loss: nan
Epoch 4/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5340 - loss: nan - val_accuracy: 0.4727 - val_loss: nan
Epoch 5/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5366 - loss: nan - val_accuracy: 0.4727 - val_loss: nan
Epoch 6/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5678 - loss: nan - val_accuracy: 0.4727 - val_loss: nan
Epoch 7/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.57

Unnamed: 0,id,label
0,179,0
1,9,0
2,281,0
3,284,0
4,175,0
