In [36]:
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

base_url = "https://raw.githubusercontent.com/gayatrisreeraj/mnist-digit-recognizer/main/"

testing_csv = "testing%20dataset.csv"
training_csv = "training%20dataset.csv"


**TRAINING DATA**

In [37]:
# Load the training data
train_data = pd.read_csv(base_url + training_csv)

# Assign column names to the training dataset
train_columns = ['ID'] + [f'pixel_{i}_{j}' for i in range(8) for j in range(8)] + ['Category']
train_data.columns = train_columns
train_data.head()

Unnamed: 0,ID,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,Category
0,2,0,0,0,10,12,3,0,0,0,...,0,0,0,2,10,14,13,4,0,8
1,3,0,0,3,10,15,8,0,0,0,...,0,0,0,2,10,8,0,0,0,5
2,4,0,0,5,11,16,16,8,0,0,...,0,0,0,7,16,11,2,0,0,3
3,5,0,0,4,12,16,16,4,0,0,...,0,0,0,3,12,13,9,0,0,2
4,6,0,1,11,13,2,0,0,0,0,...,0,0,1,12,12,12,15,11,0,2


**TESTING DATA**

In [38]:
# Load the testing data
test_data = pd.read_csv(base_url + testing_csv)
custom_row = custom_row = pd.DataFrame([[1431, 0, 0, 0, 2, 14, 5, 0, 0, 0, 0, 1, 11, 11, 0, 0, 0, 0, 0, 7,
                            14, 1, 0, 0, 0, 0, 5, 16, 3, 4, 6, 2, 0, 0, 14, 16, 14, 16, 16,
                            10, 0, 0, 9, 12, 7, 8, 16, 4, 0, 0, 0, 0, 0, 13, 12, 0, 0, 0, 0,
                            0, 3, 15, 6, 0, 0]], columns=test_data.columns)
test_data = pd.concat([custom_row, test_data], ignore_index=True)

# Assign column names to the testing dataset
test_columns = ['ID'] + [f'pixel_{i}_{j}' for i in range(8) for j in range(8)]
test_data.columns = test_columns
test_data.head()

Unnamed: 0,ID,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,1431,0,0,0,2,14,5,0,0,0,...,0,0,0,0,0,3,15,6,0,0
1,1432,0,0,0,13,12,0,0,0,0,...,12,0,0,0,1,10,16,14,4,0
2,1433,0,0,7,16,16,16,6,0,0,...,0,0,0,0,7,15,1,0,0,0
3,1434,0,2,15,15,6,0,0,0,0,...,1,0,0,3,15,14,11,2,0,0
4,1435,0,0,0,10,9,0,0,0,0,...,0,0,0,0,0,13,10,0,0,0


**PREPARE THE DATA**

In [39]:
# Prepare the data
X_train = train_data.drop(['ID', 'Category'], axis=1)
y_train = train_data['Category']
X_test = test_data.drop(['ID'], axis=1)

**SPLITTING THE DATA**

In [40]:
# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

**MODEL FITTING**

In [41]:
# Train the decision tree model with different hyperparameters
# Decision tree with default parameters
tree1 = DecisionTreeClassifier(random_state=42)
tree1.fit(X_train, y_train)

# Decision tree with increased maximum depth
tree2 = DecisionTreeClassifier(max_depth=15, random_state=42)
tree2.fit(X_train, y_train)

# Decision tree with entropy as the attribute selection method and increased minimum samples split
tree3 = DecisionTreeClassifier(criterion='entropy', min_samples_split=10, random_state=42)
tree3.fit(X_train, y_train)

**EVALUATING MODEL PERFORMANCE**

In [42]:
# Evaluate the performance of the trained models using the validation set
y_pred1 = tree1.predict(X_val)
accuracy1 = accuracy_score(y_val, y_pred1)
print("Accuracy (Default Parameters):", accuracy1)

y_pred2 = tree2.predict(X_val)
accuracy2 = accuracy_score(y_val, y_pred2)
print("Accuracy (Increased Max Depth):", accuracy2)

y_pred3 = tree3.predict(X_val)
accuracy3 = accuracy_score(y_val, y_pred3)
print("Accuracy (Entropy Attribute Selection):", accuracy3)

Accuracy (Default Parameters): 0.8426573426573427
Accuracy (Increased Max Depth): 0.8426573426573427
Accuracy (Entropy Attribute Selection): 0.8496503496503497


**FINDING THE MODEL WITH MAXIMUM ACCURACY**

In [43]:
# Find the maximum accuracy
max_accuracy = max(accuracy1, accuracy2, accuracy3)
best_model = None

if max_accuracy == accuracy1:
    best_model = tree1
elif max_accuracy == accuracy2:
    best_model = tree2
else:
    best_model = tree3

print("Best Model Accuracy:", max_accuracy)

Best Model Accuracy: 0.8496503496503497


**PREDICTIONS**

In [44]:
# Select the model with the best performance and use it to make predictions on the test set
test_predictions = best_model.predict(X_test)

**OUTPUT**

In [45]:
# Save predictions to an output file with ID and Category
predictions = pd.DataFrame({'ID': test_data['ID'], 'Category': test_predictions})
predictions.to_csv('output.csv', index=False)

**CHECK WHETHER SUBMISSION HAS 367 ROWS**

In [46]:
if predictions.shape[0] == 367:
    print("Submission file created successfully.")
else:
    print("Error: Submission must have 367 rows.")

Submission file created successfully.
