In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/smoking-status-prediction-b1/sample_submission.csv
/kaggle/input/smoking-status-prediction-b1/train.csv
/kaggle/input/smoking-status-prediction-b1/test.csv


**TRAINING DATA**

In [2]:
# Load the training data
train_data = pd.read_csv('/kaggle/input/smoking-status-prediction-b1/train.csv')
train_data.head()

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,smoking
0,1,M,70,175,65,89.8,0.8,1.0,1,1,...,15.5,5,1.3,29,18,53,Y,0,Y,1
1,2,M,40,170,65,86.0,1.2,1.0,1,1,...,15.3,1,1.2,26,23,21,Y,1,Y,1
2,3,M,20,170,70,83.0,1.0,1.0,1,1,...,14.2,1,1.0,17,11,12,Y,0,N,0
3,4,F,60,145,50,77.0,0.4,0.5,1,2,...,12.8,1,0.5,26,12,13,Y,0,N,0
4,5,F,40,160,70,82.9,0.9,0.7,1,1,...,13.8,1,0.9,16,25,20,Y,0,N,0


**TESTING DATA**

In [3]:
# Load the testing data
test_data = pd.read_csv('/kaggle/input/smoking-status-prediction-b1/test.csv')
test_data.head()

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar
0,1,F,45,155,55,78.0,0.8,1.2,1,1,...,72,13.6,1,0.7,19,17,24,Y,1,N
1,2,F,40,160,55,74.0,0.9,0.9,1,1,...,114,14.7,1,0.9,32,22,8,Y,0,Y
2,3,M,40,175,85,97.0,1.5,1.0,1,1,...,97,14.7,1,0.7,20,29,62,Y,0,Y
3,4,M,60,160,50,72.0,1.0,1.2,1,1,...,107,14.7,1,0.9,22,14,28,Y,0,N
4,5,F,50,160,65,88.1,0.7,0.8,1,1,...,155,13.9,1,0.9,36,62,40,Y,0,N


**PREPARING THE DATA**

In [4]:
# Prepare the data
X_train = train_data.drop(['ID', 'smoking'], axis=1)
y_train = train_data['smoking']
X_test = test_data.drop(['ID'], axis=1)

# Preprocess categorical variables
categorical_cols = X_train.select_dtypes(include='object').columns

for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

categorical_cols = X_test.select_dtypes(include='object').columns

for col in categorical_cols:
    le = LabelEncoder()
    X_test[col] = le.fit_transform(X_test[col])

**DIFFERENT ENSEMBLE MODELS WITH VARIOUS HYPERPARAMETERS**

**RANDOM FOREST**

In [5]:
# Random Forest
rf_model1 = RandomForestClassifier(n_estimators=200,max_depth=20,random_state=42)
rf_model1.fit(X_train, y_train)    

# Varying hyperparameters
rf_model2 = RandomForestClassifier(n_estimators=100,max_depth=3,random_state=42)
rf_model2.fit(X_train, y_train) 

# Cross-validation
cv_scores = cross_val_score(rf_model1, X_train, y_train, cv=5, error_score='raise')
mean_cv_score1 = cv_scores.mean()
cv_scores = cross_val_score(rf_model2, X_train, y_train, cv=5, error_score='raise')
mean_cv_score2 = cv_scores.mean()

print("\nCV SCORE:\n")
print("Random Forest Model 1 - ", mean_cv_score1)
print("Random Forest Model 2 - ", mean_cv_score2)


CV SCORE:

Random Forest Model 1 -  0.816325
Random Forest Model 2 -  0.7276


**GRADIENT BOOSTING**

In [6]:
# Gradient Boosting
gb_model1 = GradientBoostingClassifier(n_estimators=100,learning_rate=0.1,max_depth=3,random_state=42)
gb_model1.fit(X_train, y_train)    

# Varying hyperparameters
gb_model2 = GradientBoostingClassifier(n_estimators=100,learning_rate=0.5,max_depth=6,random_state=42)
gb_model2.fit(X_train, y_train) 

# Cross-validation
cv_scores = cross_val_score(gb_model1, X_train, y_train, cv=5, error_score='raise')
mean_cv_score1 = cv_scores.mean()
cv_scores = cross_val_score(gb_model2, X_train, y_train, cv=5, error_score='raise')
mean_cv_score2 = cv_scores.mean()

print("\nCV SCORE:\n")
print("Gradient Boosting Model 1 - ", mean_cv_score1)
print("Gradient Boosting Model 2 - ", mean_cv_score2)


CV SCORE:

Gradient Boosting Model 1 -  0.765475
Gradient Boosting Model 2 -  0.7805


**ADABOOST**

In [7]:
# AdaBoost
ab_model1 = AdaBoostClassifier(n_estimators=100,learning_rate=0.1,random_state=42)
ab_model1.fit(X_train, y_train)    

# Varying hyperparameters
ab_model2 = AdaBoostClassifier(n_estimators=200,learning_rate=0.2,random_state=42)
ab_model2.fit(X_train, y_train) 

# Cross-validation
cv_scores = cross_val_score(ab_model1, X_train, y_train, cv=5, error_score='raise')
mean_cv_score1 = cv_scores.mean()
cv_scores = cross_val_score(ab_model2, X_train, y_train, cv=5, error_score='raise')
mean_cv_score2 = cv_scores.mean()

print("\nCV SCORE:\n")
print("AdaBoost Model 1 - ", mean_cv_score1)
print("AdaBoost Model 2 - ", mean_cv_score2)


CV SCORE:

AdaBoost Model 1 -  0.749425
AdaBoost Model 2 -  0.763225


**ACCURACIES**

In [8]:
# Split the training data into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Accuracy
y_pred1 = rf_model1.predict(X_val)
y_pred2 = rf_model2.predict(X_val)
y_pred3 = gb_model1.predict(X_val)
y_pred4 = gb_model2.predict(X_val)
y_pred5 = ab_model1.predict(X_val)
y_pred6 = ab_model2.predict(X_val)

accuracy1 = accuracy_score(y_val, y_pred1)
accuracy2 = accuracy_score(y_val, y_pred2)
accuracy3 = accuracy_score(y_val, y_pred3)
accuracy4 = accuracy_score(y_val, y_pred4)
accuracy5 = accuracy_score(y_val, y_pred5)
accuracy6 = accuracy_score(y_val, y_pred6)

print("\nAccuracies: \n")
print("Random Forest Model 1 - ", accuracy1)
print("Random Forest Model 2 - ", accuracy2)
print("Gradient Boosting Model 1 - ", accuracy3)
print("Gradient Boosting Model 2 - ", accuracy4)
print("AdaBoost Model 1 - ", accuracy5)
print("AdaBoost Model 2 - ", accuracy6)


Accuracies: 

Random Forest Model 1 -  0.99725
Random Forest Model 2 -  0.730625
Gradient Boosting Model 1 -  0.77725
Gradient Boosting Model 2 -  0.917875
AdaBoost Model 1 -  0.7525
AdaBoost Model 2 -  0.768


**PREDICTIONS**

In [9]:
# Make predictions on the test set
predictions = rf_model1.predict(X_test)

print("\nSuccessfully made predictions on the test set.")


Successfully made predictions on the test set.


**OUTPUT**

In [10]:
# Create a DataFrame with predictions
output_data = pd.DataFrame({'ID': test_data.iloc[:, 0], 'smoking': predictions})

# Save predictions to an output file
output_data.to_csv('output_21BAI1171.csv', index = False)
print("\nSuccessfully saved predictions to output file.")


Successfully saved predictions to output file.


**ACCURACIES OF DIFFERENT ENSEMBLE MODELS**

Random Forest Model 1 -  0.99725

Random Forest Model 2 -  0.730625

Gradient Boosting Model 1 -  0.77725

Gradient Boosting Model 2 -  0.917875

AdaBoost Model 1 -  0.7525

AdaBoost Model 2 -  0.768