In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score

# Attribute schema for columns A1..A16: categorical attributes map to their
# listed values; every other attribute is marked 'continuous'.
categorical_levels = {
    'A1': ['b', 'a'],
    'A3': ['u', 'y', 'l'],
    'A4': ['g', 'p', 'gg'],
    'A6': ['c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff'],
    'A8': ['TRUE', 'FALSE'],
    'A9': ['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o'],
    'A11': ['TRUE', 'FALSE'],
    'A13': ['TRUE', 'FALSE'],
    'A15': ['g', 'p', 's'],
    'A16': ['Success', 'Failure'],
}

# Walk A1..A16 in order so the resulting dict keeps the column ordering.
attribute_details = {
    name: categorical_levels.get(name, 'continuous')
    for name in (f'A{position}' for position in range(1, 17))
}

# Load the dataset
df = pd.read_csv("trainData.csv", sep=",", header=0)

# Preprocessing
# Convert categorical variables to numerical using label encoding.
# NOTE: a single LabelEncoder is re-fit for every column, so after the loop it
# only holds the mapping of the last feature in the list. Every distinct raw
# value of a column (including a '?' placeholder, if present) gets its own code.
categorical_features = ['A1', 'A3', 'A4', 'A6', 'A8', 'A9', 'A11', 'A13', 'A15']
label_encoder = LabelEncoder()

for feature in categorical_features:
    df[feature] = label_encoder.fit_transform(df[feature])

# Handle missing values
# Columns tagged 'continuous' in attribute_details are forced to a numeric
# dtype first: a single '?' makes pandas read the whole column as strings, and
# a string column has no median, so it would never be imputed.
continuous_features = [
    name for name, spec in attribute_details.items()
    if spec == 'continuous' and name in df.columns
]
for feature in continuous_features:
    # errors='coerce' turns '?' (and any other non-numeric token) into NaN
    df[feature] = pd.to_numeric(df[feature], errors='coerce')

df = df.replace('?', pd.NA)  # Replace any remaining '?' with a missing-value marker

# numeric_only=True skips the string target column 'A16'; without it
# DataFrame.median() warns on pandas 1.x and raises TypeError on pandas >= 2.0.
df = df.fillna(df.median(numeric_only=True))  # Replace NaN with the median of each numeric column

# Split the dataset into features (X) and target (y)
X = df.drop('A16', axis=1)
y = df['A16']

# Perform one-hot encoding for categorical features.
# The dense-output flag is spelled `sparse_output` from scikit-learn 1.2 on and
# the old `sparse` spelling was removed in 1.4; try the new name first and fall
# back so the cell runs on both old and new releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# Build the one-hot frame on X's own index so the column-wise concat below
# lines rows up by label rather than relying on a default RangeIndex.
X_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(X[categorical_features]),
    columns=one_hot_encoder.get_feature_names_out(categorical_features),
    index=X.index,
)
X = pd.concat([X.drop(categorical_features, axis=1), X_encoded], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random forest model.
# Seeded like the split above so the reported accuracy is reproducible on re-run.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


  df.fillna(df.median(), inplace=True)  # Replace NaN with the median of each column


Accuracy: 0.8108108108108109


In [None]:
# Load the second CSV file; its rows are the ones passed to the model for prediction
new_df = pd.read_csv(
    "testdata.csv",
    header=0,
    sep=",",
)

In [None]:
# The model and the one-hot encoder were fit on integer codes produced by a
# LabelEncoder fit on each *training* column. Re-fitting an encoder on the test
# data can assign different integers to the same category (e.g. when a category
# is absent from the test file), which silently corrupts the features. Because
# the training cell keeps no per-column encoder, the training mapping is
# rebuilt here from the raw training file (LabelEncoder orders classes
# deterministically, so the codes match those used at fit time).
train_reference = pd.read_csv("trainData.csv", sep=",", header=0)

# Work on a copy so `new_df` stays raw and this cell can be re-run safely.
new_df_prepared = new_df.copy()

# Label encoding
for feature in categorical_features:
    feature_encoder = LabelEncoder().fit(train_reference[feature])
    try:
        new_df_prepared[feature] = feature_encoder.transform(new_df_prepared[feature])
    except ValueError as err:
        # A category never seen in training cannot be encoded consistently;
        # fail loudly and say which column is responsible.
        raise ValueError(f"Feature {feature!r}: {err}") from err

# Handling missing values
# Columns tagged 'continuous' in attribute_details are coerced to numeric
# ('?' and other non-numeric tokens become NaN) and imputed with the median of
# the corresponding *training* column.
continuous_columns = [
    name for name, spec in attribute_details.items()
    if spec == 'continuous'
    and name in new_df_prepared.columns
    and name in train_reference.columns
]
train_medians = {}
for column in continuous_columns:
    new_df_prepared[column] = pd.to_numeric(new_df_prepared[column], errors='coerce')
    train_medians[column] = pd.to_numeric(train_reference[column], errors='coerce').median()

new_df_prepared = new_df_prepared.replace('?', pd.NA)
new_df_prepared = new_df_prepared.fillna(train_medians)

# One-hot encoding for categorical features (encoder already fit on the training data)
new_encoded = pd.DataFrame(
    one_hot_encoder.transform(new_df_prepared[categorical_features]),
    columns=one_hot_encoder.get_feature_names_out(categorical_features),
    index=new_df_prepared.index,
)
new_df_encoded = pd.concat(
    [new_df_prepared.drop(categorical_features, axis=1), new_encoded],
    axis=1,
)


  new_df.fillna(df.median(), inplace=True)


In [None]:
# Run the fitted random forest over the encoded test frame
new_predictions = random_forest.predict(X=new_df_encoded)

In [None]:
# Pair every prediction with a 1-based running Id
row_ids = range(1, len(new_predictions) + 1)
predictions_df = pd.DataFrame({'Id': row_ids, 'Category': new_predictions})

In [None]:
# Write the Id/Category table to disk without the DataFrame index column
predictions_df.to_csv(path_or_buf="predictions.csv", index=False)