In [1]:
import pandas as pd

# Load the datasets
train_data = pd.read_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/train.csv')
test_data = pd.read_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/test.csv')

In [2]:
# Display the first few rows of the training data
print("First few rows of the training data:")
print(train_data.head())

First few rows of the training data:
   id class  cap-diameter cap-shape cap-surface cap-color  \
0   0     e          8.80         f           s         u   
1   1     p          4.51         x           h         o   
2   2     e          6.94         f           s         b   
3   3     e          3.88         f           y         g   
4   4     e          5.85         x           l         w   

  does-bruise-or-bleed gill-attachment gill-spacing gill-color  ...  \
0                    f               a            c          w  ...   
1                    f               a            c          n  ...   
2                    f               x            c          w  ...   
3                    f               s          NaN          g  ...   
4                    f               d          NaN          w  ...   

   stem-root  stem-surface stem-color veil-type veil-color has-ring ring-type  \
0        NaN           NaN          w       NaN        NaN        f         f   
1      

In [3]:
# Display basic statistics of the training data
print("\nBasic statistics of the training data:")
print(train_data.describe())


Basic statistics of the training data:
                 id  cap-diameter   stem-height    stem-width
count  3.116945e+06  3.116941e+06  3.116945e+06  3.116945e+06
mean   1.558472e+06  6.309848e+00  6.348333e+00  1.115379e+01
std    8.997847e+05  4.657931e+00  2.699755e+00  8.095477e+00
min    0.000000e+00  3.000000e-02  0.000000e+00  0.000000e+00
25%    7.792360e+05  3.320000e+00  4.670000e+00  4.970000e+00
50%    1.558472e+06  5.750000e+00  5.880000e+00  9.650000e+00
75%    2.337708e+06  8.240000e+00  7.410000e+00  1.563000e+01
max    3.116944e+06  8.067000e+01  8.872000e+01  1.029000e+02


In [4]:
# Display the data types and check for missing values
print("\nData types and missing values in the training data:")
print(train_data.info())
print("\nMissing values in the training data:")
print(train_data.isnull().sum())


Data types and missing values in the training data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                ob

In [6]:
# Check the distribution of the target variable (assuming it's named 'target')
print("\nDistribution of the target variable:")
print(train_data['class'].value_counts())


Distribution of the target variable:
class
p    1705396
e    1411549
Name: count, dtype: int64


In [7]:
# Analyze categorical features (assuming most features are categorical)
print("\nUnique values in categorical features:")
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        print(f"{column}: {train_data[column].nunique()} unique values")


Unique values in categorical features:
class: 2 unique values
cap-shape: 74 unique values
cap-surface: 83 unique values
cap-color: 78 unique values
does-bruise-or-bleed: 26 unique values
gill-attachment: 78 unique values
gill-spacing: 48 unique values
gill-color: 63 unique values
stem-root: 38 unique values
stem-surface: 60 unique values
stem-color: 59 unique values
veil-type: 22 unique values
veil-color: 24 unique values
has-ring: 23 unique values
ring-type: 40 unique values
spore-print-color: 32 unique values
habitat: 52 unique values
season: 4 unique values


Prepare that data so we can use it to train a model

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
train_data = pd.read_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/train.csv')

# Handle missing data by imputing with the most frequent value (mode)
imputer = SimpleImputer(strategy='most_frequent')
train_data = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)

# Encode categorical variables
label_encoders = {}
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        train_data[column] = label_encoders[column].fit_transform(train_data[column])

# Split the data into features and target variable
X = train_data.drop('class', axis=1)
y = train_data['class']

# Feature scaling (optional, depending on the model you plan to use)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display the shapes of the training and validation sets
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)


Training set shape: (2493556, 21)
Validation set shape: (623389, 21)


train a model, then create a submission.csv

In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the datasets
train_data = pd.read_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/train.csv')
test_data = pd.read_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/test.csv')
sample_submission = pd.read_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/sample_submission.csv')

# Handle missing data by imputing with the most frequent value (mode)
imputer = SimpleImputer(strategy='most_frequent')
train_data = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)
test_data = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)

# Encode categorical variables
label_encoders = {}
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        train_data[column] = label_encoders[column].fit_transform(train_data[column])

for column in test_data.columns:
    if test_data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        test_data[column] = label_encoders[column].fit_transform(test_data[column])

# Split the data into features and target variable
X = train_data.drop('class', axis=1)
y = train_data['class']

# Feature scaling (optional, depending on the model you plan to use)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data)

# Train a RandomForest model
model = RandomForestClassifier(random_state=42)
model.fit(X_scaled, y)

# Make predictions on the test set
predictions = model.predict(test_data_scaled)

# Map 0 to 'e' and 1 to 'p'
predictions_mapped = ['e' if pred == 0 else 'p' for pred in predictions]

# Prepare the submission file by using the sample_submission format
submission = sample_submission.copy()
submission['class'] = predictions_mapped  # Assuming the submission format has a 'class' column

# Save the submission to a CSV file
submission.to_csv('/home/john/ai/kaggle/poison-mushroom-binary-prediction/submission.csv', index=False)

print("Submission file created successfully!")


Submission file created successfully!
