In [3]:
from sklearn.datasets import load_wine
import pandas as pd

# Load the wine dataset through scikit-learn's loader.
wine = load_wine()

# Build the feature table and attach the class label (0, 1 or 2) as a
# `target` column in one chained expression.
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# `target` is the one non-feature column, hence the "- 1" in the feature count.
print(f"Dataset Loaded! We have {df.shape[0]} samples and {df.shape[1]-1} chemical features.")
print(df.head())

Dataset Loaded! We have 178 samples and 13 chemical features.
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   



In [4]:
# Count the missing values in each column.
df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the label. Label vector: the `target` column.
X = df.drop(columns="target")
y = df["target"]

# Hold out 30% of the rows for testing. Stratifying on y keeps the class mix
# comparable across the two splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE(review): no cell visible in this notebook reads X_train_scaled or
# X_test_scaled; the forests are fit on the unscaled frames. Confirm whether
# these arrays are still needed.


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed (so reruns give the same
# model) and fit it on the training split; `fit` returns the estimator itself.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = clf.predict(X_test)

In [12]:
# Pair each feature name with its importance from the fitted forest and
# order the result from most to least important.
feature_scores = (
    pd.Series(clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Show the ranked importances.
print(feature_scores)

alcohol                         0.162715
color_intensity                 0.159719
flavanoids                      0.156283
proline                         0.124081
hue                             0.111019
od280/od315_of_diluted_wines    0.098852
total_phenols                   0.042004
magnesium                       0.034591
malic_acid                      0.033700
alcalinity_of_ash               0.028841
proanthocyanins                 0.019952
ash                             0.016418
nonflavanoid_phenols            0.011825
dtype: float64


In [13]:
# Take the six highest-ranked features straight from the sorted importance
# Series instead of copying names by hand from printed output; a hard-coded
# list silently goes stale if the data, the seed or the model settings change.
# `feature_scores` is sorted in descending order, so head(6) is the top six.
top_6_cols = feature_scores.head(6).index.tolist()

# Restrict both splits to the selected columns.
X_train_6 = X_train[top_6_cols]
X_test_6 = X_test[top_6_cols]

In [14]:
# The three lowest-ranked features sit at the end of the sorted importance Series.
bottom_3 = list(feature_scores.index[-3:])

# Keep every remaining column, preserving the original column order of X.
top_10_cols = X.columns[~X.columns.isin(bottom_3)].tolist()

X_train_10, X_test_10 = X_train[top_10_cols], X_test[top_10_cols]

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def run_exp(X_tr, X_te, name, y_tr=None, y_te=None):
    """Fit a fresh random forest on one feature set and report test accuracy.

    Parameters
    ----------
    X_tr : training features passed to ``fit``.
    X_te : test features passed to ``predict``.
    name : label used in the printed result line.
    y_tr : training labels. When omitted, the notebook-level ``y_train`` is
        used, so existing three-argument calls behave exactly as before.
    y_te : test labels. When omitted, the notebook-level ``y_test`` is used.

    Returns
    -------
    The accuracy computed by ``accuracy_score`` on the test labels.
    """
    # Fall back to the notebook's global split only when the caller did not
    # supply labels; this keeps the function usable with any other split.
    if y_tr is None:
        y_tr = y_train
    if y_te is None:
        y_te = y_test

    # Same forest settings for every experiment so the runs are comparable.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_tr, y_tr)
    acc = accuracy_score(y_te, model.predict(X_te))
    print(f"Result for {name}: {acc:.4f}")
    return acc

# Score the two reduced feature sets with the same train/evaluate routine.
acc_6 = run_exp(
    X_train_6,
    X_test_6,
    "Top 6 Features",
)
acc_10 = run_exp(
    X_train_10,
    X_test_10,
    "Top 10 (All but bottom 3)",
)

Result for Top 6 Features: 1.0000
Result for Top 10 (All but bottom 3): 1.0000
