In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv('styles_cleaned.csv')
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012,Casual,Peter England Men Party Blue Jeans
2,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011,Casual,Manchester United Men Solid Black Track Pants
3,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt
4,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011,Casual,Inkfruit Mens Chain Reaction T-shirt


### About the `productDisplayName` Feature

At the beginning, we thought about encoding the description (`productDisplayName`) text and using it as an additional feature but we noticed that most descriptions are basically made of words that already appear in the other columns.

For example, consider the following row:

| id    | gender | masterCategory | subCategory | articleType | baseColour | season | year | usage  | productDisplayName                     |

| 15970 | Men    | Apparel        | Topwear     | Shirts       | Navy Blue   | Fall   | 2011 | Casual | Turtle Check Men Navy Blue Shirt         |

In this case, words like *men*, *navy*, *blue* and *shirt* are already present as features (`gender`, `baseColour`, `articleType`) and are also repeated in the description. This shows that the product name does not really bring new information.

We also looked at the most frequent words in the descriptions. The top 10 most common words are:
- men  
- women  
- black  
- blue  
- t-shirt  
- white  
- shoes  
- printed  
- shirt  
- grey  

These words clearly match existing features such as `gender`, `articleType` and `baseColour`.  
If we look at the next most frequent words (top 10â€“20), we start to see some brands such as nike, puma, adidas, polo. Brands are much less informative for a classification problem. In fact, the top 20 most frequent words already represent about **44% of the total word count**, which means that a large part of the descriptions is made of repeated and redundant words.

Because of this, we decided not to do any feature engineering on the `productDisplayName` column and to exclude it from the models.

In [None]:
categorical_cols = ['gender', 'masterCategory', 'subCategory', 'articleType',
       'baseColour', 'season', 'usage', 'productDisplayName']
numerical_cols = ['year'] 

# Encode categorical variables
encoded_df = df.copy()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    encoded_df[col] = le.fit_transform(encoded_df[col].astype(str))
    label_encoders[col] = le

# features and target
X = encoded_df[['gender', 'masterCategory', 'subCategory', 'baseColour', 'season', 'usage', 'productDisplayName', 'year']] 
y = encoded_df['articleType']  

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Find the right max_depth for 83 leaves
for depth in range(5, 20):
    rf_test = RandomForestClassifier(n_estimators=1, max_depth=depth, random_state=42)
    rf_test.fit(X_train, y_train)
    n_leaves = rf_test.estimators_[0].get_n_leaves()
    print(f"max_depth={depth}: {n_leaves} leaves")
    if n_leaves >= 83:
        break

max_depth=5: 31 leaves
max_depth=6: 54 leaves
max_depth=7: 101 leaves


In [40]:
298-130

168

In [38]:
# one tree classifier
rf_model = RandomForestClassifier(n_estimators=1, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# see the splits
estimator = rf_model.estimators_[0]
from sklearn.tree import export_text
tree_rules = export_text(estimator, feature_names=list(X.columns))
print(tree_rules)


Accuracy: 0.7056548167249954
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89        15
           1       0.67      0.14      0.24        14
           2       0.00      0.00      0.00         8
           3       0.46      0.59      0.52        94
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00         1
           6       0.43      0.43      0.43         7
           7       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         9
           9       0.00      0.00      0.00         1
          10       0.50      0.29      0.37        24
          11       0.67      0.16      0.26        25
          12       0.89      0.95      0.92       391
          13       0.00      0.00      0.00         4
          14       0.40      0.05      0.09        38
          15       0.35      0.55      0.43        22
          16       0.82     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
estimator.get_depth()

15