 # Описание

В датасете содержится информация о более чем 1700 шоколадках; все они — тёмный шоколад.

Система рейтинга шоколада:
5 = Элита
4 = Премиум
3 = От удовлетворительного (3.0) до достойного похвалы (3.75)
2 = Разочаровывающе
1 = Неприятно

Для получения рейтинга каждой шоколадки оцениваются все ее характеристики.

Ваша задача - по характеристикам шоколадки научиться предсказывать ее рейтинг без округлений (то есть если рейтинг 3.75 - хорошая модель должна предсказать 3.75).

Рейтинги были собраны Брэди Брелински, членом-основателем Манхэттенского шоколадного общества.




Никнейм: Marat Ramazanov

# Решение

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Kaggle input locations of the competition CSV files.
TRAIN = '/kaggle/input/chocolate-rating-prediction-ai-edu/chocolate_train.csv'
TEST = '/kaggle/input/chocolate-rating-prediction-ai-edu/chocolate_test_new.csv'

# Load both splits into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

In [None]:
# Display the raw training frame.
train

Unnamed: 0,Company,Specific Bean Origin,REF,Review,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Willie's Cacao,Rio Caribe,457,2009,72%,U.K.,3.25,Trinitario,Venezuela
1,Beschle (Felchlin),"Ocumare, Premier Cru, Quizas No. 2",508,2010,72%,Switzerland,3.50,,Venezuela
2,Dark Forest,Tanzania,1554,2015,70%,U.S.A.,3.00,,Tanzania
3,Brasstown aka It's Chocolate,Cooproagro,1125,2013,72%,U.S.A.,3.00,Trinitario,Dominican Republic
4,Pralus,"Java, Indonesie",32,2006,75%,France,3.50,Criollo,Indonesia
...,...,...,...,...,...,...,...,...,...
1250,Artisan du Chocolat,Madagascar,363,2009,80%,U.K.,3.00,"Criollo, Trinitario",Madagascar
1251,Marana,Cusco,1884,2016,70%,Peru,2.75,,Peru
1252,Arete,Nacional,1534,2015,68%,U.S.A.,3.50,Forastero (Nacional),Peru
1253,Fresco,"Conacado, #212, LR, SC",642,2011,72%,U.S.A.,3.50,,Dominican Republic


In [None]:
def info_about_dataset(dataset: pd.DataFrame):
    """Print a short per-column summary of ``dataset``.

    For every column the name, dtype, number of NA values and the number of
    distinct values (as returned by ``Series.unique()``) are printed,
    followed by a separator line. Columns whose dtype compares equal to
    ``'object'`` additionally report how many values occur exactly once.

    Args:
        dataset: The frame to summarise.
    """
    separator = '-' * 50
    for name in dataset.columns:
        values = dataset[name]
        kind = values.dtype

        print(name)
        print('Dtype:', kind)
        if kind == 'object':
            # Number of distinct values that appear exactly one time.
            singletons = values.value_counts().eq(1).sum()
            print('Num of occur once:', singletons)
        print('NA values:', values.isna().sum())
        print('Unique values:', len(values.unique()))
        print(separator)


In [None]:
# Per-column summary of the training split.
info_about_dataset(train)

In [None]:
# Per-column summary of the test split.
info_about_dataset(test)

In [None]:
# Summary statistics for the object-dtype columns of the test split.
test.describe(include='object')

# Conclusions:
1. The training dataset contains only 2 missing (NA) values. The rows containing them should be removed from the dataset.
2. The "Specific Bean Origin" and "Company" columns have very high cardinality: 762 unique values, 621 of which occur only once. To improve the model's predictions, it is worth coarsening such features, e.g. by grouping values geographically.
3. The columns "Company", "Specific Bean Origin", "Cocoa Percent", "Company Location", "Bean Type" and "Broad Bean Origin" need to be converted to numerical form, as they have an object type.

# drop na-values:

In [None]:
# Drop every row of `train` that has at least one NA value (modifies `train` in place),
# then display the result. The test split is not touched here.
train.dropna(inplace=True)
train

Unnamed: 0,Company,Specific Bean Origin,REF,Review,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Willie's Cacao,Rio Caribe,457,2009,72%,U.K.,3.25,Trinitario,Venezuela
1,Beschle (Felchlin),"Ocumare, Premier Cru, Quizas No. 2",508,2010,72%,Switzerland,3.50,,Venezuela
2,Dark Forest,Tanzania,1554,2015,70%,U.S.A.,3.00,,Tanzania
3,Brasstown aka It's Chocolate,Cooproagro,1125,2013,72%,U.S.A.,3.00,Trinitario,Dominican Republic
4,Pralus,"Java, Indonesie",32,2006,75%,France,3.50,Criollo,Indonesia
...,...,...,...,...,...,...,...,...,...
1250,Artisan du Chocolat,Madagascar,363,2009,80%,U.K.,3.00,"Criollo, Trinitario",Madagascar
1251,Marana,Cusco,1884,2016,70%,Peru,2.75,,Peru
1252,Arete,Nacional,1534,2015,68%,U.S.A.,3.50,Forastero (Nacional),Peru
1253,Fresco,"Conacado, #212, LR, SC",642,2011,72%,U.S.A.,3.50,,Dominican Republic


# mapping

In [None]:
def mapping_bean(frames=None, seed=42):
    """Collapse the ``'Bean Type'`` column into a few bean families, in place.

    Each known variety string is replaced by its family name. Values that are
    missing or not listed below are imputed by drawing uniformly from the
    first three families (Trinitario, Criollo, Forastero).

    The draw uses a seeded generator so that re-running the notebook yields
    the same features; an unseeded draw would silently change the training
    data (and therefore the tuned model) on every run.

    Args:
        frames: Iterable of DataFrames to modify. Defaults to the notebook's
            global ``train`` and ``test`` frames.
        seed: Seed for the imputation draw. Pass ``None`` for a
            non-reproducible draw.
    """
    bean_families = {
        'Trinitario' : ['Trinitario', 'Trinitario, Criollo', 'Trinitario, Forastero', 'Trinitario (Amelonado)', 'Trinitario, Nacional', 'Trinitario (85% Criollo)', 'Trinitario, TCGA', 'EET'],
        'Criollo' : ['Criollo', 'Criollo, Trinitario', 'Criollo (Porcelana)', 'Criollo, Forastero', 'Criollo (Amarru)', 'Criollo (Ocumare 67)', 'Criollo (Ocumare 61)', 'Criollo (Wild)', 'Criollo, +', 'Criollo (Ocumare)', 'Criollo (Ocumare 77)'],
        'Forastero' : ['Forastero', 'Forastero (Nacional)', 'Forastero (Arriba)', 'Forastero (Parazinho)', 'Forastero (Arriba) ASS', 'Forastero (Catongo)', 'Forastero(Arriba, CCN)', 'Forastero (Arriba) ASSS', 'Forastero, Trinitario'],
        'Blend' : ['Blend', 'Blend-Forastero,Criollo'],
        'Nacional' : ['Nacional','Nacional (Arriba)'],
        'Amazon' : ['Amazon','Amazon mix', 'Amazon, ICS'],
        'Matina' : ['Matina']
    }

    # Invert the table: variety string -> family name.
    variety_to_family = {
        variety: family
        for family, varieties in bean_families.items()
        for variety in varieties
    }

    # Only the first three families are used when imputing unknown values.
    fallback_families = list(bean_families)[:3]
    rng = np.random.default_rng(seed)

    if frames is None:
        frames = (train, test)

    for frame in frames:
        frame['Bean Type'] = frame['Bean Type'].map(variety_to_family)
        missing = frame['Bean Type'].isna()
        if missing.any():
            frame.loc[missing, 'Bean Type'] = rng.choice(
                fallback_families, size=int(missing.sum()))

# Apply the bean-type grouping to the train and test frames.
mapping_bean()

In [None]:
def mapping_country(frames=None):
    """Replace company countries with a coarse region label, in place.

    Every value of the ``'Company Location'`` column is mapped to one of the
    regions below; countries that are not listed (and missing values)
    become ``"Other"``.

    Args:
        frames: Iterable of DataFrames to modify. Defaults to the notebook's
            global ``train`` and ``test`` frames.
    """
    # NOTE(review): Costa Rica, Nicaragua and Guatemala are Central American
    # countries but are labelled 'South America' here; the labels are kept
    # so the resulting categories do not change.
    region_mapping = {
        'North America': ['U.S.A.', 'Canada', 'Mexico'],
        'Europe': ['France', 'U.K.', 'Italy', 'Belgium', 'Germany', 'Switzerland',
                   'Hungary', 'Denmark', 'Austria', 'Lithuania', 'Scotland', 'Ireland',
                   'Netherlands', 'Poland', 'Spain'],
        'South America': ['Ecuador', 'Colombia', 'Venezuela', 'Brazil', 'Peru',
                         'Argentina', 'Costa Rica', 'Nicaragua', 'Guatemala'],
        'Oceania': ['Australia', 'New Zealand', 'Fiji'],
        'Asia': ['Japan', 'Vietnam', 'South Korea', 'Singapore', 'Philippines', 'Israel']
    }

    # Invert the table: country -> region.
    country_to_region = {
        country: region
        for region, countries in region_mapping.items()
        for country in countries
    }

    if frames is None:
        frames = (train, test)

    for frame in frames:
        regions = frame['Company Location'].map(country_to_region)
        frame['Company Location'] = regions.fillna("Other")

# Apply the country-to-region grouping to the train and test frames.
mapping_country()

In [None]:
def mapping_Broad_Bean_Origin(frames=None):
    """Replace bean origins with a coarse region label, in place.

    Every value of the ``'Broad Bean Origin'`` column is mapped to one of the
    regions below; origins that are not listed (and missing values) become
    ``"Other"``.

    Args:
        frames: Iterable of DataFrames to modify. Defaults to the notebook's
            global ``train`` and ``test`` frames.
    """
    # NOTE(review): a few multi-region blends (e.g. 'Ecuador, Mad., PNG',
    # 'Venezuela/ Ghana') are assigned to the region of their first country.
    region_mapping = {
        'South America': [
            'Venezuela', 'Ecuador', 'Peru', 'Brazil', 'Bolivia',
            'Colombia', 'Trinidad', 'Trinidad, Tobago', 'Trinidad-Tobago',
            'Venezuela, Trinidad', 'Ecuador, Mad., PNG',
            'Ven, Trinidad, Ecuador', 'Venezuela/ Ghana', 'South America',
            'Peru, Ecuador', 'Ecuador, Costa Rica', 'Peru(SMartin,Pangoa,nacional)'
        ],
        'Central America': [
            'Nicaragua', 'Belize', 'Guatemala', 'Honduras', 'Panama',
            'Costa Rica', 'Mexico', 'Domincan Republic', 'Dominican Republic',
            'Haiti', 'Cuba', 'Jamaica', 'Grenada', 'St. Lucia', 'Martinique',
            'Carribean', 'Africa, Carribean, C. Am.'
        ],
        # 'Ghana & Madagascar' belongs here (both are African countries).
        # 'Burma' is listed only under Asia/Pacific: a duplicate entry here
        # was silently overridden by the later region anyway.
        'Africa': [
            'Madagascar', 'Ghana', 'Tanzania', 'Congo', 'Uganda',
            'Sao Tome', 'Sao Tome & Principe', 'West Africa', 'Togo',
            'Cameroon', 'Ivory Coast', 'Ghana & Madagascar'
        ],
        'Asia/Pacific': [
            'Vietnam', 'Indonesia', 'Philippines', 'India', 'Papua New Guinea',
            'Fiji', 'Vanuatu', 'Solomon Islands', 'Australia', 'Burma'
        ]
    }

    # Build the reverse mapping: origin -> region.
    origin_to_region = {
        origin: region
        for region, origins in region_mapping.items()
        for origin in origins
    }

    if frames is None:
        frames = (train, test)

    for frame in frames:
        regions = frame['Broad Bean Origin'].map(origin_to_region)
        frame['Broad Bean Origin'] = regions.fillna("Other")

# Apply the origin-to-region grouping to the train and test frames.
mapping_Broad_Bean_Origin()

In [None]:
def code_origin(text):
    """Return the part of ``text`` before its first comma, whitespace-stripped.

    Missing values (as detected by ``pd.isna``) are returned unchanged; any
    other value is converted with ``str`` first.
    """
    if pd.isna(text):
        return text
    head, _, _ = str(text).partition(',')
    return head.strip()

# Shorten every 'Specific Bean Origin' value in both splits via code_origin.
train['Specific Bean Origin'] = train['Specific Bean Origin'].apply(code_origin)
test['Specific Bean Origin'] = test['Specific Bean Origin'].apply(code_origin)

In [None]:
# drop_duplicates() returns a new frame and leaves the original untouched,
# so the result must be assigned back for the removal to take effect.
train = train.drop_duplicates()
train

In [None]:
# List the distinct company names, ordered by how often they occur.
train['Company'].value_counts().index.tolist()

In [None]:
def convert_procent(procent_text):
    """Strip the percent sign from a cocoa-percentage string.

    Only a trailing ``'%'`` (and surrounding whitespace) is removed, so a
    value without the sign, such as ``'70'``, keeps all of its digits.

    Args:
        procent_text: Text such as ``'72%'``. Non-string values (numbers,
            NaN) are returned unchanged so a later ``astype('float64')``
            can handle them.

    Returns:
        The numeric part as a string, or the input itself if it is not a
        string.
    """
    if not isinstance(procent_text, str):
        return procent_text
    return procent_text.strip().rstrip('%').strip()

# Convert the 'Cocoa Percent' strings (e.g. '72%') to float64 in both splits.
train['Cocoa Percent'] = train['Cocoa Percent'].apply(convert_procent).astype('float64')
test['Cocoa Percent'] = test['Cocoa Percent'].apply(convert_procent).astype('float64')


In [None]:
# Display the training frame after preprocessing.
train

# EDA:

In [None]:
# Names used by the plotting cells. These are plain aliases (no copy is made),
# so they refer to the same objects as `train` and `test`.
vis_train_data = train
vis_test_data = test

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Split the training columns into two lists: object-dtype columns go to
# `cat_feature`, every other column goes to `value_feature`.
cat_feature, value_feature = [], []
for column in train.columns:
    target_list = cat_feature if train[column].dtype == "object" else value_feature
    target_list.append(column)

In [None]:
# Compare category frequencies between the train and test splits.
# Both splits are stacked into one frame and drawn with hue='Dataset':
# two separate countplot calls on the same axes each pick their own
# category order, so bars would overlap and tick labels could be wrong.
for feature in cat_feature:
    # These two columns are skipped (they are left out of the plots).
    if feature in ('Company', 'Specific Bean Origin'):
        continue

    combined_counts = pd.concat([
        vis_train_data[[feature]].assign(Dataset='Train'),
        vis_test_data[[feature]].assign(Dataset='Test'),
    ])
    sns.countplot(data=combined_counts, x=feature, hue='Dataset')
    plt.xticks(rotation=60)
    plt.title(f"Count in data set {feature}")
    plt.show()

In [None]:
# Annotated correlation heatmap of the non-object training columns.
sns.heatmap(vis_train_data[value_feature].corr(), cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Box plots comparing each non-object feature between the train and test splits.
for feature in value_feature:
    if feature == 'Rating':
        # Rating is left out of this comparison.
        continue

    plt.figure(figsize=(8, 5))

    # Stack both splits, tagging every row with the split it came from
    train_temp = vis_train_data[[feature]].assign(Dataset='Train')
    test_temp = vis_test_data[[feature]].assign(Dataset='Test')
    combined = pd.concat([train_temp, test_temp])

    # One box per split for the current feature
    sns.boxplot(data=combined, x='Dataset', y=feature)
    plt.title(f'Distribution comparison for {feature}')
    plt.show()

In [None]:
# Histogram (with a KDE curve) of the target variable in the training split.
sns.histplot(data=vis_train_data, x='Rating', kde=True)
plt.show()

# Configure the prediction model

In [None]:
# Separate the features (every column except 'Rating') from the target.
X_train = train.drop(['Rating'], axis=1)
Y_train = train['Rating']

In [None]:
# X_test is an alias of `test` (no copy), so in-place edits to one affect the other.
X_test = test

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor
import optuna


In [None]:
# Min-max scale 'REF' and 'Cocoa Percent'. The scaler is fitted on the
# training features only and then reused, unchanged, for the test features.
scaler = MinMaxScaler()

X_train[['REF', 'Cocoa Percent']] = scaler.fit_transform(X_train[['REF', 'Cocoa Percent']])
X_test[['REF', 'Cocoa Percent']] = scaler.transform(X_test[['REF', 'Cocoa Percent']])

In [None]:
# Display the scaled training features.
X_train

In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated R^2 of a CatBoost regressor.

    The tree depth is fixed at 4; learning rate, number of estimators, L2 leaf
    regularisation, random strength and bagging temperature are sampled from
    ``trial``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the three per-fold R^2 scores on ``X_train`` / ``Y_train``.
    """
    # The dict literal is evaluated top to bottom, so the parameters are
    # requested from the trial in this order.
    params = {
        "max_depth": 4,
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1200, 2000),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
    }

    # cat_features holds positional column indices into X_train.
    regressor = CatBoostRegressor(
        cat_features=[0, 1, 3, 5, 6, 7],
        silent=True,
        **params,
    )

    fold_scores = cross_val_score(
        regressor, X_train, Y_train, cv=3, scoring="r2", n_jobs=-1)
    return fold_scores.mean()

# Run 100 Optuna trials, maximising the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
  # Final model with fixed hyperparameters (presumably the best trial of the
  # Optuna study in the previous cell - TODO confirm).
  # cat_features holds positional column indices into X_train.
  # NOTE(review): this statement starts with leading indentation; that is an
  # IndentationError if the notebook is exported to a plain .py script.
  model = CatBoostRegressor(
        cat_features=[0, 1, 3, 5, 6, 7],
        max_depth=4,
        learning_rate=0.02269448682394652,
        n_estimators=1902,
        l2_leaf_reg=4.692496362668251,
        random_strength=9.999005359982192,
        bagging_temperature=0.8045370382834742,
        silent=True
    )

In [None]:
# Train the final model on the full training set.
model.fit(X_train, Y_train)

<catboost.core.CatBoostRegressor at 0x7c936e4887d0>

In [None]:
# Predict ratings for the test features.
pred = model.predict(X_test)

In [None]:
# Build the submission in its own frame. Writing 'id' and 'Rating' into
# X_test would add non-feature columns to the frame the model predicts on
# (and to `test`, which X_test aliases), so re-running the prediction cell
# would feed the model extra columns.
submission = pd.DataFrame({
    'id': np.arange(len(X_test)),
    'Rating': pred,
})

submission.to_csv("submission.csv", index=False)