In [59]:
% load_ext autoreload
% autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, cv, Pool
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [75]:
# Read the training split and the test split (the test file ships with its targets).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../train.csv', '../test-with-targets.csv')
)
# Report the (rows, columns) shape of each split as a sanity check on the load.
print(f'Number of rows and columns in the train data set: {train_data.shape}')
print(f'Number of rows and columns in the test data set: {test_data.shape}')

Number of rows and columns in the train data set: (5512, 3)
Number of rows and columns in the test data set: (788, 3)


In [76]:
# Encoder used further down to turn the string values of 'target' into integer class ids.
le = LabelEncoder()

In [78]:
# Peek at the first rows to check the columns: movie_name, movie_description, target.
train_data.head()

Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,X,"In 1979, a group of young filmmakers set out t...",Horror
3,Piggy,With the summer sun beating down on her rural ...,Horror
4,Deadstream,After a public controversy left him disgraced ...,Horror


In [79]:
# Per-target summary of the text columns: non-null count, number of distinct values,
# most frequent value and how often it occurs. Also shows how many rows each class has.
train_data.groupby('target').describe()

Unnamed: 0_level_0,movie_name,movie_name,movie_name,movie_name,movie_description,movie_description,movie_description,movie_description
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Action,707,700,Assault on Precinct 13,2,704,703,"Wisecracking mercenary Deadpool meets Russell,...",2
Comedy,1273,1268,Father of the Bride,2,1269,1269,To show support for his neighborhood friends s...,1
Drama,1204,1198,Little Women,2,1202,1202,Vince used to be in the hottest boy band aroun...,1
Horror,977,945,Carrie,3,974,974,A new take on Clive Barker's 1987 horror class...,1
Kids,552,539,Pinocchio,3,552,552,It's been 29 years since someone lit the Black...,1
Mystery,799,774,Alone,3,795,794,A mysterious figure stalks a pop star when she...,2


In [80]:
# List the training rows whose description is missing.
train_data[train_data["movie_description"].isna()]


Unnamed: 0,movie_name,movie_description,target
217,Vivarium,,Mystery
755,Little Monsters,,Horror
2110,The Witch: Part 1 - The Subversion,,Action
2615,Hasan Minhaj: Homecoming King,,Comedy
2692,Babyteeth,,Comedy
3077,Ala Vaikunthapurramuloo,,Action
3364,Sorry We Missed You,,Drama
3746,Belzebuth,,Horror
3773,Sound of Violence,,Mystery
3800,Les misérables,,Drama


In [86]:
# Discard training rows that have no description; the description is the model's only input.
train_data = train_data.dropna(subset=["movie_description"])
# Re-apply the missing-description filter: an empty frame confirms nothing is left.
train_data.loc[train_data["movie_description"].isna()]

Unnamed: 0,movie_name,movie_description,target


In [87]:
# Learn the label -> integer mapping from the training targets only, then reuse that same
# mapping for the test targets so class ids agree between the two splits.
# (le.transform raises if the test split contains a label that never appeared in training.)
y_train = le.fit_transform(train_data['target'])
y_test = le.transform(test_data['target'])

In [107]:
def tune_catboost(
    x_train,
    y_train_,
    *,
    iterations=400,
    depth=3,
    fold_count=3,
    random_seed=42,
    plot=True,
    **kwargs
):
    """Cross-validate a CatBoost ``MultiClass`` model on the given data.

    Parameters
    ----------
    x_train
        Features, passed to ``Pool`` as ``data``.
    y_train_
        Labels, passed to ``Pool`` as ``label``.
    iterations : int, keyword-only, default 400
        Number of boosting iterations per fold.
    depth : int, keyword-only, default 3
        Tree depth.
    fold_count : int, keyword-only, default 3
        Number of cross-validation folds.
    random_seed : int, keyword-only, default 42
        Seed forwarded in the training parameters.
    plot : bool, keyword-only, default True
        Whether ``cv`` should render its interactive metric plot.
    **kwargs
        Forwarded unchanged to ``Pool`` (for example ``text_features=[0]``).

    Returns
    -------
    The value returned by ``cv(..., return_models=True)``; callers unpack it
    as ``(scores, models)``.
    """
    cv_data = Pool(
        data=x_train,
        label=y_train_,
        **kwargs
    )
    # Defaults reproduce the previously hard-coded configuration exactly.
    params = {
        'iterations': iterations,
        'depth': depth,
        'loss_function': 'MultiClass',
        'verbose': False,
        'random_seed': random_seed
    }
    return cv(
        cv_data,
        params,
        fold_count=fold_count,
        plot=plot,
        return_models=True
    )

In [108]:
# The description text is the only feature used, for both splits.
X_train, X_test = train_data['movie_description'], test_data['movie_description']

In [109]:
# Cross-validate on the training descriptions; text_features=[0] is forwarded to Pool
# and marks the single input column (index 0) as text.
scores, models = tune_catboost(X_train, y_train, text_features=[0])

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 1.204157639
bestIteration = 399

Training on fold [1/3]

bestTest = 1.198274359
bestIteration = 399

Training on fold [2/3]

bestTest = 1.233432925
bestIteration = 398



In [103]:
# Use the model from the last cross-validation fold for the test-set evaluation.
# NOTE(review): the choice of fold is arbitrary, and this model was presumably fitted on
# only that fold's training portion; consider refitting on all of train_data. TODO confirm.
model = models[-1]

In [104]:
# Wrap the test descriptions and encoded labels in a Pool, flagging column 0 as text
# just as was done for the training data.
cv_dataset_test = Pool(X_test, label=y_test, text_features=[0])

In [105]:
# Score every test row, then take the highest-scoring column as the predicted class id.
test_scores = model.predict(cv_dataset_test)
predicted_class_ids = np.argmax(test_scores, axis=1)
# Per-class precision/recall/F1, labelled with the original target names from the encoder.
res2 = classification_report(y_test, predicted_class_ids, target_names=le.classes_)

In [106]:
# print() so the report's line breaks render as a table rather than as an escaped string repr.
print(res2)

              precision    recall  f1-score   support

      Action       0.59      0.59      0.59       101
      Comedy       0.50      0.68      0.58       182
       Drama       0.56      0.65      0.60       172
      Horror       0.64      0.69      0.66       140
        Kids       0.73      0.46      0.56        79
     Mystery       0.78      0.27      0.40       114

    accuracy                           0.58       788
   macro avg       0.63      0.56      0.57       788
weighted avg       0.61      0.58      0.57       788

