In [3]:
% load_ext autoreload
% autoreload 2

# Import libs

In [11]:
import pandas as pd
import numpy as np
from catboost import cv, Pool
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Reading data

In [5]:
# Load the train and test splits, then report each frame's (rows, columns) shape.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test-with-targets.csv')
)
print(
    f'Number of rows and columns in the train data set: {train_data.shape}',
    f'Number of rows and columns in the test data set: {test_data.shape}',
    sep='\n',
)

Number of rows and columns in the train data set: (5512, 3)
Number of rows and columns in the test data set: (788, 3)


In [12]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,X,"In 1979, a group of young filmmakers set out t...",Horror
3,Piggy,With the summer sun beating down on her rural ...,Horror
4,Deadstream,After a public controversy left him disgraced ...,Horror


In [13]:
# Summarise the name/description columns separately for each target genre.
(
    train_data
    .groupby(by='target')
    .describe()
)

Unnamed: 0_level_0,movie_name,movie_name,movie_name,movie_name,movie_description,movie_description,movie_description,movie_description
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Action,704,697,Assault on Precinct 13,2,704,703,"Wisecracking mercenary Deadpool meets Russell,...",2
Comedy,1269,1264,Father of the Bride,2,1269,1269,To show support for his neighborhood friends s...,1
Drama,1202,1196,Little Women,2,1202,1202,Vince used to be in the hottest boy band aroun...,1
Horror,974,942,Carrie,3,974,974,A new take on Clive Barker's 1987 horror class...,1
Kids,552,539,Pinocchio,3,552,552,It's been 29 years since someone lit the Black...,1
Mystery,795,770,Alone,3,795,794,A mysterious figure stalks a pop star when she...,2


In [15]:
# Show the rows whose description is missing.
train_data.loc[train_data["movie_description"].isnull()]

Unnamed: 0,movie_name,movie_description,target


# Getting rid of NaN data

In [9]:
# Keep only the rows that have a description, then display whatever rows are
# still missing one as a sanity check (an empty frame means the filter worked).
train_data = train_data.dropna(subset=["movie_description"])
train_data.loc[train_data["movie_description"].isnull()]

Unnamed: 0,movie_name,movie_description,target


In [16]:
# Learn the genre -> integer mapping from the training targets only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(train_data['target'])

y_train = le.transform(train_data['target'])
y_test = le.transform(test_data['target'])

# Cross-validation, returning the model trained on each fold

In [17]:
def tune_catboost(x_train, y_train_, *, params=None, fold_count=3, plot=True, **kwargs):
    """Cross-validate a CatBoost multi-class model and return the CV output.

    Args:
        x_train: Feature data handed to ``Pool`` as ``data``.
        y_train_: Labels handed to ``Pool`` as ``label``.
        params: Optional dict of CatBoost training parameters. Its entries
            override the defaults below key by key; omitted keys keep their
            default value.
        fold_count: Number of cross-validation folds passed to ``cv``.
        plot: Forwarded to ``cv`` as its ``plot`` flag.
        **kwargs: Extra keyword arguments forwarded unchanged to ``Pool``
            (for example ``text_features``).

    Returns:
        The value returned by ``cv(..., return_models=True)``; callers in this
        notebook unpack it as ``(scores, models)``.
    """
    # Defaults are rebuilt on every call so caller overrides never leak
    # into later invocations.
    cv_params = {
        'iterations': 200,
        'depth': 2,
        'loss_function': 'MultiClass',
        'verbose': False,
        'random_seed': 42
    }
    if params:
        cv_params.update(params)

    cv_data = Pool(data=x_train, label=y_train_, **kwargs)
    return cv(
        cv_data,
        cv_params,
        fold_count=fold_count,
        plot=plot,
        return_models=True
    )

In [19]:
# The description text is the only model input for both splits.
X_train, X_test = (split['movie_description'] for split in (train_data, test_data))
X_train

0       A new take on Clive Barker's 1987 horror class...
1       It's been 29 years since someone lit the Black...
2       In 1979, a group of young filmmakers set out t...
3       With the summer sun beating down on her rural ...
4       After a public controversy left him disgraced ...
                              ...                        
5507    Northern Ireland, 1960: Father Thomas Riley an...
5508    Traveling in Ireland, New York reporter Stephe...
5509    Sebastian Cabot narrates the adventures of bum...
5510    Unspoken class barriers that exist within a ho...
5511    Acting couple Joseph (Jack Benny) and Maria Tu...
Name: movie_description, Length: 5496, dtype: object

In [21]:
# Cross-validate on the description text; the result is unpacked as the CV
# scores and the fitted models.
scores, models = tune_catboost(
    X_train,
    y_train,
    text_features=[0]  # X_train is a single-column Series, so the text feature is at index 0
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 1.204157639
bestIteration = 399

Training on fold [1/3]

bestTest = 1.198274359
bestIteration = 399

Training on fold [2/3]

bestTest = 1.233432925
bestIteration = 398



In [22]:
# NOTE(review): taking the last fold's model is an arbitrary choice, not a
# "best model" selection; presumably each CV model was fitted on only part of
# the training data — consider refitting on the full training set. TODO confirm.
model = models[-1] # select model from last fold

### Pool is an internal data structure that is used by CatBoost

In [24]:
# Wrap the test descriptions and their encoded labels in a Pool for prediction;
# as in training, the only column (index 0) is declared a text feature.
cv_dataset_test = Pool(data=X_test, label=y_test, text_features=[0])

In [25]:
# Build the per-class precision/recall/F1 report on the test split.
# Raw per-class scores are requested explicitly so the argmax stays valid even
# if `model` is later replaced by an estimator whose predict() defaults to
# returning class labels (argmax over a single label column is always 0).
res2 = classification_report(
    y_test,
    np.argmax(
        model.predict(cv_dataset_test, prediction_type='RawFormulaVal'),
        axis=1,  # one score column per class; assumes column order matches the encoded labels
    ),
    target_names=le.classes_
)

In [26]:
# print() so the report's line breaks render as a table instead of an escaped string repr.
print(res2)

              precision    recall  f1-score   support

      Action       0.59      0.60      0.60       101
      Comedy       0.51      0.66      0.58       182
       Drama       0.58      0.65      0.61       172
      Horror       0.64      0.67      0.66       140
        Kids       0.73      0.47      0.57        79
     Mystery       0.66      0.33      0.44       114

    accuracy                           0.59       788
   macro avg       0.62      0.56      0.58       788
weighted avg       0.60      0.59      0.58       788

