In [1]:
%load_ext autoreload
%autoreload 2

# Import libs

In [2]:
import os

import numpy as np
import pandas as pd
from catboost import Pool, cv
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


# Reading data

In [3]:
# Every CSV is resolved against this directory (relative to the working directory).
path = "../"

# Read the three splits in one pass: train, test with targets, and test
# without targets.
train_data, test_data_labeled, test_data = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train.csv", "test-with-targets.csv", "test.csv")
)

print(f"Number of rows and columns in the train data set: {train_data.shape}")
print(f"Number of rows and columns in the test data set: {test_data.shape}")


Number of rows and columns in the train data set: (5495, 4)
Number of rows and columns in the test data set: (788, 3)


In [4]:
# Peek at the first rows to confirm the expected columns were loaded.
train_data.head()

Unnamed: 0,id,movie_name,movie_description,target
0,3525e31d,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,051f6309,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,12a9bfcf,X,"In 1979, a group of young filmmakers set out t...",Horror
3,e5373c77,Piggy,With the summer sun beating down on her rural ...,Horror
4,473cdb82,Deadstream,After a public controversy left him disgraced ...,Horror


In [5]:
# Per-genre summary of every column: row count, number of distinct values,
# most frequent value and how often it occurs.
train_data.groupby("target").describe()

Unnamed: 0_level_0,id,id,id,id,movie_name,movie_name,movie_name,movie_name,movie_description,movie_description,movie_description,movie_description
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Action,704,704,68cfcb79,1,704,697,Assault on Precinct 13,2,704,703,"Wisecracking mercenary Deadpool meets Russell,...",2
Comedy,1269,1269,f00aac8c,1,1269,1264,Father of the Bride,2,1269,1269,To show support for his neighborhood friends s...,1
Drama,1202,1202,6569d7af,1,1202,1196,Little Women,2,1202,1202,Vince used to be in the hottest boy band aroun...,1
Horror,974,974,3525e31d,1,974,942,Carrie,3,974,974,A new take on Clive Barker's 1987 horror class...,1
Kids,552,552,051f6309,1,552,539,Pinocchio,3,552,552,It's been 29 years since someone lit the Black...,1
Mystery,794,794,e5da8af4,1,794,769,Alone,3,794,793,A mysterious figure stalks a pop star when she...,2


In [6]:
# Learn the genre -> integer id mapping from the training labels only, then
# apply that same mapping to both label columns.
le = LabelEncoder().fit(train_data["target"])

y_train = le.transform(train_data["target"])
y_test_labeled = le.transform(test_data_labeled["target"])

# Cross-validation: returns the scores and the model trained on each fold

In [67]:
def tune_catboost(x_train, y_train_, *, params=None, fold_count=3, plot=True, **kwargs):
    """Cross-validate a CatBoost multi-class model and return scores and fold models.

    Parameters
    ----------
    x_train
        Training features, passed to ``Pool`` as ``data``.
    y_train_
        Training labels, passed to ``Pool`` as ``label``.
    params : dict, optional
        Overrides merged on top of the default training parameters
        (200 iterations, depth 2, ``MultiClass`` loss, silent, seed 42).
    fold_count : int, default 3
        Number of cross-validation folds handed to ``cv``.
    plot : bool, default True
        Forwarded to ``cv``; set to ``False`` to skip the interactive
        metric widget (e.g. when running without ipywidgets).
    **kwargs
        Forwarded unchanged to ``Pool`` (e.g. ``text_features``).

    Returns
    -------
    tuple
        The result of ``cv(..., return_models=True)``, which callers unpack
        as ``(scores, models)``.
    """
    cv_data = Pool(data=x_train, label=y_train_, **kwargs)

    # Defaults reproduce the original hard-coded configuration exactly.
    cv_params = {
        'iterations': 200,
        'depth': 2,
        'loss_function': 'MultiClass',
        'verbose': False,
        'random_seed': 42
    }
    if params:
        cv_params.update(params)

    return cv(
        cv_data,
        cv_params,
        fold_count=fold_count,
        plot=plot,
        return_models=True
    )


In [68]:
# The movie description is the only model input; take that column from the
# train frame and from the test frame that carries targets.
X_train, X_test = (
    frame["movie_description"] for frame in (train_data, test_data_labeled)
)
X_train

0       A new take on Clive Barker's 1987 horror class...
1       It's been 29 years since someone lit the Black...
2       In 1979, a group of young filmmakers set out t...
3       With the summer sun beating down on her rural ...
4       After a public controversy left him disgraced ...
                              ...                        
5507    Northern Ireland, 1960: Father Thomas Riley an...
5508    Traveling in Ireland, New York reporter Stephe...
5509    Sebastian Cabot narrates the adventures of bum...
5510    Unspoken class barriers that exist within a ho...
5511    Acting couple Joseph (Jack Benny) and Maria Tu...
Name: movie_description, Length: 5496, dtype: object

In [69]:
# Run cross-validation and unpack its result into the scores and the list of
# per-fold models (tune_catboost calls cv with return_models=True).
scores, models = tune_catboost(
    X_train,
    y_train,
    text_features=[0]  # the single input column (index 0) is the text feature
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 1.271489642
bestIteration = 199

Training on fold [1/3]

bestTest = 1.267645209
bestIteration = 199

Training on fold [2/3]

bestTest = 1.294392309
bestIteration = 199



In [70]:
# Use the model from the last CV fold for evaluation and submission.
# NOTE(review): this is an arbitrary fold model — not necessarily the
# best-scoring fold, and not a model refit on the full training set.
# Confirm this is intended.
model = models[-1]  # select model from last fold

### Pool is an internal data structure that is used by CatBoost

In [71]:
# Wrap the test descriptions and their encoded targets in a Pool.
cv_dataset_test = Pool(X_test, label=y_test_labeled, text_features=[0])

In [72]:
# argmax over axis 1 picks the highest-scoring class id for each row.
test_class_ids = np.argmax(model.predict(cv_dataset_test), axis=1)

# Per-genre precision / recall / F1, labelled with the original genre names.
res2 = classification_report(
    y_test_labeled,
    test_class_ids,
    target_names=le.classes_,
)

In [73]:
# print() renders the report's line breaks and column alignment.
print(res2)

              precision    recall  f1-score   support

      Action       0.58      0.58      0.58       101
      Comedy       0.51      0.65      0.57       182
       Drama       0.56      0.65      0.60       172
      Horror       0.63      0.68      0.66       140
        Kids       0.73      0.47      0.57        79
     Mystery       0.63      0.30      0.40       114

    accuracy                           0.58       788
   macro avg       0.61      0.55      0.56       788
weighted avg       0.59      0.58      0.57       788



In [74]:
# Preparing data in Pool format
cv_dataset_test = Pool(
    data=X_test,
    text_features=[0]
)
predict_scores = model.predict(cv_dataset_test)
predictions = le.inverse_transform(np.argmax(predict_scores, axis=1))

In [75]:
# Load the submission template; its `target` column is replaced with the
# model's predictions further down.
sample_submission_file = os.path.join(path, "sample_submission.csv")
sample_submission = pd.read_csv(sample_submission_file)
sample_submission.head()

Unnamed: 0,target
0,Kids
1,Kids
2,Kids
3,Kids
4,Kids


In [76]:
# Swap the placeholder genres for the predicted ones; rows are matched by
# position, so the prediction order must follow the template's row order.
sample_submission = sample_submission.assign(target=predictions)
sample_submission.head()

Unnamed: 0,target
0,Horror
1,Drama
2,Comedy
3,Horror
4,Comedy


In [52]:
# Write the submission into the same directory as the input CSVs;
# index=False leaves the row index out of the file.
sample_submission.to_csv(os.path.join(path, "submission.csv"), index=False)

In [None]:
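# Accuracy 0.5761 — consistent with the 0.58 accuracy in the classification
# report above (presumably measured on the same 788 labelled test rows;
# confirm where this figure came from).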