In [1]:
%load_ext autoreload
%autoreload 2


# Import Libraries

In [2]:
import os

import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Read Data

In [3]:
path = "../"
train_data = pd.read_csv(os.path.join(path, "train.csv"))
test_data = pd.read_csv(os.path.join(path, "test.csv"))
print(f"Number of rows and columns in the train data set: {train_data.shape}")
print(f"Number of rows and columns in the test data set: {test_data.shape}")
train_data.head()


Number of rows and columns in the train data set: (5495, 4)
Number of rows and columns in the test data set: (788, 3)


Unnamed: 0,id,movie_name,movie_description,target
0,3525e31d,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,051f6309,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,12a9bfcf,X,"In 1979, a group of young filmmakers set out t...",Horror
3,e5373c77,Piggy,With the summer sun beating down on her rural ...,Horror
4,473cdb82,Deadstream,After a public controversy left him disgraced ...,Horror


In [4]:
train_data.groupby("target").describe()


Unnamed: 0_level_0,id,id,id,id,movie_name,movie_name,movie_name,movie_name,movie_description,movie_description,movie_description,movie_description
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Action,704,704,68cfcb79,1,704,697,Assault on Precinct 13,2,704,703,"Wisecracking mercenary Deadpool meets Russell,...",2
Comedy,1269,1269,f00aac8c,1,1269,1264,Father of the Bride,2,1269,1269,To show support for his neighborhood friends s...,1
Drama,1202,1202,6569d7af,1,1202,1196,Little Women,2,1202,1202,Vince used to be in the hottest boy band aroun...,1
Horror,974,974,3525e31d,1,974,942,Carrie,3,974,974,A new take on Clive Barker's 1987 horror class...,1
Kids,552,552,051f6309,1,552,539,Pinocchio,3,552,552,It's been 29 years since someone lit the Black...,1
Mystery,794,794,e5da8af4,1,794,769,Alone,3,794,793,A mysterious figure stalks a pop star when she...,2


# Encoding Categorial Target

In [5]:
le = LabelEncoder()  # for label encoding!

X_train = train_data["movie_description"]
y_train = le.fit_transform(train_data["target"])

X_test = test_data["movie_description"]


# Preparing the data and creating Catboost model

In [6]:
model = CatBoostClassifier(
    iterations=100,
    depth=5,
    random_seed=42
)

model.fit(
    X_train,
    y_train,
    text_features=[0],
    verbose=True
)


Learning rate set to 0.5
0:	learn: 1.4732582	total: 163ms	remaining: 16.1s
1:	learn: 1.3652846	total: 292ms	remaining: 14.3s
2:	learn: 1.3190536	total: 418ms	remaining: 13.5s
3:	learn: 1.3058702	total: 541ms	remaining: 13s
4:	learn: 1.2852828	total: 666ms	remaining: 12.7s
5:	learn: 1.2715606	total: 790ms	remaining: 12.4s
6:	learn: 1.2644465	total: 916ms	remaining: 12.2s
7:	learn: 1.2585924	total: 1.04s	remaining: 11.9s
8:	learn: 1.2497019	total: 1.17s	remaining: 11.8s
9:	learn: 1.2453763	total: 1.3s	remaining: 11.7s
10:	learn: 1.2412351	total: 1.43s	remaining: 11.6s
11:	learn: 1.2395000	total: 1.55s	remaining: 11.4s
12:	learn: 1.2360224	total: 1.67s	remaining: 11.2s
13:	learn: 1.2328154	total: 1.79s	remaining: 11s
14:	learn: 1.2276674	total: 1.92s	remaining: 10.9s
15:	learn: 1.2213552	total: 2.04s	remaining: 10.7s
16:	learn: 1.2158031	total: 2.16s	remaining: 10.6s
17:	learn: 1.2118280	total: 2.29s	remaining: 10.4s
18:	learn: 1.2019060	total: 2.41s	remaining: 10.3s
19:	learn: 1.1964487	

<catboost.core.CatBoostClassifier at 0x7f968797e2d0>

# Predict

In [7]:
# Preparing data in Pool format
dataset_test = Pool(
    data=X_test,
    text_features=[0]
)
predict_classes = model.predict(dataset_test)
predictions = le.inverse_transform(predict_classes)


  y = column_or_1d(y, warn=True)


# Create submission

In [8]:
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))
sample_submission.head()


Unnamed: 0,id,target
0,d996f823,Kids
1,1cf01f9c,Kids
2,856ea05c,Kids
3,c97899ee,Kids
4,73f0740f,Kids


In [9]:
sample_submission["target"] = predictions
sample_submission.head()


Unnamed: 0,id,target
0,d996f823,Drama
1,1cf01f9c,Drama
2,856ea05c,Mystery
3,c97899ee,Horror
4,73f0740f,Comedy


In [10]:
sample_submission.to_csv("submission.csv", index=False)
