## Import Libraries

In [2]:
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## Read Dataset

In [6]:
path = "../"
train = pd.read_csv(os.path.join(path, "train.csv"))
test = pd.read_csv(os.path.join(path, "test.csv"))
print("Number of rows and columns in the train data set:", train.shape)
print("Number of rows and columns in the test data set:", test.shape)
train.head()

Number of rows and columns in the train data set: (5512, 3)
Number of rows and columns in the test data set: (788, 2)


Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,X,"In 1979, a group of young filmmakers set out t...",Horror
3,Piggy,With the summer sun beating down on her rural ...,Horror
4,Deadstream,After a public controversy left him disgraced ...,Horror


## Delete NaN values

In [7]:
train[train["movie_description"].isnull()]

Unnamed: 0,movie_name,movie_description,target
217,Vivarium,,Mystery
755,Little Monsters,,Horror
2110,The Witch: Part 1 - The Subversion,,Action
2615,Hasan Minhaj: Homecoming King,,Comedy
2692,Babyteeth,,Comedy
3077,Ala Vaikunthapurramuloo,,Action
3364,Sorry We Missed You,,Drama
3746,Belzebuth,,Horror
3773,Sound of Violence,,Mystery
3800,Les misérables,,Drama


In [8]:
train = train[train["movie_description"].notna()]
train[train["movie_description"].isnull()]

Unnamed: 0,movie_name,movie_description,target


## Text Preprocessing

In [9]:
# Label Encoding
le = LabelEncoder()
le.fit(train["target"])
train["target"] = le.transform(train["target"])
train.head()

Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,3
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,4
2,X,"In 1979, a group of young filmmakers set out t...",3
3,Piggy,With the summer sun beating down on her rural ...,3
4,Deadstream,After a public controversy left him disgraced ...,3


In [10]:
# init tfidf
vect_word = TfidfVectorizer(
    max_features=300, 
    lowercase=True, 
    analyzer="word", 
    stop_words= "english",
    ngram_range=(1,3),
    dtype=np.float32
    )

In [11]:
# train tfidf
X_train = vect_word.fit_transform(train["movie_description"])
# map tfidf on test
X_test = vect_word.transform(test["movie_description"])

In [12]:
y_train = train["target"]

## Model

In [13]:
# Init logreg model
logreg = LogisticRegression(
    C=2,
    random_state = 42,
    class_weight = "balanced"
    )

In [14]:
# train logreg
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# predict probabilities
preds_proba = logreg.predict_proba(X_test)

In [16]:
# get classes
preds = np.argmax(preds_proba, axis=1)

In [17]:
pred_labels = le.inverse_transform(preds)

## Create submission

In [18]:
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))
sample_submission.head()

Unnamed: 0,target
0,Kids
1,Kids
2,Kids
3,Kids
4,Kids


In [15]:
sample_submission["target"] = pred_labels
sample_submission.head()

Unnamed: 0,target
0,Horror
1,Comedy
2,Comedy
3,Kids
4,Mystery


In [19]:
sample_submission.to_csv(os.path.join(path, "submission.csv"), index=False)