In [1]:
from os import path, listdir
import string
import re
import pandas as pd
import codecs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB

In [2]:
genres = ['Action', 'Comedy', 'Crime', 'Horror', 'Musical', 'Romance', 'War', 'Western']
columns = ['File', 'Genre', 'Hearing Impaired Subtitle']
target_col = 'Genre'
subtitle_col = 'Hearing Impaired Subtitle'

train_subtitles_path = "subtitles/TrainSubtitles"
test_subtitles_path = "subtitles/TestSubtitles"

In [3]:
def make_dataframe(path):
    df = pd.DataFrame(columns = columns)
    for genre in genres:
        #get every dataset in genre
        genre_path = "%s/%s" % (path, genre)
        
        for f in listdir(genre_path):
            hearing_descriptions = []
            file_path = "%s/%s" % (genre_path, f)
            
            with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                #finds hearing descriptions
                for line in f:
                    res = re.findall(r'(\[.+\]|\(.+\))', line)
                    if res:
                        res[0] = re.sub(r'<.*?>', '', res[0])
                    hearing_descriptions.extend(list(res))
                #remove brackets
                hearing_descriptions = ' '.join([i[1:-1] for i in hearing_descriptions])
            data = {'File': file_path, 'Genre': genre, 'Hearing Impaired Subtitle': hearing_descriptions}
            df = df.append(data, ignore_index = True)
    return df

In [4]:
train_df = make_dataframe(train_subtitles_path)
test_df = make_dataframe(test_subtitles_path)

In [5]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop = set(stopwords.words('english'))
punc_table = str.maketrans('', '', string.punctuation)

def lowercase(text):
    return [i.lower() for i in text]

def eliminate_stopwords(text):
    return [word for word in text if word not in stopwords.words('english')]

def stem(text):
    return [stemmer.stem(word) for word in text]

def lemmatize(text):
    return [lemmatizer.lemmatize(word) for word in text]

def eliminate_punctuations(text):
    res = []
    stripped = [word.translate(punc_table) for word in text]
    for word in stripped:
        if (word):
            res.append(word)
    return res

def preprocess(text):
    res = word_tokenize(text)
    res = lowercase(res)
    res = eliminate_stopwords(res)
    res = lemmatize(res)
    return " ".join([word for word in res])
    
def preprocess_dataframe(df):
    df[subtitle_col] = df[subtitle_col].map(lambda s: preprocess(s))

In [6]:
preprocess_dataframe(train_df)
preprocess_dataframe(test_df)

In [7]:
#Vectorization
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df[subtitle_col])
y_train = train_df[target_col]
X_test = vectorizer.transform(test_df[subtitle_col])
y_test = test_df[target_col]

print(vectorizer.get_feature_names())



In [8]:
#find optimal alpha value
def get_optimal_alpha():
    alpha_values = [0.1, 0.5, 0.01, 0.05, 0.001, 0.005]
    acc_scores = []
    for a in alpha_values:
        clf = MultinomialNB(alpha=a)
        acc = 0
        clf.fit(X_train, y_train)

        predicted = clf.predict(X_test)
        acc += accuracy_score(y_test, predicted)*100
        acc_scores.append(float(acc/50))
    return alpha_values[acc_scores.index(max(acc_scores))]

optimal_alpha = get_optimal_alpha()

In [9]:
#fitting
nb_classifier = MultinomialNB(alpha=optimal_alpha).fit(X_train, y_train)

In [10]:
pred = nb_classifier.predict(X_test)
print(classification_report(y_test,pred))
print('Accuracy score: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

      Action       0.25      0.23      0.24        88
      Comedy       0.30      0.35      0.33       105
       Crime       0.30      0.31      0.30        75
      Horror       0.58      0.50      0.54        86
     Musical       0.57      0.38      0.46        60
     Romance       0.25      0.36      0.29        98
         War       0.67      0.52      0.58        66
     Western       0.82      0.71      0.76        52

    accuracy                           0.40       630
   macro avg       0.47      0.42      0.44       630
weighted avg       0.43      0.40      0.41       630

Accuracy score:  0.4
