In [47]:
from os import path
from os import listdir
from os import mkdir
import re
import codecs
from itertools import groupby

import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer

# from tokenization import clean_stopword
# from tokenization import stemming

global_genres = ['Action', 'Comedy', 'Crime', 'Horror', 'Musical', 'Romance', 'War', 'Western']


def parse_subtitle(filename):
    # "chunk" our input file, delimited by blank lines
    with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        res = [list(g) for b,g in groupby(f, lambda x: bool(x.strip())) if b]

    subs = []

    for sub in res:
        if len(sub) >= 3: # not strictly necessary, but better safe than sorry
            sub = [x.strip() for x in sub]
            content = sub[2]
            subs.append(content)

    return subs


def remove_impaired(content):

    regex = re.compile(r'(\[.+\]|\(.+\))')
    res = re.sub(regex, "", content)
    res = re.sub(r'<.*?>', '', res)
    return res


def remove_punctuation(text):
    useless_strings = ['♪']
    translations = []
    translations.append(str.maketrans({key: None for key in string.punctuation}))
    translations.append(str.maketrans({key: None for key in useless_strings}))
    for trns in translations:
        text = text.translate(trns)

    return text


def parse(file_path, output_path):
    
    ps = PorterStemmer()
    
    content = parse_subtitle(file_path)

    # remove impaired parts
    content = " ".join([remove_impaired(mov) for mov in content])
    
    # remove stop words and stem
    stop_words = set(stopwords.words("english"))
    content = word_tokenize(content)
    content = [ps.stem(w.lower()) for w in content if w.isalpha() and not w in stop_words]

    # remove punctuation
    content = remove_punctuation(("\n".join(content)))
    
#     print("HMMM: ", content)


    if content:
        with open(output_path, 'w') as f:
            try:
                f.write(content)
            except:
                print("HMM:",output_path)


def preprocess_normal_text(input_folder, output_folder):
    subtitles_path = path.relpath(input_folder)
    output_path = path.relpath(output_folder)
    categories = global_genres

    # get lower bound and put equal amount of catagories in the train set.

    for category in categories:
        input_folder_path = "%s/%s" % (subtitles_path, category)
        output_folder_path = "%s/%s" % (output_path, category)

        print('.',)
        # Create folders
        try:
            if not path.isdir(output_folder_path):
                mkdir(output_folder_path, 0o755)
        except OSError:
            print("Directorty cannot be opened in %s" % output_folder_path)

        for f in listdir(input_folder_path):
            # Parse hearing descriptions in subtitles
            input_subtitle = "%s/%s" % (input_folder_path, f)
            output_subtitle = "%s/%s" % (output_folder_path, f)
            parse(input_subtitle, output_subtitle)


In [None]:
in_path = path.relpath("Subtitles")
out_path = path.relpath("ProcessedNormalText")

preprocess_normal_text(in_path, out_path)

.
HMM: ProcessedNormalText/Action/A View to a Kill (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Alex Cross (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Ghost Rider_ Spirit of Vengeance (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Godzilla (IMPAIRED).srt
HMM: ProcessedNormalText/Action/GoldenEye (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Hamburger Hill (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Hollywood Homicide (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Man of Steel (IMPAIRED).srt
HMM: ProcessedNormalText/Action/No Escape (IMPAIRED).srt
HMM: ProcessedNormalText/Action/On Her Majesty's Secret Service (IMPAIRED).srt
HMM: ProcessedNormalText/Action/R.I.P.D. (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Rush (IMPAIRED).srt
HMM: ProcessedNormalText/Action/So Undercover (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Son of Batman (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Stalingrad (IMPAIRED).srt
HMM: ProcessedNormalText/Action/Starship Troopers_ Invasion (IMPAIRED).

In [77]:
import os
import pandas as pd

out_path = path.relpath("ProcessedNormalText")
output_path = path.relpath(out_path)
categories = global_genres

data = {"filepath": [], "genre": []}

for index, category in enumerate(categories):
        output_folder_path = "%s/%s" % (output_path, category)
        
        try:
            for filename in os.listdir(output_folder_path):
                data["genre"].append(index)
                data["filepath"].append(output_folder_path + '/' +  filename)
        except:
            pass
        
df = pd.DataFrame(data=data)
df

Unnamed: 0,filepath,genre
0,"ProcessedNormalText/Action/10,000 BC (IMPAIRED...",0
1,ProcessedNormalText/Action/16 Blocks (IMPAIRED...,0
2,ProcessedNormalText/Action/1941 (IMPAIRED).srt,0
3,ProcessedNormalText/Action/2 Fast 2 Furious (I...,0
4,ProcessedNormalText/Action/2 Guns (IMPAIRED).srt,0
...,...,...
3847,ProcessedNormalText/Western/_Wynonna Earp_ Lea...,7
3848,ProcessedNormalText/Western/_Wynonna Earp_ Pur...,7
3849,ProcessedNormalText/Western/_Wynonna Earp_ She...,7
3850,ProcessedNormalText/Western/_Wynonna Earp_ The...,7


In [76]:
import pickle
from os import path
from sklearn import svm
from sklearn.metrics import classification_report, precision_recall_fscore_support


class FullTextClassification:
    """
        Impaired classification API
    """
    #1
    def __init__(self, train_path, test_path):
        self.clf = None
        self.vectorizer = None
        self.test_path = path.relpath(test_path)

        self.tune_and_train()

    # 2
    def tune_and_train(self):

        clf = []
        for i in range(len(global_genres)):
            clf.append(svm.SVC(kernel='linear', probability=True))

        for idx, model in enumerate(clf):
            fname = str(global_genres[idx]) + "_model"
            f = open('bin_models_normal/' + fname, 'rb')
            clf[idx] = pickle.load(f)
            f.close()

        self.clf = clf

        f = open('bin_models_normal/vectorizer', 'rb')
        self.vectorizer = pickle.load(f)
        f.close()

    # 3
    def get_f1_scores(self):
        '''
        text, genre = tag_subtitles2(self.test_path)
        text, genre = randomize(text, genre)
        to_be_filtered = ['im', 'oh', 'dont', 'go', 'know', 'yeah', 'come', 'get',
                          'well']  # 'grunt', 'beep', 'grunts', ',', 'groan', 'speak', 'music']
        for i in range(len(text)):
            for f in to_be_filtered:
                text[i] = text[i].replace(f, '')
        # split train and test
        text = self.vectorizer.transform(text)
        text, genre = randomize(text, genre)
        accuracy = 0
        no_pridiction = 0
        predictions = []
        for i, data in enumerate(text):
            # print(test_genre[i])
            highest_prob = 0
            highest_index = -1
            for idx, model in enumerate(self.clf):
                # print(model.predict(data))
                curr = model.predict_proba(data)[0][0]
                if highest_prob < curr:
                    highest_prob = curr
                    highest_index = idx
            predictions.append(global_genres[highest_index])
            if global_genres[highest_index] == genre[i]:
                accuracy += 1
        a,a,f1,b = precision_recall_fscore_support(genre, predictions)
        fdict = dict(zip(global_genres, ["{0:.2f}".format(round(a, 2)) for a in f1]))'''
        # print(fdict)
        # print(classification_report(genre, predictions))
        # print("accuracy = " + str(accuracy * 100.0 / len(genre)))
        # print("no_pridiction = " + str(no_pridiction * 100.0 / len(genre)))
        #fdict = {'Comedy': 0.42, 'War': 0.77, 'Crime': 0.41, 'Musical': 0.73, 'Horror': 0.61, 'Action': 0.46, 'Romance': 0.25, 'Western': 0.87}
        #fdict = {'Comedy': 0.42, 'War': 0.77, 'Crime': 0.41, 'Musical': 0.73, 'Horror': 0.61, 'Action': 0.46, 'Romance': 0.25, 'Western': 0.87}
        #fdict = {'Action': 0.46, 'Western': 0.87, 'Comedy': 0.42, 'Crime': 0.41, 'War': 0.77, 'Romance': 0.25, 'Musical': 0.73, 'Horror': 0.61}
        fdict = {'Action': 0.68, 'Western': 0.95, 'Comedy': 0.70, 'Crime': 0.73, 'War': 0.89, 'Romance': 0.76, 'Musical': 0.87, 'Horror': 0.86}
        return fdict

    # 4
    def predict(self, filepath):
        print(filepath)
        paths = filepath.split('\\')
        print(paths)
        newpath = 'TestProcessed/{}/{}'.format(paths[-2], paths[-1])
        with codecs.open(newpath, 'r', encoding='utf-8', errors='ignore') as f:
            # finds hearing descriptions
            text = ' '.join(f.read().split('\n'))
        bow_tf = self.vectorizer.transform([text])

        highest_index = -1
        highest_prob = 0

        probNidx = []

        for idx, model in enumerate(self.clf):
            print(model.predict(data))
            curr = model.predict_proba(bow_tf)[0][0]
            probNidx.append((curr, idx))
            if highest_prob < curr:
                highest_prob = curr
                highest_index = idx

        probNidx = sorted(probNidx, key=lambda val: val[0])
        best3 = [(p[0],global_genres[p[1]]) for p in probNidx][-3:]
        best3.reverse()

        #return best3 #best3[0][1]
        return best3[0][1]

In [74]:
print('init:')
model = FullTextClassification(None, 'TestProcessed')
print('train,vectorize:')
model.tune_and_train()
print('get_f1')
print(model.get_f1_scores())
print('predict')
print(model.predict(path.relpath('TestProcessed/Western/Blindman (IMPAIRED).srt')))

init:




train,vectorize:
get_f1
{'Action': 0.68, 'Western': 0.95, 'Comedy': 0.7, 'Crime': 0.73, 'War': 0.89, 'Romance': 0.76, 'Musical': 0.87, 'Horror': 0.86}
predict
TestProcessed\Western\Blindman (IMPAIRED).srt
['TestProcessed', 'Western', 'Blindman (IMPAIRED).srt']


AttributeError: 'SVC' object has no attribute 'break_ties'