# Video Clip Search Engine

- Requires
  - nltk
    - stopwords('english')
    - RegexpTokenizer
  - pip install srt
  - pip install webvtt-py

In [1]:
import glob
from pprint import pprint
import pandas as pd
from collections import defaultdict
import srt
from nltk.tokenize import RegexpTokenizer # punctuation removal and tokenizing subtitle block
from nltk.corpus import stopwords
import string
import webvtt # for captioning in HTML playback

In [2]:
# Path where video and subtitle files are stored
#path = 'D:\\captures\\NPTEL\\'
path = 'D:\\captures\\NPTEL\\ANLP\\'

## Data Gathering

In [3]:
# Recursively search for MP4 and SRT files
file_types = ['*.mp4', '*.srt'] # the tuple of file types
files_grabbed = [glob.glob(path + '**/'+ e, recursive=True) for e in file_types]

In [4]:
list(map(len,files_grabbed))

[62, 28]

In [5]:
# Build a dictionary out of grabbed files
files_dict = defaultdict(list)
for f_type in files_grabbed:
    for f in f_type:
        v = f.split('.')
        files_dict[v[0]].append(v[1])

# Remove entries that do not have either .mp4 or .srt file
remove_list = [key for key in files_dict if(len(files_dict[key]) < len(file_types))]
for k in remove_list:
    del files_dict[k]

In [6]:
def convert_srt_to_vtt(srt_file_name,webvtt=webvtt):
    '''
    Input: Subtile file name which is in SRT file format (.srt)
    Output: Subtitle file in OTT file format (.vtt)
    Requires: webvtt module imported already
    '''
    webvtt = webvtt.from_srt(srt_file_name)
    webvtt.save()
    pass

In [7]:
# HTML5 supports only VTT caption format
# In order to play video with caption, we need VTT file
# So generate WebVTT file from available SRT files
for file_name in files_dict:
    srt_file_name = file_name+'.srt'
    convert_srt_to_vtt(srt_file_name)

In [8]:
# files_dict

In [9]:
# Convert dict to pandas dataframe for easy manipulation
df = pd.DataFrame(columns=['file','mp4', 'srt'])
for index, (key, val) in enumerate(files_dict.items()):
    df.loc[index] = [key.replace(path,'')] + val
    #df.loc[index] = [key] + val

In [10]:
df

Unnamed: 0,file,mp4,srt
0,Week1\mod01lec04-Vector Space models,mp4,srt
1,Week1\mod01lec06-Machine Translation,mp4,srt
2,Week1\mod01lec07-Preprocessing,mp4,srt
3,Week1\mod01lec09-Statistical Properties of Wor...,mp4,srt
4,"Week2\mod02lec15-Co-occurence matrix, n-grams",mp4,srt
5,"Week2\mod02lec16-Collocations, Dense word Vectors",mp4,srt
6,"Week2\mod02lec17-SVD, Dimensionality reduction...",mp4,srt
7,Week3\mod03lec20-Examples for word prediction,mp4,srt
8,Week3\mod03lec21-Introduction to Probability i...,mp4,srt
9,Week3\mod03lec23-The definition of probabilist...,mp4,srt


In [11]:
df.describe()

Unnamed: 0,file,mp4,srt
count,28,28,28
unique,28,1,1
top,Week1\mod01lec04-Vector Space models,mp4,srt
freq,1,28,28


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 0 to 27
Data columns (total 3 columns):
file    28 non-null object
mp4     28 non-null object
srt     28 non-null object
dtypes: object(3)
memory usage: 896.0+ bytes


In [13]:
print(df.loc[0]['file'])

Week1\mod01lec04-Vector Space models


In [14]:
# D:\captures\NPTEL\Cloud Computing\Lecture 11_-mod03lec11

In [15]:
custom_stop_words = ['ah', 'music']
stop_words = set(stopwords.words('english') + custom_stop_words)

def pre_process_srt_block_content(doc):
    '''
    preprocesses the given document
    and returns it as a single string
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(doc)

    #translator = str.maketrans('', '', string.punctuation)
    #doc = doc.translate(translator)
    
    #tokens = doc.split() # split it into words
    tokens = [token.lower() for token in tokens] # normalize
    tokens = [token for token in tokens if token not in stop_words]
    return tokens
    #doc_str = ' '.join(tokens)
    #return doc_str

In [16]:
class SrtDocument:
    '''
    Contains SRT Block information
    start : SRT block start time  in datetime.timedelta format
    end : SRT block start time  in datetime.timedelta format
    content : SRT block as a single string
    '''
    __slots__ = ('start', 'end', 'tokens','content_str')
    
    def __repr__(self):
        return ' '.join (
                         [
                             '{', 
                            'start:', str(self.start),
                            'end:', str(self.end),
                            'content_str:', self.content,
                            '}'
                           ])
    def __str__(self):
        return ' '.join([str(self.start), str(self.end), self.content_str])

In [17]:
def parse_srt_file(srt_file):
    '''
    Input:
    srt_file: Subtile file opened in read-only mode
    
    Actions:
    Parse the SRT file
    converts each SRT block into a single string
    preprocess's the string and return it
    
    Returns:
        Generator providing SRT blocks in SrtDocument format
    '''
    subs = srt.parse(f_srt)
    for subtitle in subs:
        #print(subtitle)
        #print(subtitle.content.split())
        #print(' '.join(subtitle.content.split()))
        doc_content = pre_process_srt_block_content(subtitle.content)
        if(len(doc_content) == 0): # remove empty entries
            continue
        srt_doc = SrtDocument()
        srt_doc.start = subtitle.start
        srt_doc.end = subtitle.end
        srt_doc.tokens = doc_content
        srt_doc.content_str = ' '.join(doc_content)
        yield srt_doc

In [18]:
file_name = 'D:\\captures\\NPTEL\\ANLP\\Week1\mod01lec04-Vector Space models.srt'
with open(file_name) as f_srt:
    my_srt_parser = parse_srt_file(f_srt)
    for srt_block in my_srt_parser:
        print(srt_block)

0:00:14.620000 0:00:24.279000 fourth one really understand
0:00:18.849000 0:00:27.340000 words words
0:00:24.279000 0:00:30.189000 convey right something
0:00:27.340000 0:00:35.800000 need understand order us
0:00:30.189000 0:00:40.750000 go next level right
0:00:35.800000 0:00:43.420000 want find really
0:00:40.750000 0:00:47.739000 certain mathematical operations
0:00:43.420000 0:00:50.129000 text really convert text
0:00:47.739000 0:00:57.030000 vector form use
0:00:50.129000 0:01:00.969000 vector algebra certain vector
0:00:57.030000 0:01:04.030000 algebraic methods find certain inner
0:01:00.969000 0:01:05.800000 meanings find document
0:01:04.030000 0:01:09.909000 related looking
0:01:05.800000 0:01:13.510000 give one idea let suppose
0:01:09.909000 0:01:16.810000 document collection containing
0:01:13.510000 0:01:21.070000 billion words ten
0:01:16.810000 0:01:23.520000 thousand documents search
0:01:21.070000 0:01:26.799000 mechanism find
0:01:23.520000 0:01:29.740000 document 

- Each subtitle entry will be treated as a document
  - Sentences will be splitted and joined as a single sentence for easy document handling
  
- TO DO
  - Some subtitle entries have single words. How to handle that?

# Convert .SRT to .VTT

In [19]:
convert_srt_to_vtt('mod01lec01.srt')
convert_srt_to_vtt('test.srt')

# Play video from given position

In [20]:
from IPython.display import HTML

In [21]:
HTML("""
<video id="vid1" width="840" height="460" controls="">
    <source src="{0}">
    <track label="English" kind="subtitles" src="{1}" srclang="en" default="">
</video>
<script>
    document.getElementById('vid1').currentTime = "{2}"
</script>
""".format('mod01lec01.mp4" type="video/mp4', 'mod01lec01.vtt', 100))

In [22]:
# https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908