## Subtitle analysis

In [1]:
import os
import pysrt
import re
import time as tm
from collections import Counter

In [2]:
def sec2HMS(seconds):
    """Format a number of seconds as a zero-padded ``HH:MM:SS`` string.

    The value is broken down with ``tm.gmtime``, so only the time-of-day
    fields are rendered and any fractional part of ``seconds`` is ignored.
    """
    broken_down = tm.gmtime(seconds)
    return tm.strftime('%H:%M:%S', broken_down)

def HMS2sec(time_str):
    """Convert an ``H:M:S`` string into a whole number of seconds.

    The string must consist of exactly three colon-separated integer fields;
    anything else raises ``ValueError``.
    """
    hours, minutes, secs = (int(field) for field in time_str.split(':'))
    return hours * 3600 + minutes * 60 + secs

In [3]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content."""

    def __init__(self):
        # Run the base initialiser so that all parser state HTMLParser relies
        # on is created (it also calls reset()), instead of depending on
        # reset() alone to set everything up.
        super().__init__(convert_charrefs=True)
        # Kept for backward compatibility; nothing in this class reads it.
        self.strict = False
        # Text fragments in document order.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, concatenated into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with every tag removed and character references decoded.

    Parameters:
        html: the markup string to clean.

    Returns:
        The text content of ``html`` as a single string.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that contains an unterminated '&'
    # (e.g. "AT&T") in case more input follows; close() flushes it so the end
    # of the input is not silently lost.
    s.close()
    return s.get_data()

In [6]:
# Caption file under analysis; the path is relative to the working directory.
srt_path = '../data/2006-01-02_0000_US_00001057_V11_M2_VHS10_H4_JA.srt'
subs = pysrt.open(srt_path)

In [7]:
# Join the text of every cue into one transcript string.
full_srt = ' '.join(x.text for x in subs)
# Remove markup before touching punctuation: deleting ';' first would strip
# the terminator from character references (e.g. "&amp;") before they are
# decoded.
full_srt = strip_tags(full_srt)
# Raw string with a character class: same characters removed as before
# (! ; : . , ?), without the invalid "\." / "\?" string escapes.
full_srt = re.sub(r'[!;:.,?]', '', full_srt)

#### All subtitles near the boundaries

In [9]:
# Boundary positions, compared below against cue start times in seconds.
boundaries = [
    38,
    7241,
    10842,
    21593,
]

In [10]:
# Print every cue that starts within 300 seconds of one of the boundaries,
# followed by its start time in whole seconds.
for cue in subs:
    hours, minutes, seconds = list(cue.start)[:3]
    time = seconds + 60 * minutes + 3600 * hours
    for boundary in boundaries:
        if abs(boundary - time) > 300:
            continue
        print(cue.text)
        print(time)
        print()

    --<font color="#ffff00"> Captions by VITAC</font> --
        <font color="#00ffff"> www.vitac.com</font>
37

>>> A GREAT 2006 TO YOU IN
NO, CITY.
42

WE'LL GO AHEAD AND START THE
YEAR, FIRST MONDAY AND, BOY, IS
46

WEATHER E WORD TODAY FOR MU
OF TH U.S.
50

THE O MOST DESTRUCTIVE FORMS
OF WEATHER ARE OUT THERE IN
52

POLAR OPPOSITES.
WE HAVE FIRE AND RAIN.
56

FIRES SCORCHING THE SOUTHWEST
AND ENTI TOWNS HAVE BEEN WIPED
59

OUT.
WE HAVE LIVE TEAM COVERAGE AND
62

RAIN WH IT, FOODING AND MUD
SLIDES DRENCHING CALORNIA.
64

THE NORTHERN PART OF THE STATE
HEADED FOR MUCH OF THE WEEKEND
68

AND NOW SOUTHERN CALIFORNIA IS
GETTING SOAKED.
70

VEEACOVERAGE OF THE SOGGY
CONDITIONS AS WELL AHEAD ON CNN
73

"LIVE TODAY," FIRST A QUICK
CHECK OF OTHER STORIES HAPPENING
77

NOW IN THE NEWS.
>>> U.N. INVESTIGATORS SAY THEY
80

WANT TO INTERVIEW A SERIOUS
PRESIDENT IN LAST YEAR'S
83

ASSSINATION OF FORMER
LEBANESE LEADER.
86

THEY QUOTE THE VICE PRESIDENT A
SAYING ASSAD MADE DIRECT
89

PERSONAL TH

## Parsing for show names

#### Raw checking

In [28]:
# Show titles to look for in the captions.
show_names = [
    'CHANNEL 4 NEWS',
    'NIGHTLY NEWS',
    'EXTRA',
    'ACCESS HOLLYWOOD',
    'NEWSROOM',
    'WORLD TODAY',
    'LIVEFROM',
    'LIVE FROM',
    'LIVE-FROM',
    'SITUATION ROOM',
    'LOU DOBBS',
]
# Only the titles from index 5 ('WORLD TODAY') onwards are searched here.
searched_shows = show_names[5:]
for sub in subs:
    parts = list(sub.start)
    time = parts[2] + 60 * parts[1] + 3600 * parts[0]
    for show in searched_shows:
        if show not in sub.text:
            continue
        print(sub.text)
        print(sec2HMS(time))
        print()

#Nouns
#MS-celeb train, filter list of journalists

DEADLY ASK DESTRUCTIVE FLAMES.
WE ARE LIVE FROM THE FIRE LINES.
00:30:28

HE'S JOINING US LIVE FROM OUR
LONDON BUREAU.
01:47:00

TRAVEL TRENDS FOR 2006 AT THE
TOP OF THE HOUR ON "LIVE FROM."
02:26:59

MEANWHILE "YOUR WORLD TODAY"
CONTINUES AFTER A QUICK BREAK.
02:27:04

THIS IS "YOUR WORLD TODAY".
>> OUR HOUR OF INTERNATIONAL
02:45:26

A LOOK AT RESTOREATION EFFORTS
WHEN "YOUR WORLD TODAY"
02:50:14

>>> THAT'S "YOUR WORLD TODAY."
THE NEWS CONTINUES HERE ON CNN.
02:57:18

BRINGING IT TO YOU.
MORE LIVE FROM" RIGHT AFTER
03:13:54

YOU THROUGHOUT THE AFTERNOON ON
"LIVE FROM."
03:19:58

WE'LL STAY ON THE STORY.
"LIVE FROM" IS BACK AFTER A
03:25:09

WE'LL TAKE A QUICK BREAK.
MORE "LIVE FROM" RIGHT AFTER
03:34:15

>> SUZANNE MALVEAUX, LIVE FROM
THE WHITE HOUSE.
03:41:24

WE'LL GET HER STORY.
"LIVE FROM" IS BACK AFTER A
03:41:36

HIS RETURN TO THE U.S.
STRAIGHT AHEAD ON "LIVE FROM."
03:48:59

"LIVE FROM" IS BACK RIGHT AFTER
A QUICK BREAK.
03:54:18

WE'LL KEEP BRINGING IT TO YOU.
MORE "LIVE FRO

#### Rule-based extraction of show names

In [31]:
# Stock phrases searched for in the captions. Each phrase is listed exactly
# once ('AFTER THE BREAK' used to appear twice), so a scan over this list does
# not test the same phrase twice.
patterns = [
    "YOU'RE WATCHING",
    'YOU ARE WATCHING',
    'AFTER THE BREAK',
    'AFTER THIS BREAK',
    'WE ARE BACK',
    "WE'RE BACK",
    'COMING UP ON',
    'COMING RIGHT UP ON',
    'NEXT ON',
    'CONTINUES AFTER THE BREAK',
    'THANKS FOR WATCHING',
    'THANK YOU FOR WATCHING',
    'THANK YOU FOR JOINING ME',
    'RETURNS AFTER THIS',
]

In [32]:
# Print every cue that contains at least one of the stock phrases, followed by
# its start time as HH:MM:SS.
for sub in subs:
    # any() stops at the first hit, so a cue is printed once even when several
    # phrases match it (e.g. 'AFTER THE BREAK' is a substring of
    # 'CONTINUES AFTER THE BREAK').
    if any(pattern in sub.text for pattern in patterns):
        time = list(sub.start)
        time = time[2] + 60*time[1] + 3600*time[0]
        print(sub)
        print(sec2HMS(time))
        print()

1580
01:51:59,045 --> 01:52:01,847
AS THEY APPROACH RETIREMENT AGE.
WE'RE BACK AFTER THIS.

01:51:59

3843
04:38:21,918 --> 04:38:26,455
COMPLETE CHAOS.
THAT STORY NEXT ON "LIVE FROM."

04:38:21

5346
06:23:18,975 --> 06:23:21,643
I KNOW YOU'RE WATCHING US.
YOU'RE LISTENING TO ALL THE

06:23:18



## NLTK named-entity recognition

In [None]:
# Switch into the project directory. The guard makes the cell safe to re-run:
# once the working directory is already 'my_project', a second chdir would
# either fail or descend into a nested directory of the same name.
if os.path.basename(os.getcwd()) != 'my_project':
    os.chdir('my_project/')

In [14]:
#https://gist.github.com/troyane/c9355a3103ea08679baf
from nltk.tag.stanford import StanfordNERTagger

# Files handed to the tagger; both paths are relative to the working directory.
ner_model_path = 'stanford-ner/english.all.3class.distsim.crf.ser.gz'
ner_jar_path = 'stanford-ner/stanford-ner.jar'
st = StanfordNERTagger(ner_model_path, ner_jar_path)
#print (st.tag('You can call me Billiy Bubu and I live in Amsterdam.'.split()))

In [15]:
def getPeople(text):
    """Returns full names of *people* found in the text.

    The text is split on whitespace and tagged with the module-level ``st``
    tagger. Each run of consecutive tokens tagged ``'PERSON'`` is joined with
    single spaces into one name. Names are returned in order of appearance,
    repeats included.
    """
    name_runs = []
    in_run = False  # True while the previous token was tagged 'PERSON'
    for token, tag in st.tag(text.split()):
        if tag != 'PERSON':
            in_run = False
            continue
        if in_run:
            # Still inside the same name: extend the most recent run.
            name_runs[-1].append(token)
        else:
            name_runs.append([token])
            in_run = True

    return [' '.join(run) for run in name_runs]

In [16]:
# Extract person names from the whole cleaned transcript.
people = getPeople(full_srt)

In [18]:
# Preview the first ten extracted names.
people[:10]

['ASSAD',
 'RAFIK HARIRI',
 'FARRIS HASSAN',
 'DARYN KAGAN',
 'JEN RODGERS',
 'JEN RODGERS',
 'KATHERINE BARRETT',
 'DARYN',
 'KATHERINE BARRETT',
 'RINGO']

## Word presence - inference

In [None]:
# Name fragments to look for among the extracted people.
names = [
    'DARYN', 'KAGAN',
    'KYRA', 'PHILLIPS',
    'ROSEMARY', 'CHURCH',
    'FOREMAN',
    'CHRISTINE', 'ROMANS',
]
for fragment in names:
    # Substring test: a fragment matches any extracted name that contains it.
    for person in (candidate for candidate in people if fragment in candidate):
        print(person)

In [101]:
# Phrases searched for in cues that also contain "I'M".
# The former trailing '' entry was removed: the empty string is a substring of
# every string, so it made this filter match every "I'M" cue.
show_hints = ['WELCOME TO', "YOU'RE WATCHING", 'THIS IS', 'TONIGHT ON']
for sub in subs:
    if "I'M" not in sub.text:
        continue
    # any() prints a cue once even when more than one hint occurs in it.
    if any(hint in sub.text for hint in show_hints):
        print(sub.text)
        print(sub.start)
        print()

    THIS IS MY HEART
THAT I'M MAKING HEALTHY.
00:15:33,266

I'M ROSALEE CHURCH.
>> THIS IS YOUR "YOUR WORLD
02:00:44,637

    THIS IS MY HEART
THAT I'M MAKING HEALTHY.
04:15:15,233



In [None]:
def word_count(string):
    """Count how often each whitespace-separated word occurs in ``string``.

    Parameters:
        string: the text to tally; it is split on runs of whitespace.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences, with
        keys in order of first appearance. An empty or all-whitespace input
        gives ``{}``.
    """
    # Counter (imported at the top of the notebook) does the tallying; dict()
    # keeps the return type a plain dictionary for existing callers.
    return dict(Counter(string.split()))

In [None]:
# (word, count) pairs ordered from most to least frequent; pairs with equal
# counts stay in the order word_count produced them.
word_frequencies = word_count(full_srt)
words_counted = sorted(word_frequencies.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Name fragments to look for among the counted words.
names = [
    'DARYN', 'KAGAN',
    'KYRA', 'PHILLIPS',
    'ROSEMARY', 'CHURCH',
    'FOREMAN',
    'CHRISTINE', 'ROMANS',
]
for entry in words_counted:
    word = entry[0]
    for name in names:
        # Substring test, so a name also matches longer words containing it.
        if name not in word:
            continue
        print(entry)

In [None]:
# Print every cue whose text contains the greeting, with its start time.
greeting = 'I\'M'
for x in subs:
    if greeting not in x.text:
        continue
    print(x.text)
    print(x.start.to_time())
    print()

In [None]:
# Print cues in which the greeting is directly followed by one of the names.
greetings = ['I\'M ' + name for name in names]
for x in subs:
    for greeting in greetings:
        if greeting not in x.text:
            continue
        print(x.text)
        print(x.start.to_time())

In [None]:
# Print cues mentioning any name except the first two entries of `names`.
later_names = names[2:]
for x in subs:
    for name in later_names:
        if name not in x.text:
            continue
        print(x.text)
        print(x.start.to_time())

## Azure trial

In [5]:
import requests
# If you are using a Jupyter notebook, uncomment the following line.
# %matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import os
import json

# The subscription key is read from the environment so the secret is never
# stored in the notebook. Set COMPUTER_VISION_SUBSCRIPTION_KEY before running.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# compromised and regenerate it.
subscription_key = os.environ.get("COMPUTER_VISION_SUBSCRIPTION_KEY")
assert subscription_key, "Set the COMPUTER_VISION_SUBSCRIPTION_KEY environment variable"

# You must use the same region in your REST call as you used to get your
# subscription keys. For example, if you got your subscription keys from
# westus, replace "westcentralus" in the URI below with "westus".
#
# Free trial subscription keys are generated in the "westcentralus" region.
# If you use a free trial subscription key, you shouldn't need to change
# this region.
vision_base_url = "https://westcentralus.api.cognitive.microsoft.com/vision/v2.0/"

analyze_url = vision_base_url + "analyze"

# Directory holding the local images to analyze.
samples_dir = 'azure-samples/'
# Seconds to wait on the service before giving up, so a stalled request
# cannot hang the notebook indefinitely.
request_timeout_s = 60

# sorted() makes the processing (and output) order reproducible between runs.
for img in sorted(os.listdir(samples_dir)):
    image_path = samples_dir + img
    if not os.path.isfile(image_path):
        # Skip sub-directories and anything else that cannot be read as a file.
        continue

    # Read the image into a byte array; the context manager closes the file.
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
    headers = {'Ocp-Apim-Subscription-Key': subscription_key,
               'Content-Type': 'application/octet-stream'}
    params = {'visualFeatures': 'Categories,Description,Color'}
    response = requests.post(
        analyze_url, headers=headers, params=params, data=image_data,
        timeout=request_timeout_s)
    response.raise_for_status()

    # The 'analysis' object contains various fields that describe the image. The most
    # relevant caption for the image is obtained from the 'description' property.
    analysis = response.json()
    print(json.dumps(analysis, indent=4))
    print()
    # A response without any caption yields an empty string instead of raising.
    captions = analysis.get("description", {}).get("captions", [])
    image_caption = captions[0]["text"].capitalize() if captions else ""

    # Display the image and overlay it with the caption.
    # image = Image.open(BytesIO(image_data))
    # plt.imshow(image)
    # plt.axis("off")
    # _ = plt.title(image_caption, size="x-large", y=-0.1)
    plt.close('all')

{
    "categories": [
        {
            "name": "others_",
            "score": 0.0078125
        },
        {
            "name": "people_",
            "score": 0.6015625,
            "detail": {
                "celebrities": [
                    {
                        "name": "Daryn Kagan",
                        "confidence": 0.9999701976776123,
                        "faceRectangle": {
                            "left": 200,
                            "top": 191,
                            "width": 250,
                            "height": 250
                        }
                    }
                ]
            }
        },
        {
            "name": "people_portrait",
            "score": 0.35546875,
            "detail": {
                "celebrities": [
                    {
                        "name": "Daryn Kagan",
                        "confidence": 0.9999701976776123,
                        "faceRectangle": {
                            "l

{
    "categories": [
        {
            "name": "people_",
            "score": 0.3203125,
            "detail": {
                "celebrities": [
                    {
                        "name": "Christine Romans",
                        "confidence": 0.999870777130127,
                        "faceRectangle": {
                            "left": 180,
                            "top": 231,
                            "width": 280,
                            "height": 280
                        }
                    }
                ]
            }
        },
        {
            "name": "people_portrait",
            "score": 0.67578125,
            "detail": {
                "celebrities": [
                    {
                        "name": "Christine Romans",
                        "confidence": 0.999870777130127,
                        "faceRectangle": {
                            "left": 180,
                            "top": 231,
                        