In [26]:
import numpy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import re
import os
import textblob
import collections
from textblob import Word
from textblob.tokenizers import WordTokenizer
from spellchecker import SpellChecker
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
import time
from pycontractions import Contractions


# Shared spell-checker instance; the named entities read from disk further down
# are loaded into its word-frequency list.
spell = SpellChecker()

In [27]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The raw response body (bytes) when `is_good_response` accepts the
        response, otherwise None. Request failures (including timeouts) are
        reported through `log_error` and also yield None.
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header (compared case-insensitively) contains 'html'. A response with
    no Content-Type header is treated as not HTML instead of raising.
    """
    # .get() avoids a KeyError when the server sends no Content-Type header.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The message is simply written to standard output via print();
    replace the body to route errors somewhere else.
    """
    print(e)
def countWords(string):
    """
    Count the words in `string`.

    A word is a maximal run of characters that are not a space, newline
    or tab; only those three characters act as separators.
    """
    separators = (' ', '\n', '\t')
    inside_word = False
    total = 0

    for ch in string:
        if ch in separators:
            # A separator always ends the current word (if any).
            inside_word = False
        elif not inside_word:
            # First non-separator after a gap: a new word starts here.
            inside_word = True
            total += 1

    return total

# Module-level contraction expander used by contract_handle() below.
# NOTE(review): "glove-twitter-25" is presumably the name of a pretrained
# embedding model that pycontractions downloads — confirm.
cont = Contractions(api_key="glove-twitter-25")
# Load the models once up front so expand_texts() can be called afterwards.
cont.load_models()

def contract_handle(st):
    """
    Expand the contractions in `st` and return the result as space-joined tokens.

    Typographic apostrophes (’) are replaced by the ASCII apostrophe (')
    before the text is handed to the module-level `cont` expander. The
    expanded text is split with word_tokenize and the tokens are joined
    with single spaces. The token count is printed as a progress message.

    Parameters:
        st: the text of one script (str).

    Returns:
        str: the expanded, tokenised text.
    """
    expanded = list(cont.expand_texts([st.replace("’", "'")]))[0]
    # Only the token text is needed, so the tokens are joined directly;
    # POS-tagging them just to discard the tags would be wasted work.
    tokens = word_tokenize(str(expanded))
    print('%d tags created...' % len(tokens))

    return ' '.join(tokens)

In [30]:
# Collect the script text and the title of each episode into parallel lists.
script_list = []
title_list = []

# Read the named-entity file in one go.
with open('named_entity_tz.txt', 'r', encoding='utf-8') as myfile:
    ner = myfile.read()

# Load the named entities into the spell checker so they count as known words.
# Only the text before the first comma of each line is used; blank entries
# (e.g. from a trailing newline) are skipped so '' is not added as a word.
names = [item.split(',', 1)[0] for item in ner.split('\n')]
names = [name for name in names if name]

spell.word_frequency.load_words(names)

# Add each script and its title to the lists, with some light cleaning.
# os.listdir() returns entries in arbitrary order, so sort them: the titles
# are later paired positionally with dates chosen by a fixed index list,
# which only lines up when the files are visited in a stable order.
for i in sorted(os.listdir('twilightzone')):
    with open('twilightzone/' + i, 'r', encoding='utf-8') as myfile:
        # Flatten the script to one lower-case line, then strip a leading run of
        # whitespace / word characters / '-' / '"' that ends in a "1", keeping the "1".
        r = re.sub(r'(^[\s\w\-\"]*)(1)',r'\2',str(myfile.read().replace('\n',' ').lower()))

    script_list.append(r)

    # Title is the file name with underscores as spaces and the extension removed.
    title_list.append(i.replace('_',' ').replace('.txt',''))

In [31]:
# The scripts carry no air dates, so the dates are scraped from Wikipedia's
# episode list: download the page and parse it.
wiki = 'https://en.wikipedia.org/wiki/List_of_The_Twilight_Zone_(1959_TV_series)_episodes#Concept_(1958)'
s = simple_get(wiki)
if s is None:
    # simple_get returns None on request errors and non-HTML responses; stop here
    # with a clear message instead of letting the parser fail on None.
    raise RuntimeError('Could not download the episode list from {0}'.format(wiki))
html = BeautifulSoup(s, 'html.parser')

In [32]:
# Isolate the episode tables from the Wikipedia page. find_all returns every
# <table> with this class; str() turns that result into one block of markup text.
table = html.find_all('table',{'class':'wikitable plainrowheaders wikiepisodetable'})
table = str(table)

# Round-trip the scraped markup through UTF-8 bytes (ignoring undecodable bytes
# on the way back) to work around formatting errors, then drop all newlines so
# the regex below can match across what were separate lines.
markup = table.encode("utf-8")
content = markup.decode("utf-8", "ignore")
table2 = content.replace('\n','')

# Grab the actual dates. The pattern has three groups, so findall yields 3-tuples:
# (centered <td> opening, date text made of word chars/whitespace/commas, '<span').
# The date itself is the middle element (index 1) of each tuple.
d = re.findall(r'(<\/td><td style="text-align:center">)([\w\s\,]*)(<span)',str(table2))

In [33]:
# Place the date text (middle regex group) of every scraped episode into a list,
# replacing non-breaking spaces with ordinary spaces.
dates_for_all = [str(p[1]).replace('\xa0', ' ') for p in d]

# Positions in dates_for_all of the episodes we have scripts for. The order must
# match title_list, because both end up as columns of the same DataFrame.
eps = [29,72,42,122,80,1,33,41,40,6,21,7,88,4,0,63]

# If the scrape matched fewer rows than expected (e.g. the page layout changed),
# fail with an explanatory message rather than a bare "list index out of range".
if len(dates_for_all) <= max(eps):
    raise IndexError(
        'Only {0} dates were scraped but episode index {1} is required'.format(
            len(dates_for_all), max(eps)))

# Dates for only the episodes needed.
dates_list = [dates_for_all[i] for i in eps]

In [34]:
#Dates of each episode

#  'A Stop at Willoughby',                 May 6, 1960
#  "It's a Good Life",                     Nov 3, 1961
#  'Nick of Time',                         Nov 18, 1960
#  'Nightmare At 20000 Feet',              Oct 11, 1963
#  'Nothing in the Dark',                  Jan 5, 1962
#  'One for the Angels',                   Oct 9, 1959
#  'The After Hours',                      June 10, 1960
#  'The Eye of the Beholder',              Nov 11, 1960
#  'The Howling Man',                      Nov 4, 1960
#  'The Lonely',                           Nov 13, 1959
#  'The Monsters Are Due on Maple Street', March 4, 1960
#  'Time Enough at Last',                  Nov 20, 1959
#  'To Serve Man',                         March 2, 1962
#  'Walking Distance',                     Oct 30, 1959
#  'Where is Everybody',                   Oct 2, 1959
#  'Will the Real Martian Please Stand Up', May 26, 1961

In [41]:
# Using pycontractions to expand contractions in the scripts.
#
# The entity names are already loaded into the spell checker where
# named_entity_tz.txt is read, so nothing is re-loaded here. (Passing the raw
# `ner` string to load_words would presumably register its individual
# characters as words, since load_words expects a list of words.)
#
# The raw scripts are taken from script_list: `tz` is only built in a later
# cell, from the `contractions` list produced here, so it does not exist yet
# when the notebook is run top to bottom on a fresh kernel.
text = list(script_list)

# One expanded, tokenised string per script, in the same order as script_list.
contractions = [contract_handle(str(story)) for story in text]

7653 tags created...
8133 tags created...
6161 tags created...
4719 tags created...
3739 tags created...
7230 tags created...
6130 tags created...
6934 tags created...
4919 tags created...
9444 tags created...
7270 tags created...
6966 tags created...
7589 tags created...
8073 tags created...
10046 tags created...
5703 tags created...


In [48]:
# Assemble the final table: one row per episode. 'Source' is a single string,
# so pandas repeats it on every row; the other columns come from the lists.
data = dict(
    Source='Twilight Zone',
    Title=title_list,
    Date=dates_list,
    Text=contractions,
)
tz = pd.DataFrame(data)
tz

Unnamed: 0,Source,Title,Date,Text
0,Twilight Zone,A Stop at Willoughby,"May 6, 1960",1. int . conference room [ day ] this is a big...
1,Twilight Zone,It's a Good Life,"November 3, 1961",the twilight zone season three `` it is a good...
2,Twilight Zone,Nick of Time,"November 18, 1960",the twilight zone `` the nick of time '' writt...
3,Twilight Zone,Nightmare At 20000 Feet,"October 11, 1963",the twilight zone episode 503 : `` nightmare a...
4,Twilight Zone,Nothing in the Dark,"January 5, 1962",the twilight zone `` nothing in the dark '' wr...
5,Twilight Zone,One for the Angels,"October 9, 1959",102 : `` one for the angels '' written by rod ...
6,Twilight Zone,The After Hours,"June 10, 1960",the twilight zone season one `` the after hour...
7,Twilight Zone,The Eye of the Beholder,"November 11, 1960",the twilight zone season two `` the eye of the...
8,Twilight Zone,The Howling Man,"November 4, 1960",the twilight zone `` the howling man '' writte...
9,Twilight Zone,The Lonely,"November 13, 1959",107 : `` the lonely '' written by rod serling ...


In [None]:
# pushing it to a csv
# tz.to_csv(path_or_buf='C:\\Users\\ced4689\\Desktop\\TVF',sep=('|'))