<a href="https://colab.research.google.com/github/emiliatjsp/emiliatjsp/blob/main/CC_assignment_210833458.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center><h1><b>ECS7022P: Computational Creativity Assignment</b></h1>
<h2>Emilia Pietras, ID: 210833458<br/></h2></center>

## 1. Logistics Code

In [None]:
#!pip install bs_ds
#!pip install fake_useragent
#!pip install lxml
#!pip install --upgrade --no-cache-dir gdown

In [None]:
from bs4 import BeautifulSoup
import requests
import re


def get_movies(genre):
    """
    Gets a genre and returns the URL slugs of the movies listed on imsdb.com
    in that genre.

    Each slug is the link text of a movie on the genre page, cut at the first
    comma or period, with spaces replaced by hyphens. It is the form that
    `handle_movie` appends to the script URL.

    Parameters
    ----------
    genre : str
        A string describing the genre of movies we will scrape

    Returns
    -------
    movies : list
        A list of strings, one slug per movie link found on the genre page

    Raises
    ------
    ValueError
        If `genre` is not one of the supported genres.
    requests.HTTPError
        If the genre page answers with an error status code.
    IndexError
        If the page does not contain the expected third table.
    """

    # the complete list of genres we can choose
    genres = ['Action','Adventure','Animation','Comedy','Crime','Drama',
              'Family','Fantasy','Film-Noir','Horror','Musical','Mystery',
              'Romance','Sci-Fi','Short','Thriller','War','Western']

    if genre not in genres:
        raise ValueError("the input 'genre' must be one of the following: "+str(genres))

    url = 'https://imsdb.com/genre/'+genre

    # A 3 second timeout keeps a stalled connection from hanging the scrape.
    response = requests.get(url, timeout=3)
    # Fail loudly on 4xx/5xx instead of parsing an error page and hitting an
    # unrelated IndexError further down.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')

    # The third <td valign="top"> on the page holds the links to all the movies.
    find_tables = soup.find_all('td', valign='top')
    if len(find_tables) < 3:
        raise IndexError(
            "expected at least 3 <td valign='top'> cells on " + url +
            ", found " + str(len(find_tables)))
    all_movies = find_tables[2].find_all('a')  # <a> --> links!

    # Build the final list of slugs, which is to be returned.
    movies = []
    for movie_info in all_movies:
        # get_text() always returns a str, whereas .string is None for anchors
        # with no text or with several children.
        title = movie_info.get_text()
        slug = re.split("[,.]", title)[0].replace(' ', '-')
        if slug:  # skip anchors that carry no usable text
            movies.append(slug)
    return movies

def handle_movie(movie):
    """
    Gets a movie slug and returns the pieces of its script found on imsdb.com.

    For every <b> tag on the script page the tag's text is collected, followed
    by the node that comes right after the tag, unless that node is missing,
    is an empty string, or is itself a <b> tag (whose text is collected on its
    own turn).

    Parameters
    ----------
    movie : str
        A movie slug as produced by `get_movies`; it is inserted into the
        script URL as-is

    Returns
    -------
    text : list
        The collected pieces in page order. <b> texts are strings; the
        following nodes are appended as BeautifulSoup returns them. The list
        is empty when the page does not answer with status code 200.
    """

    text = []
    url = u'http://www.imsdb.com/scripts/' + movie + '.html' # full url to the movie script
    response = requests.get(url, timeout=3)
    if response.status_code != 200:
        print('Error. Check status code table.\n\n')
        return text

    soup = BeautifulSoup(response.content, 'lxml')
    for b_tag in soup.find_all('b'): # checking for the <b> tag
        text.append(b_tag.text)
        sibling = b_tag.next_sibling
        # A <b> that is the last child of its parent has no next sibling;
        # appending None would end up as the literal text "None" once the
        # pieces are stringified and joined.
        if sibling is None or sibling == "":
            continue
        # Checking the node's own tag name replaces a search of the whole
        # document for every <b> tag; text nodes have no tag name.
        if getattr(sibling, 'name', None) == 'b':
            continue
        text.append(sibling)

    return text

In [None]:
### SCRAPING COMEDY MOVIES
# NOTE: the scraping code below is disabled: it is wrapped in a string literal,
# so running this cell sends no requests and defines no variables. Because the
# string is the cell's last expression, the notebook echoes it as the output.
# When enabled, the code fetches the Comedy slugs with get_movies, downloads
# the first 70 scripts with handle_movie and concatenates every collected
# piece into the single string `all_lines`.
"""
comedy_movies = get_movies('Comedy')
all_text = []
for movie in comedy_movies[:70]: # keeping the text file under 10MB
    all_text.append(handle_movie(movie))

all_lines = ""
for doc in all_text:
    for line in doc:
        all_lines = all_lines + str(line)
"""

'\ncomedy_movies = get_movies(\'Comedy\')\nall_text = []\nfor movie in comedy_movies[:70]: # keeping the text file under 10MB\n    all_text.append(handle_movie(movie))\n\nall_lines = ""\nfor doc in all_text:\n    for line in doc:\n        all_lines = all_lines + str(line)\n'

In [None]:
# WRITING COMEDY MOVIES TRAINING DATA
#f = open("comedy_movies.txt", "w") # creating a .txt file
#f.write(all_lines)
#f.close()

In [None]:
### SCRAPING HORROR MOVIES
# NOTE: the scraping code below is disabled: it is wrapped in a string literal,
# so running this cell sends no requests and defines no variables. Because the
# string is the cell's last expression, the notebook echoes it as the output.
# When enabled, the code fetches the Horror slugs with get_movies, downloads
# the first 70 scripts with handle_movie and concatenates every collected
# piece into the single string `all_lines`.
"""
horror_movies = get_movies('Horror')
all_text = []
for movie in horror_movies[:70]: # keeping the text file under 10MB
    all_text.append(handle_movie(movie))

all_lines = ""
for doc in all_text:
    for line in doc:
        all_lines = all_lines + str(line)
"""        

'\nhorror_movies = get_movies(\'Horror\')\nall_text = []\nfor movie in horror_movies[:70]: # keeping the text file under 10MB\n    all_text.append(handle_movie(movie))\n\nall_lines = ""\nfor doc in all_text:\n    for line in doc:\n        all_lines = all_lines + str(line)\n'

In [None]:
#f = open("horror_movies.txt", "w") # creating a .txt file
#f.write(all_lines)
#f.close()

## 2. Training Code

In [None]:
#!pip install -q gpt-2-simple
#import gpt_2_simple as gpt2
#from datetime import datetime
#from google.colab import files

In [None]:
#gpt2.download_gpt2(model_name="124M") # downloading the 124M GPT-2 model

In [None]:
#gpt2.mount_gdrive()

In [None]:
### FINE-TUNING GPT-2 ON THE HORROR SCRIPTS
# NOTE: the fine-tuning code below is disabled: it is wrapped in a string
# literal, so running this cell trains nothing. Because the string is the
# cell's last expression, the notebook echoes it as the output.
# To re-enable it, the `gpt2` import in the first cell of this section (also
# commented out) has to be restored as well, since the code calls
# gpt2.start_tf_sess and gpt2.finetune.
# The `json` import is not used by anything inside this cell.
# NOTE(review): restore_from='latest' presumably resumes from an existing
# 'horror' checkpoint rather than starting fresh -- confirm against the
# gpt-2-simple documentation before the first run.
"""
import tensorflow as tf
import json

tf.compat.v1.reset_default_graph() 

sess = gpt2.start_tf_sess()

# finetuning
gpt2.finetune(sess,
              dataset="horror_movies.txt",
              model_name='124M',
              steps=2000,
              restore_from= 'latest', #'fresh
              overwrite = True, #!!!
              run_name='horror',
              print_every=10,
              sample_every=200,
              save_every=400,
	          only_train_transformer_layers = True,
	          accumulate_gradients = 1,
              )
"""

'\nimport tensorflow as tf\nimport json\n\ntf.compat.v1.reset_default_graph() \n\nsess = gpt2.start_tf_sess()\n\n# finetuning\ngpt2.finetune(sess,\n              dataset="horror_movies.txt",\n              model_name=\'124M\',\n              steps=2000,\n              restore_from= \'latest\', #\'fresh\n              overwrite = True, #!!!\n              run_name=\'horror\',\n              print_every=10,\n              sample_every=200,\n              save_every=400,\n\t          only_train_transformer_layers = True,\n\t          accumulate_gradients = 1,\n              )\n'

In [None]:
#gpt2.copy_checkpoint_to_gdrive(run_name='horror', copy_folder=True)

## 3. Generation Code

In [None]:
!pip install -q gpt-2-simple
!pip install --upgrade --no-cache-dir gdown




In [None]:
import gpt_2_simple as gpt2
# Mount Google Drive in the Colab runtime through the gpt-2-simple helper; the
# recorded output shows the mount point is /content/drive.
gpt2.mount_gdrive()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

import gdown

url = "https://drive.google.com/drive/folders/1tqdVl1dOVdeqGaSIJWuuyRukg3OOH81c"

# gdown.download_folder returns None when the folder could not be downloaded.
# As a workaround the download is retried with the cached cookie file removed
# after every attempt, see https://github.com/wkentaro/gdown/issues/43
cookies_path = os.path.expanduser('~/.cache/gdown/cookies.json')
max_attempts = 20  # bounded so a permanently failing download cannot loop forever

download_successful = None
for _ in range(max_attempts):
  download_successful = gdown.download_folder(url, quiet=True, use_cookies=False)
  try:
    os.remove(cookies_path)
  except FileNotFoundError:
    pass  # nothing cached, nothing to clean up
  if download_successful is not None:
    break
else:
  raise RuntimeError(
      "gdown could not download " + url + " after " + str(max_attempts) + " attempts")

In [None]:
# Start a TensorFlow session and load the 'run1' checkpoint into it.
# The text generation cell below loads its checkpoint with reuse=True, so this
# cell has to run first.
# NOTE(review): presumably reuse=True relies on the model variables created by
# this first load -- confirm against the gpt-2-simple documentation.
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='run1')

Loading checkpoint checkpoint/run1/model-4000
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-4000


In [None]:
#@title Text Generation

genre = "horror" #@param['comedy', 'horror']

# Checkpoint folder (gpt-2-simple run name) for each genre offered in the form.
# Any other value falls back to the horror model.
run_names = {"comedy": "run1", "horror": "horror"}
run_name = run_names.get(genre, "horror")

# Load the selected checkpoint into a new session; reuse=True because a model
# has already been loaded once by the cell above.
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name=run_name, reuse=True)

prefix = "CUT TO: "  # @param {type: "string"}

output_length = 512 # @param {type: "integer"}
nsamples =  2# @param {type: "integer"}
temperature = 1.0 # @param
top_p = 0.9 # @param
batch_size = nsamples  # generate all requested samples in a single batch

# top_p is forwarded so that the form field actually influences sampling.
gpt2.generate(sess,
              run_name=run_name,
              prefix=prefix,
              length=output_length,
              batch_size=batch_size,
              nsamples=nsamples,
              temperature=temperature,
              top_p=top_p)

Loading checkpoint checkpoint/horror/model-4000
INFO:tensorflow:Restoring parameters from checkpoint/horror/model-4000
CUT TO:                                                       79

                                  CARRIE
                 Emily's right. I was closing that file...

                                 CARRIE
                 Excuse me, you were looking at names!
                 Please... please... give me an e-
                 X-Ray... assume you're dead.

                                CARRIE
                 That's irresponsible! Now go ahead...

     Ana, Carrae and Marko are there watching as Carrae rounds Leila's

      B.G. Penalty for different actions on amigos...
                                                              85.




                                                           CUT TO:                    81.


41  INT. CARRIE'S PELLEA TRAILER - NIGHT      
CUT TO:         82 WATCH
           102 OMITTED 
                          
               

20/04/2020 Edit:

Link to GitHub