In [1]:
# URL of the Kaggle dataset page, kept for provenance; no later cell in this
# notebook reads this variable.
dataset_source = 'https://www.kaggle.com/jrobischon/wikipedia-movie-plots'

In [2]:
import keras
import os
import csv
import matplotlib.pyplot as plt
import numpy as np

Using TensorFlow backend.


In [3]:
data_file = '/Users/gursharan/Desktop/wiki_movie_plots_deduped.csv'
input_length = 500
compressed_length = 300

# Build `text_corpus`: one upper-case, ASCII-only string of exactly
# `input_length` characters per CSV row (taken from the row's last column).
text_corpus = []
# The context manager closes the file as soon as the corpus is built; a bare
# open() would keep the handle alive for the lifetime of the kernel.
with open(data_file, 'r') as csv_file:
    data_reader = csv.reader(csv_file)
    next(data_reader, None) #skip the header (tolerates an empty file)
    for row in data_reader:
        if not row:
            continue #csv.reader yields [] for blank lines; row[-1] would fail
        plot = row[-1].upper() #upper case only

        #limit to ASCII (code points < 128), keeping at most input_length chars
        text = []
        for char in plot:
            if len(text) >= input_length:
                break
            if ord(char) < 128:
                text.append(char)

        # Collapse whitespace runs to single spaces and drop the final word.
        # NOTE(review): the final word is dropped even when the plot was not
        # truncated - presumably meant to remove a word cut off by the length
        # limit; confirm the intent.
        plot = ' '.join((''.join(text)).split()[:-1])
        plot = plot.ljust(input_length) #padded by space at last

        text_corpus.append(plot)

In [4]:
# Spot check: show the preprocessed plot at index 3.
print(text_corpus[3])

LASTING JUST 61 SECONDS AND CONSISTING OF TWO SHOTS, THE FIRST SHOT IS SET IN A WOOD DURING WINTER. THE ACTOR REPRESENTING THEN VICE-PRESIDENT THEODORE ROOSEVELT ENTHUSIASTICALLY HURRIES DOWN A HILLSIDE TOWARDS A TREE IN THE FOREGROUND. HE FALLS ONCE, BUT RIGHTS HIMSELF AND COCKS HIS RIFLE. TWO OTHER MEN, BEARING SIGNS READING "HIS PHOTOGRAPHER" AND "HIS PRESS AGENT" RESPECTIVELY, FOLLOW HIM INTO THE SHOT; THE PHOTOGRAPHER SETS UP HIS CAMERA. "TEDDY" AIMS HIS RIFLE UPWARD AT THE TREE AND FELLS  


In [5]:
def text_to_numbers( text ):
    """Encode `text` as a NumPy array holding the code point (`ord`) of each character."""
    return np.array([ ord(symbol) for symbol in text ])

def numbers_to_text( array ):
    """Decode an iterable of integer code points back into a string using `chr`."""
    return ''.join( chr(code) for code in array )

In [6]:
# Round-trip sanity check: encode plot 3 as code points, decode it back,
# and print both the numeric and the text form.
numbers = text_to_numbers( text_corpus[3] )
text = numbers_to_text( numbers )
print( numbers )
print( text )

[76 65 83 84 73 78 71 32 74 85 83 84 32 54 49 32 83 69 67 79 78 68 83 32
 65 78 68 32 67 79 78 83 73 83 84 73 78 71 32 79 70 32 84 87 79 32 83 72
 79 84 83 44 32 84 72 69 32 70 73 82 83 84 32 83 72 79 84 32 73 83 32 83
 69 84 32 73 78 32 65 32 87 79 79 68 32 68 85 82 73 78 71 32 87 73 78 84
 69 82 46 32 84 72 69 32 65 67 84 79 82 32 82 69 80 82 69 83 69 78 84 73
 78 71 32 84 72 69 78 32 86 73 67 69 45 80 82 69 83 73 68 69 78 84 32 84
 72 69 79 68 79 82 69 32 82 79 79 83 69 86 69 76 84 32 69 78 84 72 85 83
 73 65 83 84 73 67 65 76 76 89 32 72 85 82 82 73 69 83 32 68 79 87 78 32
 65 32 72 73 76 76 83 73 68 69 32 84 79 87 65 82 68 83 32 65 32 84 82 69
 69 32 73 78 32 84 72 69 32 70 79 82 69 71 82 79 85 78 68 46 32 72 69 32
 70 65 76 76 83 32 79 78 67 69 44 32 66 85 84 32 82 73 71 72 84 83 32 72
 73 77 83 69 76 70 32 65 78 68 32 67 79 67 75 83 32 72 73 83 32 82 73 70
 76 69 46 32 84 87 79 32 79 84 72 69 82 32 77 69 78 44 32 66 69 65 82 73
 78 71 32 83 73 71 78 83 32 82 69 65 68 73 78 71 32

In [7]:
#write the network
model = keras.models.Sequential()
model.add( keras.layers.Dense(1024, activation='relu', input_shape=(input_length,)) )
model.add( keras.layers.Dense(512, activation='relu' ))
model.add( keras.layers.Dense(compressed_length, activation='relu' ))
model.add( keras.layers.Dense(512, activation='relu' ))
model.add( keras.layers.Dense(1024, activation='relu' ))
model.add( keras.layers.Dense(input_length, activation='relu' ))
model.compile( loss='mae', optimizer='adadelta', metrics=['mse'] )

In [8]:
#normalize the data and create train, validation, test subsets
all_data = np.array( [ text_to_numbers(x) for x in text_corpus ] )

In [9]:
# Divide the encoded values by 128 before training.
normalized_data = all_data/128
# Shuffle with a fixed seed so the 80/20 train/validation split is identical
# on every run (an unseeded permutation produces a different split each time,
# which makes validation numbers incomparable between runs). A private
# RandomState leaves the global NumPy RNG untouched.
data_indices = np.random.RandomState(0).permutation(len(all_data))
split_at = int(len(all_data)*0.8)  # first 80% of the shuffled indices train
train = normalized_data[ data_indices[:split_at] ]
val = normalized_data[ data_indices[split_at:] ]

In [10]:
# Autoencoder training: the input is also the target (train -> train).
# `val` is passed as validation data for both input and target.
history = model.fit(train, train, epochs=10, validation_data=(val,val) )

Train on 27908 samples, validate on 6978 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# Pick a random plot, run it through the autoencoder and decode the output.
random_example = text_corpus[ np.random.randint(len(text_corpus)) ]
input_is = text_to_numbers( random_example )/128
# reshape(1, -1): a batch of one whose width comes from the data, instead of
# a hard-coded 500 that goes stale whenever input_length changes.
output_is = model.predict( input_is.reshape(1, -1) )
# Undo the /128 scaling. Round to the nearest code point: astype('int') alone
# truncates, so an output of 68.999 would decode one character too low.
# Clip to 0..127 so chr() always receives a valid 7-bit code.
output_is = np.clip( np.rint(output_is*128), 0, 127 ).astype('int')
predicted_text = numbers_to_text(output_is[0])
print(random_example)
print(predicted_text)

THE STORY IS PRESENTED AS A NARRATED DOCUMENTARY, SET IN A NEAR-FUTURE 1970S ENGLAND, AND CONCERNING A DISILLUSIONED POP SINGER, STEVEN SHORTER (PAUL JONES), WHO IS THE MOST-LOVED CELEBRITY IN THE COUNTRY. HIS STAGE SHOW INVOLVES HIM APPEARING ON STAGE IN A JAIL CELL WITH HANDCUFFS, BEATEN BY POLICE, TO THE HORROR AND SYMPATHY OF THE AUDIENCE. IT IS DESCRIBED THAT THE TWO MAIN PARTIES OF ENGLAND HAVE FORMED A COALITION GOVERNMENT AND ENCOURAGE THE SUCCESS OF SHORTER TO PLACATE THE MASSES AND    
NHCCCI JEKBFJ LDL HEE GADK BJ FFE IGG IIGH JFDONDEIHEGKJG   BIKHMGKEKIHBFKEFJ NAHJ DJJFDJK C GFGJDGIBHHFIIDFKDOGHGGGGLHJJI KDGHFD?LIFKGIHOHKH ID IGIGJFMICJKJHJDKGHLFJKIFCM JLKG CMMDCJGK DLLFBLHCCEF JMNIKHIGGJLDMFO  FFDNPDJ DDIFGNH  FF  IDI@HMMJ BB FMGEELHEEIGGGJFK FOEFNHF@MG K AAQ GM  GKLEGIJJBS FMIJLDEHJIIHJ  EKKFINMB FIJJKGFDJLDFNF DFDII E LHE CKIHFGFILK KH FK EEKOJHKFG HIFFH LGGFNEHHJ GIKJ  FHKEHGDIMKDKH@FFGKHHIFCGLHJ IHJJFFGJPDKGGK ONDJL HHMB LIKIOJDJMKMKLJ JIMOGLOIONIJMHMGMLLM QJKMOL WO/ 

### It doesn't seem to work, and even the training seems too slow. I need to rethink what the loss function should be. I guess the mistake was to use the ordinal values of the characters as their representation - even if the model ends up predicting all Es as Fs, the loss will still be small. We need to define one-hot vectors so that Es and Fs are distinguished!

In [51]:
input_length = 200
compressed_length = 100

data_file = '/Users/gursharan/Desktop/wiki_movie_plots_deduped.csv'

# Build `text_corpus`: one string of exactly `input_length` symbols per CSV
# row, using only A-Z plus '@' as the stand-in for a space.
text_corpus = []
# The context manager closes the file as soon as the corpus is built; a bare
# open() would keep the handle alive for the lifetime of the kernel.
with open(data_file, 'r') as csv_file:
    data_reader = csv.reader(csv_file)
    next(data_reader, None) #skip the header (tolerates an empty file)
    for row in data_reader:
        if not row:
            continue #csv.reader yields [] for blank lines; row[-1] would fail
        plot = row[-1].upper() #upper case only

        text = []
        for char in plot:
            if len(text) >= input_length:
                break
            #limit to A-Z only; every other character is dropped ...
            if 'A' <= char <= 'Z':
                text.append(char)
            # ... except the space, which is kept as '@' (the character just
            # before 'A' in ASCII, so the alphabet stays contiguous)
            elif char == ' ':
                text.append('@')

        # Right-pad short plots with '@' (i.e. encoded spaces).
        plot = ''.join(text).ljust(input_length, '@')

        text_corpus.append(plot)

def text_to_numbers( text ):
    """One-hot encode `text` over the 27-symbol alphabet '@', 'A'..'Z'.

    Each character becomes 27 consecutive entries with a single 1 at position
    ord(char) - ord('@'); the per-character vectors are concatenated.

    Returns:
        1-D float array of length 27 * len(text).

    Raises:
        ValueError: if `text` contains a character outside '@'..'Z'. Without
            this check, characters with code points 38..63 would produce a
            negative index that silently wraps around to a wrong class.
    """
    #ord('@') == 64 ... ord('Z') == 90
    classes = np.array([ ord(char) - ord('@') for char in text ], dtype=int)
    if classes.size and (classes.min() < 0 or classes.max() > 26):
        raise ValueError("text_to_numbers only accepts the characters '@' and 'A'-'Z'")
    # Fill a (characters x 27) grid in one step instead of allocating a
    # separate 27-entry vector per character.
    one_hot = np.zeros((len(classes), 27))
    one_hot[np.arange(len(classes)), classes] = 1
    return one_hot.reshape(-1)

def numbers_to_text( array ):
    """Decode a flat sequence of 27-way score groups into text.

    Every consecutive group of 27 values is reduced with argmax and mapped to
    chr(index + ord('@')), i.e. 0 -> '@', 1 -> 'A', ..., 26 -> 'Z'. A group
    whose values are all equal decodes to '@', because argmax returns the
    first maximum.

    The number of characters is derived from the length of `array` rather
    than from the global `input_length`, which is reassigned elsewhere in the
    notebook.

    Raises:
        ValueError: if the number of values is not a multiple of 27.
    """
    scores = np.asarray(array)
    if scores.size % 27:
        raise ValueError("expected a multiple of 27 values, got %d" % scores.size)
    winners = scores.reshape(scores.size // 27, 27).argmax(axis=1)
    return ''.join( chr(index + ord('@')) for index in winners )

In [52]:
# Round-trip sanity check: one-hot encode plot 3, decode it back, and print
# the original next to the decoded text.
text = text_corpus[3]
numbers = text_to_numbers(text)
text_d = numbers_to_text(numbers)
print(text)
print(text_d)

LASTING@JUST@@SECONDS@AND@CONSISTING@OF@TWO@SHOTS@THE@FIRST@SHOT@IS@SET@IN@A@WOOD@DURING@WINTER@THE@ACTOR@REPRESENTING@THEN@VICEPRESIDENT@THEODORE@ROOSEVELT@ENTHUSIASTICALLY@HURRIES@DOWN@A@HILLSIDE@TO
LASTING@JUST@@SECONDS@AND@CONSISTING@OF@TWO@SHOTS@THE@FIRST@SHOT@IS@SET@IN@A@WOOD@DURING@WINTER@THE@ACTOR@REPRESENTING@THEN@VICEPRESIDENT@THEODORE@ROOSEVELT@ENTHUSIASTICALLY@HURRIES@DOWN@A@HILLSIDE@TO


In [53]:
#write the network
model = keras.models.Sequential()
model.add( keras.layers.Dense(1024, activation='relu', input_shape=(input_length*27,)) )
model.add( keras.layers.Dense(512, activation='relu' ))
model.add( keras.layers.Dense(compressed_length, activation='relu' ))
model.add( keras.layers.Dense(512, activation='relu' ))
model.add( keras.layers.Dense(1024, activation='relu' ))
model.add( keras.layers.Dense(input_length*27, activation='relu' ))
model.compile( loss='binary_crossentropy', optimizer='adadelta')

In [54]:
#normalize the data and create train, validation, test subsets
all_data = np.array( [ text_to_numbers(x) for x in text_corpus ] )

In [55]:
# Shuffle with a fixed seed so the 80/20 train/validation split is identical
# on every run (an unseeded permutation produces a different split each time,
# which makes validation numbers incomparable between runs). A private
# RandomState leaves the global NumPy RNG untouched.
data_indices = np.random.RandomState(0).permutation(len(all_data))
split_at = int(len(all_data)*0.8)  # first 80% of the shuffled indices train
train = all_data[ data_indices[:split_at] ]
val = all_data[ data_indices[split_at:] ]
print('total examples : ', len(all_data))
print('total training examples : ', len(train))
print('total validation examples : ', len(val))

total examples :  34886
total training examples :  27908
total validation examples :  6978


In [56]:
# Autoencoder training: the input is also the target (train -> train).
# `val` is passed as validation data for both input and target.
history = model.fit(train, train, epochs=10, validation_data=(val,val) )

Train on 27908 samples, validate on 6978 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [64]:
# Reconstruct plot 3 with the trained autoencoder and compare it to the input.
text = text_corpus[3]
numbers = text_to_numbers(text)
# reshape(1, -1): a batch of one whose width comes from the encoding itself,
# instead of the magic number 5400 (= 200 characters * 27 classes) that goes
# stale whenever input_length or the alphabet size changes.
numbers = numbers.reshape(1, -1)
output = model.predict(numbers)
pred = numbers_to_text(output[0])
print('Original :', text)
print('Predicted :', pred)

Original : LASTING@JUST@@SECONDS@AND@CONSISTING@OF@TWO@SHOTS@THE@FIRST@SHOT@IS@SET@IN@A@WOOD@DURING@WINTER@THE@ACTOR@REPRESENTING@THEN@VICEPRESIDENT@THEODORE@ROOSEVELT@ENTHUSIASTICALLY@HURRIES@DOWN@A@HILLSIDE@TO
Predicted : THE@@@@@@M@@@@@@@@@@@@@@@@@@@@@@@@@C@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@@@@@@@@@@@@@@@@EE@@@@@@@@@@E@@@@@@@@@@@@@@@@E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


In [65]:
# Second fit call on the same `model` object with the same data and settings.
# Presumably this continues from the weights left by the previous fit
# (assumes `model` was not rebuilt in between - verify the execution order).
history_2 = model.fit(train, train, epochs=10, validation_data=(val,val) )

Train on 27908 samples, validate on 6978 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [66]:
# Reconstruct plot 3 again after the additional training epochs.
text = text_corpus[3]
numbers = text_to_numbers(text)
# reshape(1, -1): a batch of one whose width comes from the encoding itself,
# instead of the magic number 5400 (= 200 characters * 27 classes) that goes
# stale whenever input_length or the alphabet size changes.
numbers = numbers.reshape(1, -1)
output = model.predict(numbers)
pred = numbers_to_text(output[0])
print('Original :', text)
print('Predicted :', pred)

Original : LASTING@JUST@@SECONDS@AND@CONSISTING@OF@TWO@SHOTS@THE@FIRST@SHOT@IS@SET@IN@A@WOOD@DURING@WINTER@THE@ACTOR@REPRESENTING@THEN@VICEPRESIDENT@THEODORE@ROOSEVELT@ENTHUSIASTICALLY@HURRIES@DOWN@A@HILLSIDE@TO
Predicted : FN@@@@@@@@@@@@@@@@@@@@@@@@@@D@@@@@@@@M@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@O@@@@@M@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@@@@@@@@@@@@@@@@@@@@@@@L@@@@@@@


### No luck!!! Now it's going to be just alphabetical characters!!

In [67]:
input_length = 200
compressed_length = 100

data_file = '/Users/gursharan/Desktop/wiki_movie_plots_deduped.csv'

# Build `text_corpus`: one string of exactly `input_length` letters per CSV
# row, using only A-Z (spaces, digits and punctuation are all dropped).
text_corpus = []
# The context manager closes the file as soon as the corpus is built; a bare
# open() would keep the handle alive for the lifetime of the kernel.
with open(data_file, 'r') as csv_file:
    data_reader = csv.reader(csv_file)
    next(data_reader, None) #skip the header (tolerates an empty file)
    for row in data_reader:
        if not row:
            continue #csv.reader yields [] for blank lines; row[-1] would fail
        plot = row[-1].upper() #upper case only

        text = []
        for char in plot:
            if len(text) >= input_length:
                break
            #limit to A-Z only
            if 'A' <= char <= 'Z':
                text.append(char)

        # Right-pad short plots with 'A'. Padding is therefore
        # indistinguishable from a genuine trailing run of the letter A.
        plot = ''.join(text).ljust(input_length, 'A') #padded by A at last

        text_corpus.append(plot)

def text_to_numbers( text ):
    """One-hot encode `text` over the 26-letter alphabet 'A'..'Z'.

    Each character becomes 26 consecutive entries with a single 1 at position
    ord(char) - ord('A'); the per-character vectors are concatenated.

    Returns:
        1-D float array of length 26 * len(text).

    Raises:
        ValueError: if `text` contains a character outside 'A'..'Z'. Without
            this check, characters just below 'A' (for example '@') would
            produce a negative index that silently wraps around to a wrong
            letter.
    """
    #ord('A') == 65 ... ord('Z') == 90
    classes = np.array([ ord(char) - ord('A') for char in text ], dtype=int)
    if classes.size and (classes.min() < 0 or classes.max() > 25):
        raise ValueError("text_to_numbers only accepts the characters 'A'-'Z'")
    # Fill a (characters x 26) grid in one step instead of allocating a
    # separate 26-entry vector per character.
    one_hot = np.zeros((len(classes), 26))
    one_hot[np.arange(len(classes)), classes] = 1
    return one_hot.reshape(-1)

def numbers_to_text( array ):
    """Decode a flat sequence of 26-way score groups into text.

    Every consecutive group of 26 values is reduced with argmax and mapped to
    chr(index + ord('A')), i.e. 0 -> 'A', ..., 25 -> 'Z'. A group whose
    values are all equal decodes to 'A', because argmax returns the first
    maximum.

    The number of characters is derived from the length of `array` rather
    than from the global `input_length`, which is reassigned elsewhere in the
    notebook.

    Raises:
        ValueError: if the number of values is not a multiple of 26.
    """
    scores = np.asarray(array)
    if scores.size % 26:
        raise ValueError("expected a multiple of 26 values, got %d" % scores.size)
    winners = scores.reshape(scores.size // 26, 26).argmax(axis=1)
    return ''.join( chr(index + ord('A')) for index in winners )

In [68]:
# Round-trip sanity check: one-hot encode plot 3, decode it back, and print
# the original next to the decoded text.
text = text_corpus[3]
numbers = text_to_numbers(text)
text_d = numbers_to_text(numbers)
print(text)
print(text_d)

LASTINGJUSTSECONDSANDCONSISTINGOFTWOSHOTSTHEFIRSTSHOTISSETINAWOODDURINGWINTERTHEACTORREPRESENTINGTHENVICEPRESIDENTTHEODOREROOSEVELTENTHUSIASTICALLYHURRIESDOWNAHILLSIDETOWARDSATREEINTHEFOREGROUNDHEFALL
LASTINGJUSTSECONDSANDCONSISTINGOFTWOSHOTSTHEFIRSTSHOTISSETINAWOODDURINGWINTERTHEACTORREPRESENTINGTHENVICEPRESIDENTTHEODOREROOSEVELTENTHUSIASTICALLYHURRIESDOWNAHILLSIDETOWARDSATREEINTHEFOREGROUNDHEFALL


In [73]:
#write the network
model = keras.models.Sequential()
model.add( keras.layers.Dense(1024, activation='relu', input_shape=(input_length*26,)) )
model.add( keras.layers.Dense(512, activation='relu' ))
model.add( keras.layers.Dense(compressed_length, activation='relu' ))
model.add( keras.layers.Dense(512, activation='relu' ))
model.add( keras.layers.Dense(1024, activation='relu' ))
model.add( keras.layers.Dense(input_length*26, activation='relu' ))
model.compile( loss='mae', optimizer='adadelta', metrics=['mse'] )

In [74]:
#normalize the data and create train, validation, test subsets
all_data = np.array( [ text_to_numbers(x) for x in text_corpus ] )

In [75]:
# Shuffle with a fixed seed so the 80/20 train/validation split is identical
# on every run (an unseeded permutation produces a different split each time,
# which makes validation numbers incomparable between runs). A private
# RandomState leaves the global NumPy RNG untouched.
data_indices = np.random.RandomState(0).permutation(len(all_data))
split_at = int(len(all_data)*0.8)  # first 80% of the shuffled indices train
train = all_data[ data_indices[:split_at] ]
val = all_data[ data_indices[split_at:] ]
print('total examples : ', len(all_data))
print('total training examples : ', len(train))
print('total validation examples : ', len(val))

total examples :  34886
total training examples :  27908
total validation examples :  6978


In [76]:
# Autoencoder training: the input is also the target (train -> train).
# `val` is passed as validation data for both input and target.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt
# during epoch 4, so `history` was presumably not reassigned by that run and
# the model used afterwards was only partially trained.
history = model.fit(train, train, epochs=10, validation_data=(val,val) )

Train on 27908 samples, validate on 6978 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [78]:
# Reconstruct plot 3 with the autoencoder and compare it to the input.
text = text_corpus[3]
numbers = text_to_numbers(text)
# reshape(1, -1): a batch of one whose width comes from the encoding itself,
# instead of the magic number 5200 (= 200 characters * 26 classes) that goes
# stale whenever input_length or the alphabet size changes.
numbers = numbers.reshape(1, -1)
output = model.predict(numbers)
pred = numbers_to_text(output[0])
print('Original :', text)
print('Predicted :', pred)

Original : LASTINGJUSTSECONDSANDCONSISTINGOFTWOSHOTSTHEFIRSTSHOTISSETINAWOODDURINGWINTERTHEACTORREPRESENTINGTHENVICEPRESIDENTTHEODOREROOSEVELTENTHUSIASTICALLYHURRIESDOWNAHILLSIDETOWARDSATREEINTHEFOREGROUNDHEFALL
Predicted : AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA


## NEED TO GIVE IT ANOTHER TRY!!!!