In [1]:
import os
import pandas as pd
import numpy as np
import nltk
from collections import namedtuple
from itertools import groupby

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Read subtitles file, and convert to pandas

In [3]:
def load_and_parse_subtitle(pathdir):
    with open(pathdir, 'rb') as f:
        res = [list(g) for b,g in groupby(f, lambda x: bool(x.strip())) if b]

    Subtitle = namedtuple('Subtitle', 'number start end content at_minute at_seconds')

    subs = []
    number = 0
    for sub in res:
        if len(sub) >= 3: # not strictly necessary, but better safe than sorry
            sub = [x.strip() for x in sub]
            try:
                number = sub[0].decode("UTF-8")
            except:
                number += 1
            start_end = sub[1].decode("UTF-8")
            content = sub[2]
            if len(start_end.split(' --> ')) == 2:
                start, end = start_end.split(' --> ') # e.g. 02:14:53,085

                if len(start) >= 12 and len(end) >= 12:
                    start = start[:12] #for truncating unnecessary fields, if any
                    end = end[:12] #for truncating unnecessary fields, if any
                    try:
                        at_minute = int(start[:2]) * 60 + int(start[3:5])
                        at_seconds = int(start[:2]) * 3600 + int(start[3:5]) * 60 + int(start[6:8])
                    except:
                        at_minute = 0
                        at_seconds = 0
                        #continue
                    subs.append(Subtitle(number, start, end, content, at_minute, at_seconds))

    return subs

In [46]:
def process_title(title_name):
    result = title_name.replace(' (IMPAIRED).srt', '') 
    result = result.replace('"', '')
    return result

In [73]:
def process_subtitles(dirs):
    directory = [dirname[0] for dirname in os.walk(dirs)][1:]
    
    for dirname in directory:
        filelist = [filename for filename in os.listdir(dirname)]
        print(len(filelist))
        for filename in filelist:
            pathdir = (dirname + '/' + filename)
            subs = load_and_parse_subtitle(pathdir)
            
            if len(subs) <= 0:
                continue
            word_count = 0
            for sub in subs:
                word_count += len(str(sub.content).split(" "))

            movie_time_minute = 60 * int(subs[-1].start.split(":")[0]) + int(subs[-2].start.split(":")[1])
#             print(process_title(filename), subs[-1].start.split(":"), subs[-1].start.split(":"))

            word_per_minute = word_count / movie_time_minute
            dialog_per_minute = len(subs) / movie_time_minute

            tmp = {'title': process_title(filename), 'word per min': word_per_minute, 'dialog_per_min': dialog_per_minute,
                   'genre': dirname.split('/')[1]}
            datasets.append(tmp)
        

In [102]:
datasets = []
process_subtitles('Subtitles/')

496
583
601
609
600
252
462
588
444


In [104]:
df_ori = pd.DataFrame(datasets).sample(frac=1).reset_index(drop=True)

In [105]:
def remove_punc(df):
    df['title'] = df['title'].str.replace('[^\w\s]', '')
    df['title'] = df['title'].str.lower()

In [106]:
def preprocess_title(df):
    remove_punc(df)
    df['title'] = df['title'].str.lower()
    df['tokenized_title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)

In [215]:
df = df_ori.copy()
df.drop_duplicates(subset='title')
preprocess_title(df)

In [223]:
df_ori[df_ori['title']=='Jersey Girl']

Unnamed: 0,title,word per min,dialog_per_min,genre
5,Jersey Girl,75.148515,15.0,Comedy
230,Jersey Girl,75.148515,15.0,Romance


In [216]:
df['genre'].value_counts()

Romance      609
Comedy       598
Horror       595
Action       586
War          579
Crime        496
Musical      462
Western      444
Adventure    245
Name: genre, dtype: int64

## TrainTestSplit

In [23]:
X = df.drop(columns='genre')
y = df['genre']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=126)

In [38]:
y_train.value_counts()

Horror       491
Romance      486
Action       481
Comedy       480
War          465
Crime        387
Musical      364
Western      347
Adventure    190
Name: genre, dtype: int64

## Feature Engineering

In [24]:
# Word2Vec
model = Word2Vec(df, size=150, window=10, min_count=2, workers=10, iter=10)

In [25]:
# tfidf
def tfidf(dataset_train, dataset_test):
#     print(dataset_train.shape)
    vectorizer = TfidfVectorizer(input='content')
    features_transformed = vectorizer.fit_transform(dataset_train['title'])
    features_test_transformed = vectorizer.transform(dataset_test['title'])
    df_tfidf = pd.DataFrame(features_transformed.toarray(), columns=vectorizer.get_feature_names())
    df_tfidf_test = pd.DataFrame(features_test_transformed.toarray(), columns=vectorizer.get_feature_names())
#     print(df_tfidf.shape)
    new_df = dataset_train.drop(['title', 'tokenized_title'], axis=1).reset_index(drop=True)
    new_test_df = dataset_test.drop(['title', 'tokenized_title'], axis=1).reset_index(drop=True)
#     print(new_df.shape)
    df_tfidf = pd.concat([df_tfidf, new_df], axis=1)
    df_tfidf_test = pd.concat([df_tfidf_test, new_test_df], axis=1)
#     print(df_tfidf.shape)
    return df_tfidf, df_tfidf_test

In [26]:
tfidf_train, tfidf_test = tfidf(X_train, X_test)

# ML Model

In [27]:
classifier = SVC(C=100)
classifier.fit(tfidf_train, y_train)
y_pred = classifier.predict(tfidf_test)

In [28]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

      Action       0.22      0.16      0.19       141
   Adventure       0.00      0.00      0.00         0
      Comedy       0.61      0.24      0.34       302
       Crime       0.00      0.00      0.00         0
      Horror       0.61      0.23      0.34       272
     Musical       0.32      0.84      0.46        37
     Romance       0.25      0.21      0.23       145
         War       0.07      0.35      0.12        23
     Western       0.03      1.00      0.06         3

    accuracy                           0.25       923
   macro avg       0.23      0.34      0.19       923
weighted avg       0.47      0.25      0.30       923



  _warn_prf(average, modifier, msg_start, len(result))


In [299]:
classifier = AdaBoostClassifier(n_estimators=100)
classifier.fit(tfidf_train, y_train)
y_pred = classifier.predict(tfidf_test)

ValueError: Found input variables with inconsistent numbers of samples: [3691, 2287]

In [30]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

      Action       0.05      0.16      0.07        31
   Adventure       0.05      0.14      0.08        21
      Comedy       0.72      0.22      0.34       389
       Crime       0.00      0.00      0.00         1
      Horror       0.66      0.20      0.30       351
     Musical       0.33      0.84      0.47        38
     Romance       0.03      0.40      0.06        10
         War       0.23      0.81      0.36        32
     Western       0.39      0.76      0.52        50

    accuracy                           0.28       923
   macro avg       0.27      0.39      0.24       923
weighted avg       0.60      0.28      0.32       923



## Combining with movie plot

In [61]:
df_ori

Unnamed: 0,title,word per min,dialog_per_min,genre,tokenized_title
0,mash hawkeye,73.541667,16.750000,War,"[mash, hawkeye]"
1,mr mrs smith srt,71.666667,13.720430,Comedy,"[mr, mrs, smith, srt]"
2,game of thrones the wars to come,54.660000,13.920000,Romance,"[game, of, thrones, the, wars, to, come]"
3,ninja assassin,37.933333,8.811111,Crime,"[ninja, assassin]"
4,the boy,50.021978,12.021978,Horror,"[the, boy]"
...,...,...,...,...,...
18451,dilwale dulhania le jayenge,37.505319,7.558511,Musical,"[dilwale, dulhania, le, jayenge]"
18452,hell on wheels timshel,47.642857,11.333333,Western,"[hell, on, wheels, timshel]"
18453,war and remembrance part i,53.022388,13.179104,War,"[war, and, remembrance, part, i]"
18454,dawn of the dead,42.121212,10.676768,Action,"[dawn, of, the, dead]"


In [347]:
df_plot = pd.read_csv('wiki_movie_plots.csv')[['Title','Genre','Plot', 'Origin']]

In [348]:
df_plot

Unnamed: 0,Title,Genre,Plot,Origin
0,Kansas Saloon Smashers,unknown,"A bartender is working at a saloon, serving dr...",American
1,Love by the Light of the Moon,unknown,"The moon, painted with a smiling face hangs ov...",American
2,The Martyred Presidents,unknown,"The film, just over a minute long, is composed...",American
3,"Terrible Teddy, the Grizzly King",unknown,Lasting just 61 seconds and consisting of two ...,American
4,Jack and the Beanstalk,unknown,The earliest known adaptation of the classic f...,American
...,...,...,...,...
34881,The Water Diviner,unknown,"The film begins in 1919, just after World War ...",Turkish
34882,Çalgı Çengi İkimiz,comedy,"Two musicians, Salih and Gürkan, described the...",Turkish
34883,Olanlar Oldu,comedy,"Zafer, a sailor living with his mother Döndü i...",Turkish
34884,Non-Transferable,romantic comedy,The film centres around a young woman named Am...,Turkish


In [349]:
df_plot.rename(columns = {'Title':'title', 'Genre':'genre', 
                              'Plot':'plot', 'Origin':'origin'}, inplace = True) 

In [344]:
df_plot = df_plot[df_plot['origin']=='American']

In [350]:
df_plot['origin'].unique()

array(['American', 'Australian', 'Bangladeshi', 'British', 'Canadian',
       ' or Up with Dead People', 'Chinese', 'Egyptian', 'Hong Kong',
       'Filipino', 'Assamese', 'Bengali', 'Bollywood', 'Kannada',
       'Malayalam', 'Marathi', 'Punjabi', 'Tamil', 'Telugu', 'Japanese',
       'Gate: Fuka Ryōiki no Déjà vu', 'Malaysian', 'Maldivian',
       'Russian', 'South_Korean', 'Turkish'], dtype=object)

In [358]:
df_plot = df_plot[(df_plot['origin']=='American') | (df_plot['origin']=='British')]

In [359]:
df_plot = df_plot.groupby('title').agg({'genre':'first', 'origin': 'first',
                             'plot': '. '.join}).reset_index()

In [360]:
rslt_df = df_plot[df_plot.title.isin(df_ori.title.values)].reset_index(drop=True)

In [361]:
rslt_df

Unnamed: 0,title,genre,origin,plot
0,'71,unknown,British,"Gary Hook, a new recruit to the British Army, ..."
1,10 Cloverfield Lane,science fiction psychological thriller,American,"After breaking up with her boyfriend Ben, Mich..."
2,10 Things I Hate About You,romantic comedy,American,"Cameron James, a new student at Padua High Sch..."
3,100 Rifles,western,American,"In 1912 Sonora, Mexico, African American Lyede..."
4,101 Dalmatians II: Patch's London Adventure,animated,American,The Radcliffe family and their 101 Dalmatians ...
...,...,...,...,...
2200,Your Highness,"comedy, fantasy",American,Thadeous and Fabious are sons of King Tallious...
2201,Youth in Revolt,comedy-drama,American,"Shy, socially inept teenager Nick Twisp lives ..."
2202,Zelig,mockumentary,American,"Set in the 1920s and 1930s, the film focuses o..."
2203,Zookeeper,"comedy, family",American,A zookeeper named Griffin Keyes (Kevin James) ...


In [362]:
rslt_df = rslt_df.drop(columns='genre')

In [363]:
df_merge = pd.merge(df_ori, rslt_df, on="title")

In [415]:
df_merge[12:]

Unnamed: 0,title,word per min,dialog_per_min,genre,origin,plot,tokenized_text
12,Crimson Tide,58.057143,13.933333,War,American,in post soviet russia civil war erupts as a r...,post soviet russia civil war erupts result ong...
13,The Return of a Man Called Horse,15.275000,3.775000,Western,American,trappers with government support force the yel...,trapper government support force yellow hand s...
14,Lone Wolf McQuade,30.878788,7.030303,Western,American,the main character j j mcquade norris is ...,main character j j mcquade norris former marin...
15,Edge of Tomorrow,83.841121,17.429907,Action,American,in an alien race called mimics arrive in...,alien race called mimic arrive germany storm c...
16,Edge of Tomorrow,83.841121,17.429907,Adventure,American,in an alien race called mimics arrive in...,alien race called mimic arrive germany storm c...
...,...,...,...,...,...,...,...
2773,Closer,57.350000,17.530000,Romance,American,in the opening scene year old alice ayres...,opening scene year old alice ayres portman dan...
2774,High Plains Drifter,45.805825,9.475728,Western,American,a mysterious stranger rides out of the desert ...,mysterious stranger ride desert isolated minin...
2775,Blown Away,50.054545,11.709091,Action,American,after her mother dies in a mysterious car acci...,mother dy mysterious car accident year old meg...
2776,Ghost Town,34.400000,10.062500,Western,American,the film begins as married new york city busin...,film begin married new york city businessman f...


In [364]:
df_merge.drop_duplicates(subset='title')

Unnamed: 0,title,word per min,dialog_per_min,genre,origin,plot
0,Malcolm X,65.005208,15.072917,Romance,American,Malcolm X follows the life of African-American...
1,The Postman Always Rings Twice,64.321429,12.973214,Romance,American,Frank Chambers (John Garfield) is a hobo who s...
2,The Transporter,41.000000,12.351648,Crime,American,Frank Martin (Jason Statham) is a highly skill...
4,Titanic,55.365385,11.730769,Romance,American,"At the last minute, a wealthy American expatri..."
5,Underworld: Awakening,42.097561,11.500000,Action,American,A few years after the events of the second fil...
...,...,...,...,...,...,...
2773,Closer,57.350000,17.530000,Romance,American,"In the opening scene, 24-year-old ""Alice Ayres..."
2774,High Plains Drifter,45.805825,9.475728,Western,American,A mysterious stranger rides out of the desert ...
2775,Blown Away,50.054545,11.709091,Action,American,After her mother dies in a mysterious car acci...
2776,Ghost Town,34.400000,10.062500,Western,American,The film begins as married New York City busin...


## Preprocessing

In [365]:
def clean_plot(dataset):
    dataset['plot'] = dataset['plot'].str.replace('[^a-zA-Z]', ' ') # remove non alphabet character
    dataset['plot'] = dataset['plot'].str.lower() # lower all text

In [366]:
def preprocess_plot(dataset):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    dataset['tokenized_text'] = dataset.apply(lambda row: nltk.word_tokenize(row['plot']), axis=1)
    dataset['tokenized_text'] = dataset['tokenized_text'].apply(lambda word: [item for item in word if not item in stop_words])
    dataset['tokenized_text'] = dataset['tokenized_text'].apply(lambda word: [lemmatizer.lemmatize(item) for item in word])
    dataset['tokenized_text']=[" ".join(text) for text in dataset['tokenized_text'].values]

In [367]:
clean_plot(df_merge)
preprocess_plot(df_merge)

In [368]:
df_merge

Unnamed: 0,title,word per min,dialog_per_min,genre,origin,plot,tokenized_text
0,Malcolm X,65.005208,15.072917,Romance,American,malcolm x follows the life of african american...,malcolm x follows life african american activi...
1,The Postman Always Rings Twice,64.321429,12.973214,Romance,American,frank chambers john garfield is a hobo who s...,frank chamber john garfield hobo stop rural di...
2,The Transporter,41.000000,12.351648,Crime,American,frank martin jason statham is a highly skill...,frank martin jason statham highly skilled driv...
3,The Transporter,41.000000,12.351648,Action,American,frank martin jason statham is a highly skill...,frank martin jason statham highly skilled driv...
4,Titanic,55.365385,11.730769,Romance,American,at the last minute a wealthy american expatri...,last minute wealthy american expatriate europe...
...,...,...,...,...,...,...,...
2773,Closer,57.350000,17.530000,Romance,American,in the opening scene year old alice ayres...,opening scene year old alice ayres portman dan...
2774,High Plains Drifter,45.805825,9.475728,Western,American,a mysterious stranger rides out of the desert ...,mysterious stranger ride desert isolated minin...
2775,Blown Away,50.054545,11.709091,Action,American,after her mother dies in a mysterious car acci...,mother dy mysterious car accident year old meg...
2776,Ghost Town,34.400000,10.062500,Western,American,the film begins as married new york city busin...,film begin married new york city businessman f...


In [383]:
X = df_merge.drop(columns=['genre', 'title', 'plot','origin'])
y = df_merge['genre']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=126)

In [384]:
y_train

730     Adventure
887        Comedy
2336      Western
158         Crime
485         Crime
          ...    
196        Action
336        Action
643        Comedy
1793    Adventure
2671      Romance
Name: genre, Length: 2222, dtype: object

In [398]:
# tfidf
def tfidf(dataset_train, dataset_test):
    vectorizer = TfidfVectorizer(min_df=10, max_features=20000, smooth_idf=True, norm="l2", sublinear_tf=False, ngram_range=(1,4))
    print(dataset_train.shape)
    features_transformed = vectorizer.fit_transform(dataset_train['tokenized_text'])
    features_test_transformed = vectorizer.transform(dataset_test['tokenized_text'])
    
    df_tfidf = pd.DataFrame(features_transformed.toarray(), columns=vectorizer.get_feature_names())
    df_tfidf_test = pd.DataFrame(features_test_transformed.toarray(), columns=vectorizer.get_feature_names())

    new_df = dataset_train.drop(['tokenized_text'], axis=1).reset_index(drop=True)
    new_test_df = dataset_test.drop(['tokenized_text'], axis=1).reset_index(drop=True)

#     df_tfidf = pd.concat([df_tfidf, new_df], axis=1)
#     df_tfidf_test = pd.concat([df_tfidf_test, new_test_df], axis=1)

    return df_tfidf, df_tfidf_test

In [399]:
tfidf_plot_train, tfidf_plot_test = tfidf(X_train, X_test)

(2222, 3)


In [400]:
tfidf_plot_train.shape

(2222, 10056)

In [401]:
tfidf_plot_train

Unnamed: 0,aaron,abandon,abandoned,abandoning,abby,abdomen,abduct,abducted,abduction,abducts,...,younger brother,younger self,younger sister,youngest,youth,zero,zeus,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [402]:
classifier = SVC(C=100)
classifier.fit(tfidf_plot_train, y_train)
y_pred = classifier.predict(tfidf_plot_test)

In [403]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

      Action       0.25      0.19      0.21       112
   Adventure       0.05      0.07      0.06        30
      Comedy       0.20      0.19      0.20       111
       Crime       0.36      0.31      0.33        80
      Horror       0.38      0.50      0.43        42
     Musical       0.12      0.45      0.20        11
     Romance       0.41      0.26      0.32       118
         War       0.40      0.52      0.45        33
     Western       0.38      0.89      0.53        19

    accuracy                           0.29       556
   macro avg       0.28      0.38      0.30       556
weighted avg       0.30      0.29      0.28       556



In [427]:
from xgboost import XGBClassifier

In [431]:
xgbc = XGBClassifier(learning_rate=0.01)
xgbc.fit(tfidf_plot_train, y_train)
ypred = xgbc.predict(tfidf_plot_test)

In [432]:
print(classification_report(ypred, y_test))

              precision    recall  f1-score   support

      Action       0.25      0.21      0.23       100
   Adventure       0.05      0.07      0.06        29
      Comedy       0.20      0.25      0.22        85
       Crime       0.29      0.30      0.29        67
      Horror       0.46      0.45      0.46        58
     Musical       0.20      0.31      0.24        26
     Romance       0.39      0.29      0.33       104
         War       0.55      0.42      0.47        55
     Western       0.49      0.69      0.57        32

    accuracy                           0.31       556
   macro avg       0.32      0.33      0.32       556
weighted avg       0.33      0.31      0.31       556



In [414]:
max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie revi

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,757,761
Trainable params: 2,757,761
Non-trainable params: 0
_________________________________________________________________


In [424]:
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(tfidf_plot_train, y_train, batch_size=32, epochs=2)

RuntimeError: Data adapters should be mutually exclusive for handling inputs. Found multiple adapters [<class 'tensorflow.python.keras.engine.data_adapter.TensorLikeDataAdapter'>, <class 'tensorflow.python.keras.engine.data_adapter.GeneratorDataAdapter'>] to handle input: <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.series.Series'>

In [419]:
X_train = keras.preprocessing.sequence.pad_sequences(X_train['tokenized_text'], maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test['tokenized_text'], maxlen=maxlen)

ValueError: invalid literal for int() with base 10: ' looking mid credit scene pym show van dyne new wasp prototype suit offer post credit scene wilson steve rogers bucky barnes custody unable contact tony stark accord n wilson mention know someone hel

In [420]:
X_train

Unnamed: 0,word per min,dialog_per_min,tokenized_text
730,55.870690,15.534483,scientist hank pym resigns h e l discovering a...
887,59.670213,15.531915,steve barker johnny knoxville hate job two yea...
2336,63.162393,13.111111,miscarriage justice land willis newton prison ...
158,50.268817,13.451613,victor maynard bill nighy experienced efficien...
485,42.177419,11.774194,dr hannibal lecter attends orchestral performa...
...,...,...,...
196,42.652482,13.297872,harry tasker lead double life wife helen daugh...
336,73.185185,15.546296,london british mob bos lenny cole tom wilkinso...
643,81.400000,22.380952,peter sanderson steve martin uptight workaholi...
1793,26.938776,7.755102,world war ii pevensie child peter susan edmund...
