In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests
import pickle
import text_hammer as th

In [2]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /Users/fabio/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [3]:
utt_df = corpus.get_utterances_dataframe().drop(columns=['vectors'])
convo_df = corpus.get_conversations_dataframe().drop(columns=['vectors'])
speaker_df = corpus.get_speakers_dataframe().drop(columns=['vectors'])

In [4]:
convo_df = convo_df.reset_index()
utt_df = utt_df.reset_index()
speaker_df = speaker_df.reset_index()

In [5]:
convo_df = convo_df.rename(columns={"id": "conversation_id"})
speaker_df = speaker_df.rename(columns={"id": "speaker"})

In [6]:
dataset = pd.merge(convo_df, utt_df, on=["conversation_id"])
dataset = pd.merge(dataset, speaker_df, on=["speaker"])

In [7]:
dataframe = dataset[["id", "conversation_id", "meta.movie_idx_x", "speaker", "meta.character_name", "meta.gender", "reply_to", "text"]]
dataframe

Unnamed: 0,id,conversation_id,meta.movie_idx_x,speaker,meta.character_name,meta.gender,reply_to,text
0,L1045,L1044,m0,u0,BIANCA,f,L1044,They do not!
1,L985,L984,m0,u0,BIANCA,f,L984,I hope so.
2,L925,L924,m0,u0,BIANCA,f,L924,Let's go.
3,L872,L870,m0,u0,BIANCA,f,L871,Okay -- you're gonna need to learn how to lie.
4,L870,L870,m0,u0,BIANCA,f,,I'm kidding. You know how sometimes you just ...
...,...,...,...,...,...,...,...,...
304708,L666522,L666520,m616,u9034,VEREKER,?,L666521,So far only their scouts. But we have had repo...
304709,L666520,L666520,m616,u9034,VEREKER,?,,"Well I assure you, Sir, I have no desire to cr..."
304710,L666372,L666369,m616,u9034,VEREKER,?,L666371,I think Chelmsford wants a good man on the bor...
304711,L666370,L666369,m616,u9034,VEREKER,?,L666369,I'm to take the Sikali with the main column to...


In [8]:
dataframe['text'].values[315]

'Unwelcome.'

In [9]:
from classes.cleaner import Cleaner
C = Cleaner()

In [10]:
dataframe = C.text_preprocessing(dataframe, 'text', 'cleaned_text')

100%|██████████| 304713/304713 [00:00<00:00, 3813723.86it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = tqdm(df[column].apply(lambda x: str(x).lower()))
100%|██████████| 304713/304713 [00:00<00:00, 5191900.34it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = tqdm(df[new_col].apply(lambda x: th.cont_exp(x)))
100%|██████████| 304713/304713 [00:00<00:00, 4130205.19it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation

In [11]:
dataframe = dataframe[dataframe['cleaned_text'].map(lambda d: len(d)) > 0]


In [12]:
with open('datasets/processed/movie_scripts_df.pickle', "wb") as w:
    pickle.dump(dataframe, w)

In [13]:
with open("datasets/processed/movie_scripts_df.pickle", "rb") as r:
    dataframe = pickle.load(r)

In [14]:
dataframe

Unnamed: 0,id,conversation_id,meta.movie_idx_x,speaker,meta.character_name,meta.gender,reply_to,text,cleaned_text
0,L1045,L1044,m0,u0,BIANCA,f,L1044,They do not!,they do not
1,L985,L984,m0,u0,BIANCA,f,L984,I hope so.,i hope so
2,L925,L924,m0,u0,BIANCA,f,L924,Let's go.,let us go
3,L872,L870,m0,u0,BIANCA,f,L871,Okay -- you're gonna need to learn how to lie.,okay you re go to need to learn how to lie
4,L870,L870,m0,u0,BIANCA,f,,I'm kidding. You know how sometimes you just ...,i am kid you know how sometimes you just becom...
...,...,...,...,...,...,...,...,...,...
304708,L666522,L666520,m616,u9034,VEREKER,?,L666521,So far only their scouts. But we have had repo...,so far only their scout but we have have repor...
304709,L666520,L666520,m616,u9034,VEREKER,?,,"Well I assure you, Sir, I have no desire to cr...",well i assure you sir i have no desire to crea...
304710,L666372,L666369,m616,u9034,VEREKER,?,L666371,I think Chelmsford wants a good man on the bor...,i think chelmsford want a good man on the bord...
304711,L666370,L666369,m616,u9034,VEREKER,?,L666369,I'm to take the Sikali with the main column to...,i am to take the sikali with the main column t...


In [15]:
from classes.vectorizer import Vectorizer
V = Vectorizer()

In [16]:
dataframe['pad_vectorized_text'] = list(V.vectorize_keras_df(dataframe, 'cleaned_text'))

In [17]:
with open("datasets/processed/vectorized_scripts_df.pickle", "wb") as w:
    pickle.dump(dataframe, w)