In [1]:
import pandas as pd

In [2]:
# Load the raw movie-plot dataset; the path is relative to the kernel's working directory.
df = pd.read_csv("wiki_movie_plots_deduped.csv")

In [3]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
import sqlite3
# Open a connection to the local SQLite file 'movie_plots.db'; `con` is the
# handle later passed to df.to_sql when the processed frame is saved.
con = sqlite3.connect('movie_plots.db')

Cleaning function

In [5]:
import re
def clean(s):
    """Normalize a raw plot string for downstream NLP.

    Steps, in order:
      1. Turn the ``<lb>`` / ``<tab>`` placeholders and HTML ``<br>`` tags
         into real newline / tab characters.
      2. Unescape the HTML entities ``&lt;``, ``&gt;`` and ``&amp;``.
      3. Drop markdown-style ``(http://...)`` link targets and bare URLs.
      4. Collapse runs of underscores into a single space and runs of double
         quotes into a single double quote.

    Args:
        s: The text to clean. ``None`` and float NaN (the missing-value marker
            pandas uses in object columns) yield an empty string; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned text as a ``str``.
    """
    # Missing values would otherwise crash on .replace(); NaN is the only
    # float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s)

    s = s.replace('<lb>', "\n")
    # A real tab character ("\i" is not a valid escape and would insert a
    # literal backslash followed by "i").
    s = s.replace('<tab>', "\t")
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    # Second pass resolves double-escaped ampersands such as "&amp;amp;".
    s = s.replace("&amp;", "&")
    # markdown urls: "(http://...)" or "(https://...)"
    s = re.sub(r'\(https?://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https?://[^\s]*', "", s)
    s = re.sub(r'_+', ' ', s)
    s = re.sub(r'"+', '"', s)
    return s

In [6]:
# Row count of the full dataset, taken before the frame is cut down below
num_rows = df.shape[0]
num_rows

34886

In [7]:
# Keep only the first 5000 rows (presumably to keep the NLP step's runtime
# manageable - confirm). .copy() makes the subset an independent frame, so the
# column assignments in later cells neither raise SettingWithCopyWarning nor
# depend on view-vs-copy semantics of the slice.
df = df[:5000].copy()

In [8]:
# Add an empty-string placeholder column that will hold the cleaned plot text.
df["plot_clean"] = ''

In [9]:
# Capture the per-column dtypes and display them as the cell output.
col_types = df.dtypes
col_types

Release Year         int64
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Wiki Page           object
Plot                object
plot_clean          object
dtype: object

In [10]:
# Run clean() over every plot in a single pass. Series.map preserves the index,
# so each cleaned text lands on the row its raw plot came from.
df["plot_clean"] = df["Plot"].map(clean)

In [11]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,plot_clean
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,The earliest known adaptation of the classic f...


NLP

Load spaCy

In [12]:
import spacy 
# Load the "en_core_web_sm" pipeline; `nlp` is the callable applied to each plot below.
# NOTE(review): the model package must be installed separately from spaCy itself.
nlp = spacy.load("en_core_web_sm")

Perform NLP on the dataset

In [13]:
# Plots with this many characters or more are skipped.
# NOTE(review): presumably mirrors spaCy's default nlp.max_length - confirm.
MAX_PLOT_CHARS = 1000000

# Collect one record per processed row and attach all feature columns in a
# single step afterwards, instead of creating and filling them cell by cell
# with df.at inside the loop.
row_labels = []
records = []
for i, raw in df["plot_clean"].items():
    if i % 1000 == 0:
        print(i)  # coarse progress indicator
    # Skip empty plots and plots at or above the length limit.
    if not (raw and len(str(raw)) < MAX_PLOT_CHARS):
        continue

    doc = nlp(str(raw))
    lemmas = [token.lemma_ for token in doc]
    adjectives = [token.lemma_ for token in doc if token.pos_ == "ADJ"]
    nouns = [token.lemma_ for token in doc if token.pos_ in ("NOUN", "PROPN")]
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]

    row_labels.append(i)
    records.append({
        "plot_lemma": " ".join(lemmas),
        "plot_nouns": " ".join(nouns),
        "plot_adjectives": " ".join(adjectives),
        "plot_verbs": " ".join(verbs),
        "plot_nav": " ".join(nouns + adjectives + verbs),
        "no_tokens": len(lemmas),
    })

if records:
    nlp_features = pd.DataFrame(records, index=row_labels)
    # float64 leaves room for NaN on skipped rows and keeps the column's dtype
    # independent of whether any row was skipped.
    nlp_features["no_tokens"] = nlp_features["no_tokens"].astype("float64")
    # Column assignment aligns on the index; rows without a record get NaN.
    for col in nlp_features.columns:
        df[col] = nlp_features[col]

0
1000
2000
3000
4000


In [14]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,plot_clean,plot_lemma,plot_nouns,plot_adjectives,plot_verbs,plot_nav,no_tokens
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","A bartender is working at a saloon, serving dr...","a bartender be work at a saloon , serve drink ...",bartender drink customer man bucket beer Carri...,saloon irish irish,work serve fill burst assault pull dump begin ...,bartender drink customer man bucket beer Carri...,96.0
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","The moon, painted with a smiling face hangs ov...","the moon , paint with a smile face hang over a...",moon face park night couple fence learn railin...,young big last well,paint smile hang walk look smile embrace get s...,moon face park night couple fence learn railin...,99.0
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The film, just over a minute long, is composed...","the film , just over a minute long , be compos...",film minute shot girl base altar tomb face cam...,first portal second long,compose sit hide view run,film minute shot girl base altar tomb face cam...,94.0
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,Lasting just 61 seconds and consisting of two ...,"last just 61 second and consist of two shot , ...",second shot shot wood winter actor vice - pres...,first other common second different,last consist set represent hurry fall right co...,second shot shot wood winter actor vice - pres...,183.0
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,The earliest known adaptation of the classic f...,the early know adaptation of the classic fairy...,adaptation fairytale film Jack cow bean mother...,early classic deposed giant able,know show trade force drop force sleep visit s...,adaptation fairytale film Jack cow bean mother...,157.0


Save to database

In [15]:
# Persist the enriched frame as table 'plot_nlp'. if_exists="replace" keeps the
# cell re-runnable: with the default ("fail") a second run raises ValueError
# because the table already exists.
df.to_sql('plot_nlp', con, if_exists="replace")

5000

In [16]:
# Silence all warnings (deprecation notices etc.) from this point on.
# NOTE(review): the blanket "ignore" filter also hides warnings that may point
# at real problems; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

Library import & Settings

In [17]:
# import pandas, numpy
import pandas as pd
import numpy as np

# adjust pandas display
pd.options.display.max_columns = 30
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.precision = 2
# None means "never truncate cell contents". The old -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.options.display.max_colwidth = None

In [18]:
# Import matplotlib and seaborn and adjust some defaults
# IPython magics: draw figures inline in the notebook and emit them as SVG.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
# Default resolution (dots per inch) for figures created from here on.
plt.rcParams['figure.dpi'] = 100

import seaborn as sns
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set_style("whitegrid")

Basic Properties of the dataset

In [20]:
df.dtypes

Release Year        int64  
Title               object 
Origin/Ethnicity    object 
Director            object 
Cast                object 
Genre               object 
Wiki Page           object 
Plot                object 
plot_clean          object 
plot_lemma          object 
plot_nouns          object 
plot_adjectives     object 
plot_verbs          object 
plot_nav            object 
no_tokens           float64
dtype: object

In [23]:
# Show four reproducibly sampled rows, restricted to the metadata columns
metadata_cols = ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre']
df[metadata_cols].sample(n=4, random_state=42)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre
1501,1933,Ex-Lady,American,Robert Florey,"Bette Davis, Gene Raymond, Claire Dodd",comedy
2586,1939,The Hound of the Baskervilles,American,Sidney Lanfield,"Basil Rathbone, Nigel Bruce, Richard Greene",mystery
2653,1939,Raffles,American,Sam Wood,"David Niven, Olivia De Havilland",crime comedy
1055,1931,24 Hours,American,Marion Gering,"Kay Francis, Miriam Hopkins, Regis Toomey",drama


In [24]:
len(df)

5000

In [26]:
df.count()

Release Year        5000
Title               5000
Origin/Ethnicity    5000
Director            5000
Cast                4895
Genre               5000
Wiki Page           5000
Plot                5000
plot_clean          5000
plot_lemma          5000
plot_nouns          5000
plot_adjectives     5000
plot_verbs          5000
plot_nav            5000
no_tokens           5000
dtype: int64

In [27]:
# size info, including memory consumption
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Release Year      5000 non-null   int64  
 1   Title             5000 non-null   object 
 2   Origin/Ethnicity  5000 non-null   object 
 3   Director          5000 non-null   object 
 4   Cast              4895 non-null   object 
 5   Genre             5000 non-null   object 
 6   Wiki Page         5000 non-null   object 
 7   Plot              5000 non-null   object 
 8   plot_clean        5000 non-null   object 
 9   plot_lemma        5000 non-null   object 
 10  plot_nouns        5000 non-null   object 
 11  plot_adjectives   5000 non-null   object 
 12  plot_verbs        5000 non-null   object 
 13  plot_nav          5000 non-null   object 
 14  no_tokens         5000 non-null   float64
dtypes: float64(1), int64(1), object(13)
memory usage: 52.8 MB


 Exploring Column Summaries