In [None]:
# Film Corpus 2.0
# Overview: This corpus is an updated version of the Film Corpus 1.0. It contains complete texts for the scripts of 1068 films in txt files, scraped from imsdb.com on Nov, 2015 using scrapy. It also contains 960 film scripts where the dialog in the film has been separated from the scene descriptions.

# The Data: Film scripts are classified by genre,  but one film can be in multiple genres. There are fewer than 1068 separated scripts because we use our own script to automatically separate the dialog and scene descriptions.

# Corpus from: https://nlds.soe.ucsc.edu/fc2

# Only 10 movies from the genres: Action, Comedy, Drama, Romance, Thriller were selected due to size of files

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
import glob2

In [17]:
import pandas as pd
import numpy as np
import nltk
import csv
import re
import json
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [2]:
movie_meta = pd.read_csv("C:/Users/yeungf8452/MovieSummaries/movie.metadata.tsv", sep = '\t', header = None)

In [9]:
plots = []

with open("C:/Users/yeungf8452/MovieSummaries/plot_summaries.txt", 'r', encoding="utf8") as f:
       reader = csv.reader(f, dialect='excel-tab') 
       for row in reader:
            plots.append(row)

In [10]:
movie_id = []
plot = []

# extract movie Ids and plot summaries
for i in plots:
  movie_id.append(i[0])
  plot.append(i[1])

# create dataframe
movies = pd.DataFrame({'movie_id': movie_id, 'plot': plot})

In [15]:
# rename columns
movie_meta.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]

In [16]:
# change datatype of 'movie_id'
movie_meta['movie_id'] = movie_meta['movie_id'].astype(str)

# merge meta with movies
movies = pd.merge(movies, movie_meta[['movie_id', 'movie_name', 'genre']], on = 'movie_id')

movies.head()

Unnamed: 0,movie_id,plot,movie_name,genre
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",Taxi Blues,"{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci..."
1,31186339,The nation of Panem consists of a wealthy Capi...,The Hunger Games,"{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,Narasimham,"{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""..."
3,2231378,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,"{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."


In [18]:
# an empty list
genres = [] 

# extract genres
for i in movies['genre']: 
  genres.append(list(json.loads(i).values())) 

# add to 'movies' dataframe  
movies['genre_new'] = genres

In [19]:
# function for text cleaning 
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

In [20]:
# remove samples with 0 genre tags
movies_new = movies[~(movies['genre_new'].str.len() == 0)]

movies_new['clean_plot'] = movies_new['plot'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [21]:
# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

movies_new['clean_plot'] = movies_new['clean_plot'].apply(lambda x: remove_stopwords(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [23]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(movies_new['genre_new'])

# transform target variable
y = multilabel_binarizer.transform(movies_new['genre_new'])

In [24]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [25]:
# split dataset into training and validation set
xtrain, xval, ytrain, yval = train_test_split(movies_new['clean_plot'], y, test_size=0.2, random_state=9)

# create TF-IDF features
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [27]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

# fit model on train data
clf.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


In [28]:
multilabel_binarizer.inverse_transform(y_pred)[3]

('Action', 'Drama')

## BERT Transformers

In [None]:
# Transformers documentation: https://huggingface.co/transformers/
# From https://huggingface.co/transformers/task_summary.html - Sequence Classification for sentiment analysis
# pip install git+https://github.com/huggingface/transformers.git

In [31]:
from transformers import pipeline

# Defaults to distilBERT
# Light version of BERT
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
Downloading: 100%|██████████| 629/629 [00:00<00:00, 210kB/s]
Downloading: 100%|██████████| 256M/256M [01:43<00:00, 2.59MB/s]
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Downloading: 100%|██████████| 48.0/48.0 [00:00<00:00, 12.0kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 643kB/s]


In [32]:
result = classifier("I hate you")

In [33]:
result

[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]