## process genre data

In [320]:
import pandas as pd
import nltk
nltk.download('punkt')
import json
from functools import reduce
from collections import Counter
import re

[nltk_data] Downloading package punkt to /Users/kexinchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [321]:
with open("./MovieSummaries/plot_summaries.txt") as file:
    lines = file.readlines()

summary_data = []
for line in lines:
    line = line.split("\t")
    line = [x.strip() for x in line]
    if len(nltk.word_tokenize(line[1])) < 100:
        continue
    summary_data.append(line)
    
    

In [322]:
len(summary_data)

31559

In [323]:
summary_df = pd.DataFrame(summary_data, columns=['Movie ID', 'Summary'])

In [324]:
summary_df

Unnamed: 0,Movie ID,Summary
0,31186339,The nation of Panem consists of a wealthy Capi...
1,20663735,Poovalli Induchoodan is sentenced for six yea...
2,2231378,"The Lemon Drop Kid , a New York City swindler,..."
3,595909,Seventh-day Adventist Church pastor Michael Ch...
4,5272176,The president is on his way to give a speech. ...
...,...,...
31554,2867597,"An attempt to bring the famed ""Mr. Bill"" clay ..."
31555,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
31556,35102018,American Luthier focuses on Randy Parsons’ tra...
31557,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [325]:
header =  ["Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date", "Movie box office revenue", "Movie runtime", 
"Movie languages", "Movie countries", "Movie genres"]
metadata_df = pd.read_csv('./MovieSummaries/movie.metadata.tsv', sep='\t', names=header)
metadata_df = metadata_df[['Wikipedia movie ID', 'Freebase movie ID', "Movie name", "Movie genres"]]


In [326]:
metadata_df

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie genres
0,975900,/m/03vyhn,Ghosts of Mars,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,"{""/m/07s9rl0"": ""Drama""}"
...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,"{""/m/07s9rl0"": ""Drama""}"
81737,34980460,/m/0g4pl34,Knuckle,"{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0..."
81738,9971909,/m/02pygw1,Another Nice Mess,"{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}"
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,"{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."


In [327]:
df = pd.concat([metadata_df, summary_df], axis=1, join="inner")
df

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie genres,Movie ID,Summary
0,975900,/m/03vyhn,Ghosts of Mars,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",31186339,The nation of Panem consists of a wealthy Capi...
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",20663735,Poovalli Induchoodan is sentenced for six yea...
2,28463795,/m/0crgdbh,Brun bitter,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",2231378,"The Lemon Drop Kid , a New York City swindler,..."
3,9363483,/m/0285_cd,White Of The Eye,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",595909,Seventh-day Adventist Church pastor Michael Ch...
4,261236,/m/01mrr1,A Woman in Flames,"{""/m/07s9rl0"": ""Drama""}",5272176,The president is on his way to give a speech. ...
...,...,...,...,...,...,...
31554,11178030,/m/02r2pm6,A.I. Assault,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2867597,"An attempt to bring the famed ""Mr. Bill"" clay ..."
31555,29896629,/m/0fqp5zx,Santa and the Ice Cream Bunny,"{""/m/0bj8m2"": ""Children's""}",1096473,"In 1928 Hollywood, director Leo Andreyev look..."
31556,17672486,/m/047s548,The Bottom of the Bottle,"{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D...",35102018,American Luthier focuses on Randy Parsons’ tra...
31557,14132268,/m/03cvl41,Beethoven Lives Upstairs,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04xvh5...",8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [328]:
def get_genres(x):
    genre_dict = json.loads(x)
    return list(genre_dict.values())

df["Movie genres"] = df["Movie genres"].map(lambda x: get_genres(x))

In [329]:
flatlist = reduce(lambda a,b:a+b, df["Movie genres"].values.tolist())
genre_set = set(flatlist)
genre_set

{'Absurdism',
 'Acid western',
 'Action',
 'Action Comedy',
 'Action Thrillers',
 'Action/Adventure',
 'Addiction Drama',
 'Adult',
 'Adventure',
 'Adventure Comedy',
 'Airplanes and airports',
 'Albino bias',
 'Alien Film',
 'Alien invasion',
 'Americana',
 'Animal Picture',
 'Animals',
 'Animated Musical',
 'Animated cartoon',
 'Animation',
 'Anime',
 'Anthology',
 'Anthropology',
 'Anti-war',
 'Anti-war film',
 'Apocalyptic and post-apocalyptic fiction',
 'Archaeology',
 'Archives and records',
 'Art film',
 'Auto racing',
 'Avant-garde',
 'B-Western',
 'B-movie',
 'Backstage Musical',
 'Baseball',
 'Beach Film',
 'Beach Party film',
 'Bengali Cinema',
 'Biker Film',
 'Biographical film',
 'Biography',
 'Biopic [feature]',
 'Black comedy',
 'Black-and-white',
 'Blaxploitation',
 'Bloopers & Candid Camera',
 'Bollywood',
 'Boxing',
 'Breakdance',
 'British Empire Film',
 'British New Wave',
 'Bruceploitation',
 'Buddy cop',
 'Buddy film',
 'Business',
 'C-Movie',
 'Camp',
 'Caper sto

In [330]:
Counter(flatlist).most_common()

[('Drama', 13232),
 ('Comedy', 6311),
 ('Romance Film', 3988),
 ('Black-and-white', 3432),
 ('Thriller', 3384),
 ('Action', 3376),
 ('Short Film', 3096),
 ('Crime Fiction', 2737),
 ('World cinema', 2708),
 ('Indie', 2582),
 ('Documentary', 2130),
 ('Horror', 2060),
 ('Silent film', 1930),
 ('Adventure', 1906),
 ('Action/Adventure', 1803),
 ('Family Film', 1728),
 ('Comedy film', 1703),
 ('Musical', 1644),
 ('Romantic drama', 1317),
 ('Animation', 1312),
 ('Mystery', 1246),
 ('Science Fiction', 1212),
 ('Fantasy', 1040),
 ('Romantic comedy', 1030),
 ('War film', 1020),
 ('Western', 918),
 ('Japanese Movies', 864),
 ('Crime Thriller', 804),
 ('Period piece', 711),
 ('Comedy-drama', 684),
 ('Chinese Movies', 591),
 ('Film adaptation', 583),
 ('Biography', 559),
 ('Psychological thriller', 523),
 ('Bollywood', 515),
 ('Sports', 490),
 ('Biographical film', 467),
 ('LGBT', 450),
 ('Black comedy', 405),
 ('Family Drama', 405),
 ('Music', 403),
 ('Television movie', 400),
 ('Parody', 395),
 (

In [331]:
len(genre_set)

350

In [332]:
raw_movie_genres = df[["Wikipedia movie ID", "Movie genres"]]
raw_movie_genres

Unnamed: 0,Wikipedia movie ID,Movie genres
0,975900,"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,"[Crime Fiction, Drama]"
3,9363483,"[Thriller, Erotic thriller, Psychological thri..."
4,261236,[Drama]
...,...,...
31554,11178030,"[Thriller, Science Fiction, Action]"
31555,29896629,[Children's]
31556,17672486,"[Romance Film, Drama]"
31557,14132268,"[Children's/Family, Costume drama, Family Dram..."


In [333]:
expand_raw_movie_genres = raw_movie_genres.explode("Movie genres")
expand_raw_movie_genres = expand_raw_movie_genres.dropna()
expand_raw_movie_genres

Unnamed: 0,Wikipedia movie ID,Movie genres
0,975900,Thriller
0,975900,Science Fiction
0,975900,Horror
0,975900,Adventure
0,975900,Supernatural
...,...,...
31557,14132268,Family Film
31558,5090583,Musical
31558,5090583,Drama
31558,5090583,Bollywood


In [138]:
def categorize_genres(x):
    # category = {"Comedy": ["Comedy"], "Romance": ["Romance", "Romantic"], "Action": ["Action"], 
    #             "Thriller": ["Thriller"], "Horror": ["Horror"], "Crime": ["Crime"], 
    #             "Science";["Science", "Sci-Fi"], "Fantasy": ["Fantasy"]}
    genres = set()
    if "Comedy" in x:
        genres.add("Comedy")
    if "Romance" in x or "Romantic" in x:
        genres.add("Romance")
    if "Action" in x:
        genres.add("Action")
    if "Thriller" in x:
        genres.add("Thriller")
    if "Horror" in x:
        genres.add("Horror")
    if "Crime" in x:
        genres.add("Crime")
    if "Science" in x or "Sci-Fi" in x:
        genres.add("Science")
    if "Fantasy" in x:
        genres.add("Fantasy")

    return list(genres)
    
   
    

expand_raw_movie_genres["Movie Category"] = expand_raw_movie_genres["Movie genres"].map(lambda x: categorize_genres(x))
        

In [None]:
expand_raw_movie_genres

In [139]:
expand_movie_category = expand_raw_movie_genres.explode("Movie Category").dropna()

In [140]:
expand_movie_category = expand_movie_category.reset_index()
expand_movie_category

Unnamed: 0,index,Wikipedia movie ID,Movie genres,Movie Category
0,0,975900,Thriller,Thriller
1,0,975900,Science Fiction,Science
2,0,975900,Horror,Horror
3,0,975900,Action,Action
4,1,3196793,Crime Drama,Crime
...,...,...,...,...
34519,31554,11178030,Thriller,Thriller
34520,31554,11178030,Science Fiction,Science
34521,31554,11178030,Action,Action
34522,31556,17672486,Romance Film,Romance


In [141]:
expand_movie_category.dtypes

index                  int64
Wikipedia movie ID     int64
Movie genres          object
Movie Category        object
dtype: object

In [142]:
summary_df = summary_df.astype({"Movie ID": int})

In [143]:
movie_category_summary = pd.merge(expand_movie_category, summary_df, left_on='Wikipedia movie ID', right_on='Movie ID')

movie_category_summary


Unnamed: 0,index,Wikipedia movie ID,Movie genres,Movie Category,Movie ID,Summary
0,0,975900,Thriller,Thriller,975900,"Set in the second half of the 22nd century, th..."
1,0,975900,Science Fiction,Science,975900,"Set in the second half of the 22nd century, th..."
2,0,975900,Horror,Horror,975900,"Set in the second half of the 22nd century, th..."
3,0,975900,Action,Action,975900,"Set in the second half of the 22nd century, th..."
4,3,9363483,Thriller,Thriller,9363483,A series of murders of rich young women throug...
...,...,...,...,...,...,...
19170,31544,24409284,Thriller,Thriller,24409284,"Suresh Gopi plays the role of Vishwam, a man w..."
19171,31544,24409284,Crime Fiction,Crime,24409284,"Suresh Gopi plays the role of Vishwam, a man w..."
19172,31545,8623704,Comedy,Comedy,8623704,"Desperate to pay their rent, the Stooges get s..."
19173,31551,19270580,Romance Film,Romance,19270580,The focus is on hero Raj and not so much on t...


In [144]:
movie_category_summary.groupby(['Movie Category'])['Movie Category'].count()

Movie Category
Action      3176
Comedy      4673
Crime       1984
Fantasy      843
Horror      1376
Romance     3633
Science      821
Thriller    2669
Name: Movie Category, dtype: int64

In [146]:
movie_category_summary = movie_category_summary[["Movie ID", "Movie Category", "Summary"]]
# movie_category_summary.to_csv("movie_data.csv", index=False)

In [148]:
movie_category_summary = movie_category_summary.drop_duplicates()

In [149]:
movie_category_summary

Unnamed: 0,Movie ID,Movie Category,Summary
0,975900,Thriller,"Set in the second half of the 22nd century, th..."
1,975900,Science,"Set in the second half of the 22nd century, th..."
2,975900,Horror,"Set in the second half of the 22nd century, th..."
3,975900,Action,"Set in the second half of the 22nd century, th..."
4,9363483,Thriller,A series of murders of rich young women throug...
...,...,...,...
19170,24409284,Thriller,"Suresh Gopi plays the role of Vishwam, a man w..."
19171,24409284,Crime,"Suresh Gopi plays the role of Vishwam, a man w..."
19172,8623704,Comedy,"Desperate to pay their rent, the Stooges get s..."
19173,19270580,Romance,The focus is on hero Raj and not so much on t...


In [150]:
movie_category_list_summary = movie_category_summary.groupby('Movie ID')['Movie Category'].apply(list)

In [151]:
movie_category_list_summary

Movie ID
3217        [Horror, Action, Fantasy, Comedy]
3947                        [Thriller, Crime]
4227                                [Romance]
4231                 [Action, Horror, Comedy]
4727                [Action, Fantasy, Comedy]
                          ...                
36457881                            [Fantasy]
36548689                            [Romance]
36567002                              [Crime]
37257312                             [Comedy]
37373877                             [Horror]
Name: Movie Category, Length: 8867, dtype: object

In [55]:
unique_ids = set(movie_category_summary["Movie ID"].values.tolist())

In [56]:
unique_id_summary_str = {}
for id in unique_ids:
    summary = movie_category_summary.loc[movie_category_summary['Movie ID'] == id, 'Summary'].iloc[0]
    
    summary = summary.replace('"', r'\"')
    unique_id_summary_str[id] = summary

In [97]:
len(unique_id_summary_str)

8867

In [98]:
unique_id_summary_str_df = pd.DataFrame(unique_id_summary_str.items(), columns=['Movie ID', 'Summary'])

In [100]:
unique_id_summary_str_df.to_csv("unique_movie_data.csv", index=False)

## sentiment analysis

In [334]:
pip install --upgrade pip

Collecting pip
  Downloading pip-23.1.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1
    Uninstalling pip-23.1:
      Successfully uninstalled pip-23.1
Successfully installed pip-23.1.1
Note: you may need to restart the kernel to use updated packages.


##### training

In [None]:


unique_ids_list = list(unique_ids)
unique_ids_sentiment = []

# load the model
tagger = Classifier.load('sentiment')

for id in unique_ids_list:
    
    sentence = Sentence(unique_id_summary_str[id])

    # predict NER tags
    tagger.predict(sentence)

    # print sentence with predicted tags
    unique_ids_sentiment.append(sentence.get_labels()[0].value)

In [None]:
sentiment_data = {"Movie ID": unique_ids_list, "Sentiment": unique_ids_sentiment}
sentiment_df = pd.DataFrame(sentiment_data)

In [None]:
sentiment_df

In [None]:
movie_category_sentiment_summary = pd.merge(movie_category_summary, sentiment_df, on="Movie ID")

In [None]:

movie_category_sentiment_summary.to_csv("movie_data.csv", index=False)

In [157]:
df = pd.read_csv("movie_data.csv")

In [159]:
df = df.drop_duplicates()

In [161]:
df.to_csv("movie_data.csv", index=False)

## zero shot classification

In [101]:
unique_id_summary_str_df

Unnamed: 0,Movie ID,Summary
0,32604160,Mahesh is a thug in Thanjavur. His only ambit...
1,24608768,"At a midwestern college, student Mary Crawford..."
2,23199746,{{Plot}} The movie starts off in 1979 as a man...
3,18317315,Nihal Singh is a teen from Chandigarh who lov...
4,983044,"Once every generation, there is an inter-dimen..."
...,...,...
8862,25493478,"Visu , is a computer science student, who love..."
8863,7471088,The story is based loosely on real events and ...
8864,9273330,A woman's infidelity leads her into a web of l...
8865,4292601,{{Plot}} At a plush cocktail party at Reisner ...


In [152]:
movie_category_list_summary

Movie ID
3217        [Horror, Action, Fantasy, Comedy]
3947                        [Thriller, Crime]
4227                                [Romance]
4231                 [Action, Horror, Comedy]
4727                [Action, Fantasy, Comedy]
                          ...                
36457881                            [Fantasy]
36548689                            [Romance]
36567002                              [Crime]
37257312                             [Comedy]
37373877                             [Horror]
Name: Movie Category, Length: 8867, dtype: object

In [153]:
unique_summary_genre_list_df = unique_id_summary_str_df.merge(movie_category_list_summary, on='Movie ID')

In [300]:
sample = unique_summary_genre_list_df.sample(10, random_state = 2)

In [301]:
summaries = list(sample['Summary'].values)
genre_list = list(sample['Movie Category'].values)

In [310]:
sample

Unnamed: 0,Movie ID,Summary,Movie Category
8852,17923997,"In 1970s New York City, David Marks , the son ...","[Thriller, Crime, Romance]"
5008,673694,"{{plot}} John Tunstall , an educated Englishma...","[Thriller, Crime, Action, Comedy]"
8214,12515074,"Shaan is a charming, rustic, god fearing local...","[Crime, Action]"
8790,26967746,In a world where 12 year old Michael Gerber ha...,[Comedy]
2295,3842309,After claiming he is an extraterrestrial from ...,[Science]
2440,31204098,"On the eve of his society wedding, Dudley Leak...",[Comedy]
4668,14205601,Introduction: Like all Sam Sheepdog and Ralph ...,[Comedy]
900,5541106,"Awaking alone in the middle of a dark forest, ...","[Thriller, Horror, Crime]"
774,6097565,"A woman named Karen , who is suffering from a ...",[Horror]
1195,3510662,Set in the rural West Country in Victorian Eng...,[Romance]


## valhalla/distilbart-mnli-12-6

In [302]:
from transformers import pipeline
from sklearn.preprocessing import normalize

classifier = pipeline(task='zero-shot-classification', model='valhalla/distilbart-mnli-12-6')


In [307]:
def compute_average_score(original_genres, labels, scores):
    scores = normalize([scores], norm='max')[0]
    sum = 0
    for g in original_genres:
        index = labels.index(g)
        score = scores[index]
        sum += score
    return sum / len(original_genres)

In [308]:
start = timeit.default_timer()

avg_scores_list = []

for i in tqdm(range(len(summaries))):
    summary = summaries[i]
    genres = genre_list[i]
    sequences = summary
    candidate_labels = ["Comedy", "Romance", "Action", "Thriller", "Horror", "Crime", "Science", "Fantasy"]
    hypothesis_template = "This is a summary of movie plot. The genre of this movie is {}."
    prediction = classifier(sequences = sequences, candidate_labels = candidate_labels, hypothesis_template = hypothesis_template, multi_label=True)
    labels = prediction['labels']
    scores = prediction['scores']
    avg_score = compute_average_score(genres, labels, scores)
    avg_scores_list.append(avg_score)
    
stop = timeit.default_timer()
print('Time: ', stop - start)  

100%|███████████████████████████████████████████| 10/10 [02:32<00:00, 15.23s/it]

Time:  152.28476549999323





In [316]:
from statistics import mean

avg_score = mean(avg_scores_list)
avg_score

0.9581879754092969

In [317]:
avg_scores_list

[0.9658642092684833,
 0.9597331493892444,
 0.9732998693672754,
 0.7211156318997959,
 1.0,
 0.9775997177871604,
 0.996761697128782,
 0.9875054792522274,
 1.0,
 1.0]

In [318]:
len([s for s in avg_scores_list if s >= avg_score])

9

In [311]:
print(list(sample.loc[sample['Movie ID'] == 26967746, 'Summary']))

['In a world where 12 year old Michael Gerber has no friends, he finds that sometimes, the best friendships come from within. Michael is an awkward young boy who is about to start junior high and has no real friends. His only companion is an imaginary friend he calls Fuzzbucket. In the first act of the film, both the viewer and every character except Michael is left to make the assumption that Fuzzbucket is an imaginary friend. When Michael cooks up a strange, green concoction for his friend to drink, Fuzzbucket turns visibile again. Astonished to see Fuzzbucket for the first time, young Michael becomes overjoyed. His newly visible friend exuberantly leaps around Michael\'s treehouse in celebration. The two friends share a heartfelt moment in his treehouse when Fuzzbucket traces their hands with crayons, a ceremony that makes them blood brothers. At Fuzzbucket\'s urgent request for \\"Toons! Toons!\\" Michael switches on his television and the two sit together watching classic cartoons

##### test

In [339]:
sequences = "This story is about a police officer, who has been undercover for two years, who is convinced by a doctor, Dr. Mahmud, to be right, and is persuaded to give them information. He meets a prostitute, Mahi, and the two become lovers. He gets a contract from Mahmud to get information on the two, but he gets killed in a car accident. The police detective, who helped him get killed, is Mohamad Khan, a madman from whom he tells the secret story of his lover. The film is a foreshadowing of the story of Salman Rushdie, who is in jail. The story is told in flashback, and it unfolds in a way. The story of Salman Rushdie starts in a small town in Pakistan. He is a young man who lives under the name of Mohammed. Mohammed, a beautiful young woman, is the widow of one of Mahmud and Mahi Mahmud's nephew. They have a son, Mohamad, who is married, and they have two children who are of varying ages and gender. Mohammed's friends and family are very strict and suspicious of him. He is very quiet, as well as peaceful. He is very understanding towards people, but still he is reluctant to talk about his past. There is a lot of emotion in his life. He wants to go away, but unfortunately for him, his father dies in a car accident. This enrages Mohammed, and she lies to him about her past, about what happened to Mohammed. When she has a baby, Mohammed asks her to tell him about her past. He is puzzled at first, but then, at the same time, he begins to realize that his life was different from the lives of other people. He feels sorry for her, and hopes that she will forgive him. He gives her a diary of her past, and asks her to tell him about her father. She does. After that, it is the family that is transformed. They have a home, and they have a son. His life is changed. They are for the first time. The family is different than before, as Mohammed is the son of Mahmud. Mohammed wants to get out of prison, but he is afraid of being in prison. Mohamad decides to get out. He tells Mohammed that he will go to the police, whom he once he gets out. The police detective tells Mohammed that he must get out of his lover, and Mohammed goes to get in a few days before beginning at the end."
candidate_labels = ["Comedy", "Romance", "Action", "Thriller", "Horror", "Crime", "Science", "Fantasy"]
hypothesis_template = "This is a summary of movie plot. The genre of this is {}."

In [340]:
prediction = classifier(sequences = sequences, candidate_labels = candidate_labels, hypothesis_template = hypothesis_template, multi_label=True)


In [341]:
prediction

{'sequence': "This story is about a police officer, who has been undercover for two years, who is convinced by a doctor, Dr. Mahmud, to be right, and is persuaded to give them information. He meets a prostitute, Mahi, and the two become lovers. He gets a contract from Mahmud to get information on the two, but he gets killed in a car accident. The police detective, who helped him get killed, is Mohamad Khan, a madman from whom he tells the secret story of his lover. The film is a foreshadowing of the story of Salman Rushdie, who is in jail. The story is told in flashback, and it unfolds in a way. The story of Salman Rushdie starts in a small town in Pakistan. He is a young man who lives under the name of Mohammed. Mohammed, a beautiful young woman, is the widow of one of Mahmud and Mahi Mahmud's nephew. They have a son, Mohamad, who is married, and they have two children who are of varying ages and gender. Mohammed's friends and family are very strict and suspicious of him. He is very q

## facebook/bart-large-mnli

In [290]:
!pip3 install tqdm
from tqdm import tqdm
import timeit





##### 200text 40min

In [289]:
from transformers import pipeline
classifier = pipeline(task='zero-shot-classification', model='facebook/bart-large-mnli')

In [256]:
def compute_average_score(original_genres, labels, scores):
    sum = 0
    for g in original_genres:
        index = labels.index(g)
        score = scores[index]
        sum += score
    return sum / len(original_genres)

In [293]:
start = timeit.default_timer()

avg_scores_list = []

for i in tqdm(range(len(summaries))):
    summary = summaries[i]
    genres = genre_list[i]
    sequences = summary
    candidate_labels = ["Comedy", "Romance", "Action", "Thriller", "Horror", "Crime", "Science", "Fantasy"]
    hypothesis_template = "This is a summary of movie plot. The genre of this movie is {}."
    prediction = classifier(sequences = sequences, candidate_labels = candidate_labels, hypothesis_template = hypothesis_template, multi_label=True)
    labels = prediction['labels']
    scores = prediction['scores']
    avg_score = compute_average_score(genres, labels, scores)
    avg_scores_list.append(avg_score)
    
stop = timeit.default_timer()
print('Time: ', stop - start)  


100%|███████████████████████████████████████████| 20/20 [08:22<00:00, 25.12s/it]

Time:  502.4917369999748





In [None]:
from statistics import mean

avg_score = mean(avg_scores_list)
avg_score

In [294]:
avg_scores_list

[0.6664131879806519,
 0.9398733675479889,
 0.6285017430782318,
 0.10646507889032364,
 0.6937809586524963,
 0.7451310157775879,
 0.7456769347190857,
 0.9214355746905009,
 0.9246132373809814,
 0.8937371373176575,
 0.19186416019996008,
 0.9033108353614807,
 0.6912443041801453,
 0.596308171749115,
 0.6105490326881409,
 0.04410838335752487,
 0.7132342755794525,
 0.6979719400405884,
 0.7617500424385071,
 0.32131947576999664]

In [205]:
len([s for s in avg_scores_list if s >= avg_score])

461

In [276]:
sample

Unnamed: 0,Movie ID,Summary,Movie Category
8852,17923997,"In 1970s New York City, David Marks , the son ...","[Thriller, Crime, Romance]"
5008,673694,"{{plot}} John Tunstall , an educated Englishma...","[Thriller, Crime, Action, Comedy]"
8214,12515074,"Shaan is a charming, rustic, god fearing local...","[Crime, Action]"
8790,26967746,In a world where 12 year old Michael Gerber ha...,[Comedy]
2295,3842309,After claiming he is an extraterrestrial from ...,[Science]
...,...,...,...
5748,6115870,"Milton Parker , an elderly and eccentric game ...",[Comedy]
3766,21869908,"The film follows Cross Carlton, a Los Angeles ...",[Comedy]
8118,25687480,Ross McEwen pulls an unusual bank job in the N...,[Action]
5005,30984089,When two newly delivered helicopters crash mys...,"[Thriller, Action]"


In [277]:
print(list(sample.loc[sample['Movie ID'] == 17923997, 'Summary']))

["In 1970s New York City, David Marks , the son of a powerful real estate tycoon, marries a beautiful working-class student, Katie McCarthy . Together they flee the city for country life in Vermont &mdash; only to be lured back by David's father. Upon their return, Katie goes back to college and eventually applies to medical school while trying to understand David's mood swings and unwillingness to have children. As she becomes increasingly independent, David mysteriously turns more violent and controlling. Family secrets are slowly revealed and then Katie disappears without a trace. Years later, when David's best friend turns up dead, the 20-year-old case is re-opened, with David as the main suspect, and the Marks family’s dark secrets pave the way to a disturbing truth."]


In [282]:
sequences =  "In 1970s New York City, David Marks , the son of a powerful real estate tycoon, marries a beautiful working-class student, Katie McCarthy . Together they flee the city for country life in Vermont &mdash; only to be lured back by David's father. Upon their return, Katie goes back to college and eventually applies to medical school while trying to understand David's mood swings and unwillingness to have children. As she becomes increasingly independent, David mysteriously turns more violent and controlling. Family secrets are slowly revealed and then Katie disappears without a trace. Years later, when David's best friend turns up dead, the 20-year-old case is re-opened, with David as the main suspect, and the Marks family’s dark secrets pave the way to a disturbing truth."


hypothesis_template = "This is a summary of movie plot. The genre of this movie is {}."

In [285]:
prediction = classifier(sequences = sequences, candidate_labels = candidate_labels, hypothesis_template = hypothesis_template, multi_label = True)


In [286]:
prediction

{'sequence': "In 1970s New York City, David Marks , the son of a powerful real estate tycoon, marries a beautiful working-class student, Katie McCarthy . Together they flee the city for country life in Vermont &mdash; only to be lured back by David's father. Upon their return, Katie goes back to college and eventually applies to medical school while trying to understand David's mood swings and unwillingness to have children. As she becomes increasingly independent, David mysteriously turns more violent and controlling. Family secrets are slowly revealed and then Katie disappears without a trace. Years later, when David's best friend turns up dead, the 20-year-old case is re-opened, with David as the main suspect, and the Marks family’s dark secrets pave the way to a disturbing truth.",
 'labels': ['Horror',
  'Crime',
  'Thriller',
  'Romance',
  'Science',
  'Fantasy',
  'Action',
  'Comedy'],
 'scores': [0.9488498568534851,
  0.9461238384246826,
  0.9104977250099182,
  0.892758786678

In [58]:
unique_ids_list = list(unique_ids)

unique_ids_summary_df = summary_df[summary_df['Movie ID'].isin(unique_ids_list)]

In [59]:
unique_ids_summary_df["Summary"].map(lambda x: len(x.split()))

1         496
5         763
10         90
12       1295
23        155
         ... 
31548     263
31549     547
31551     979
31552     874
31558     556
Name: Summary, Length: 8867, dtype: int64

In [60]:
!pip install transformers datasets evaluate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [61]:
import numpy as np
from sklearn.model_selection import train_test_split

In [62]:
movie_category_summary

Unnamed: 0,Movie ID,Movie Category,Summary
0,975900,Thriller,"Set in the second half of the 22nd century, th..."
1,975900,Science,"Set in the second half of the 22nd century, th..."
2,975900,Horror,"Set in the second half of the 22nd century, th..."
3,975900,Action,"Set in the second half of the 22nd century, th..."
4,9363483,Thriller,A series of murders of rich young women throug...
...,...,...,...
19170,24409284,Thriller,"Suresh Gopi plays the role of Vishwam, a man w..."
19171,24409284,Crime,"Suresh Gopi plays the role of Vishwam, a man w..."
19172,8623704,Comedy,"Desperate to pay their rent, the Stooges get s..."
19173,19270580,Romance,The focus is on hero Raj and not so much on t...


In [63]:
summary = list(movie_category_summary['Summary'].values)
label = list(movie_category_summary['Movie Category'].values)
label_idx = {"Comedy": 0, "Romance": 1, "Action": 2, "Thriller": 3, "Horror": 4, "Crime": 5, "Science": 6, "Fantasy": 7}
label = [label_idx[l] for l in label]

In [64]:
X_temp, X_test, y_temp, y_test = train_test_split(summary, label, test_size=0.2, random_state=42, stratify=label)

In [65]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)

In [35]:
from transformers import AutoTokenizer, XLNetForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")


  from .autonotebook import tqdm as notebook_tqdm


In [66]:
from transformers import DistilBertTokenizerFast, DistilBertConfig
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path='distilbert-base-uncased')
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [68]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [69]:
import torch

class MovieDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MovieDataset(train_encodings, y_train)
val_dataset = MovieDataset(val_encodings, y_val)
test_dataset = MovieDataset(test_encodings, y_test)

In [75]:
train_dataset[1]

{'input_ids': tensor([  101, 19817,  9626,  3077,  1005,  1055,  4517, 13308,  2038,  2042,
          7183,  1998,  1996, 16371, 27052,  8067,  3840,  2008,  6787,  2009,
          2038,  5100,  1037,  2047,  2267,  1010,  1996, 19817,  9626,  3077,
          2820,  1997,  2974,  1010,  2503,  1996,  2640,  1010,  2004,  2019,
          3947,  2000,  2012,  5643,  2005,  1996,  2824,  1997,  1996,  2034,
          2143,  1012,  2284,  2503,  1996,  4517,  3269,  1010,  2003,  2073,
          2934, 12621,  2040,  2038,  3819,  2098,  1037,  2679,  1997,  1005,
          4942,  1011, 28051,  2015,  1005,  1025,  2542,  9552,  2302,  6699,
          1010,  2040,  2031,  2042,  2580,  1998, 16984,  2000,  4685,  2273,
          4818,  8518,  1012,  2043,  2082,  6398,  5074,  3044,  6010,  1037,
          3376,  4942, 28600,  6761,  3593,  2315,  3848,  1010,  2027,  2991,
          1999,  2293,  1012,  2174,  1010,  1996,  7329,  2031,  1037, 11765,
          2000,  2175,  2046, 17630, 14

In [71]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", problem_type="multi_label_classification")

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [76]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device("mps")

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

KeyboardInterrupt: 