### Imports

In [1]:
import ast

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

### Data loading

In [2]:
# Read the tab-separated plot summaries (parsed without a header row)
PLOT_SUMMARIES_PATH = 'data/moviesummaries/plot_summaries.txt'
plot_summaries = pd.read_csv(PLOT_SUMMARIES_PATH, header=None, sep='\t')

# Label the columns, then preview the first rows
plot_summaries.columns = ["Wikipedia movie ID", "Summary"]
plot_summaries.head()

Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


### LDA 

In [13]:
# Topic modelling of the plot summaries with LDA.
# NOTE: print_infos is defined in a later cell of this notebook (In [12]);
# that cell has to be executed before this one.

# Bag-of-words counts for every summary, ignoring English stop words
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(plot_summaries['Summary'])

# Fit one model per topic count (5, 10, ..., 45) and report each of them
for n_topics in range(5, 50, 5):
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(count_data)
    print_infos(lda, count_vectorizer)

Number of topics:5

Topics found via LDA:
Topic 0:
film new team world time group war ship dr killed
Topic 1:
love father family life son film daughter village police story
Topic 2:
tom jerry away man house tries time head tells goes
Topic 3:
joe father tells man king men town new wife home
Topic 4:
mother father tells home man life house family new finds
Top documents for each topic:
Topic 0:
 'In April 1, 2011, Kamen Rider OOO finds himself battling three Mole Imagin, unaware they are Imagin until Ankh arrives to confirm that the monsters are not Yummies. The three Imagin see a boy and jump into him to escape into time before the DenLiner suddenly appears. Kotaro Nogami and Teddy emerge from the train to pinpoint the Mole Imagin\'s destination to be November 11, 1971, confused on how the boy\'s memories exceed that far in time. When Ankh demands answers, Teddy explains who he and Eiji Hino are to Kotaro, who then introduces himself while explaining their mission to deal with the rogu

In [12]:
def print_infos(lda, count_vectorizer, doc_term_matrix=None, summaries=None,
                n_top_words=10, n_top_docs=3):
    """Print the top words and the most representative summaries of each topic.

    Parameters
    ----------
    lda
        Fitted topic model exposing ``n_components``, ``components_`` and
        ``transform`` (in this notebook a ``LatentDirichletAllocation``).
    count_vectorizer
        Fitted vectorizer exposing ``get_feature_names_out``.
    doc_term_matrix : optional
        Document-term matrix handed to ``lda.transform``. When omitted, the
        notebook-level ``count_data`` is used.
    summaries : pandas.Series, optional
        Texts that are positionally aligned with the rows of
        ``doc_term_matrix``. When omitted, ``plot_summaries['Summary']`` is used.
    n_top_words : int, default 10
        Number of highest-weighted words printed per topic.
    n_top_docs : int, default 3
        Number of highest-scoring documents printed per topic.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if doc_term_matrix is None:
        doc_term_matrix = count_data
    if summaries is None:
        summaries = plot_summaries['Summary']

    # Fetch the vocabulary once instead of once per printed word.
    feature_names = count_vectorizer.get_feature_names_out()

    # Print the topics found by the LDA model
    print("Number of topics:" + str(lda.n_components) + "\n")
    print("Topics found via LDA:")
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so walk it backwards to get the heaviest words first
        top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic %d:" % (topic_idx))
        print(" ".join(feature_names[word_id] for word_id in top_word_ids))

    # Per-document topic scores: one row per document, one column per topic
    topic_values = lda.transform(doc_term_matrix)

    # Print the top documents for each topic
    print("Top documents for each topic:")
    for topic_idx in range(len(lda.components_)):
        top_doc_ids = topic_values[:, topic_idx].argsort()[:-n_top_docs - 1:-1]
        print("Topic %d:" % (topic_idx))
        # Positional lookup; an object array prints as a plain list of quoted texts
        print(summaries.iloc[top_doc_ids].to_numpy(dtype=object))

    print("\n")
    print("---------------------------------------------------------")
    print("\n")

In [15]:
# Column labels, named to match the dataset documentation
CHARACTER_COLUMNS = [
    "Wikipedia movie ID", "Freebase movie ID", "Movie release date", "Character name",
    "Actor date of birth", "Actor gender", "Actor height", "Actor ethnicity", "Actor name",
    "Actor age", "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID",
]
MOVIE_COLUMNS = [
    "Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date",
    "Movie revenue", "Movie runtime", "Movie languages", "Movie countries", "Movie genres",
]

# Both TSV files are parsed without a header row and labelled afterwards
character_metadata = pd.read_csv('data/moviesummaries/character.metadata.tsv', header=None, sep='\t')
character_metadata.columns = CHARACTER_COLUMNS

movie_metadata = pd.read_csv('data/moviesummaries/movie.metadata.tsv', header=None, sep='\t')
movie_metadata.columns = MOVIE_COLUMNS

In [16]:
# Inner join: keep only the movies present in both the metadata and the summaries
movie_plot = pd.merge(
    left=movie_metadata,
    right=plot_summaries,
    how='inner',
    on='Wikipedia movie ID',
)

In [17]:
# Preview the first rows of the merged metadata/summary table
movie_plot.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."


In [30]:
# Build the alphabetically sorted list of every distinct genre name in movie_plot.
# Each 'Movie genres' entry is a dict literal stored as text. It is parsed with
# ast.literal_eval, which accepts only Python literals, rather than eval, which
# would execute any code that happened to be in the data file.
genres = sorted({
    genre_name
    for raw_genres in movie_plot['Movie genres'].unique()
    for genre_name in ast.literal_eval(raw_genres).values()
})

In [33]:
#for each value in movie_plot, only keep the values of the dict as a list
movie_plot['Movie genres'] = [list(eval(genre).values()) for genre in movie_plot['Movie genres']]
movie_plot['Movie languages'] = [list(eval(genre).values()) for genre in movie_plot['Movie languages']]
movie_plot['Movie countries'] = [list(eval(genre).values()) for genre in movie_plot['Movie countries']]
movie_plot.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}",[Drama],"Eva, an upper class housewife, becomes frustra..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns..."
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a..."


In [31]:
# Display the sorted list of distinct genre names
genres

['Absurdism',
 'Acid western',
 'Action',
 'Action Comedy',
 'Action Thrillers',
 'Action/Adventure',
 'Addiction Drama',
 'Adult',
 'Adventure',
 'Adventure Comedy',
 'Airplanes and airports',
 'Albino bias',
 'Alien Film',
 'Alien invasion',
 'Americana',
 'Animal Picture',
 'Animals',
 'Animated Musical',
 'Animated cartoon',
 'Animation',
 'Anime',
 'Anthology',
 'Anthropology',
 'Anti-war',
 'Anti-war film',
 'Apocalyptic and post-apocalyptic fiction',
 'Archaeology',
 'Archives and records',
 'Art film',
 'Auto racing',
 'Avant-garde',
 'B-Western',
 'B-movie',
 'Backstage Musical',
 'Baseball',
 'Beach Film',
 'Beach Party film',
 'Bengali Cinema',
 'Biker Film',
 'Biographical film',
 'Biography',
 'Biopic [feature]',
 'Black comedy',
 'Black-and-white',
 'Blaxploitation',
 'Bloopers & Candid Camera',
 'Bollywood',
 'Boxing',
 'Breakdance',
 'British Empire Film',
 'British New Wave',
 'Bruceploitation',
 'Buddy Picture',
 'Buddy cop',
 'Buddy film',
 'Business',
 'C-Movie',
 '

In [32]:
# Number of distinct genre names
len(genres)

363

In [None]:
# Genre names selected for the follow-up topic checks
keep = [
    'Alien Film',
    'Alien invasion',
    'Anti-war',
    'Anti-war film',
    'Apocalyptic and post-apocalyptic fiction',
    'Cold War',
    'Combat Films',
    'Computers',
    'Conspiracy fiction',
    'Costume Horror',
    'Dystopia',
    'Environmental Science',
    'Future noir',
    'Gulf War',
    'Natural disaster',
    'Natural horror films',
    'Nuclear warfare',
    'Plague',
    'Sci-Fi Horror',
    'War film',
]

- TODO: check whether the movie genres show up in the LDA topics
- TODO: check the topics of the movies whose genre is in `keep`