# Movies influence

In [1]:
import csv
import networkx as nx
import pandas as pd
from itables import init_notebook_mode
import itables.options as opt
from itables import show
import re
import math

# Activate itables for this notebook. all_interactive=True is requested so that
# displayed DataFrames are rendered as interactive tables
# (NOTE(review): behaviour as described in the itables docs -- confirm for the installed version).
init_notebook_mode(all_interactive=True)


<IPython.core.display.Javascript object>

In [2]:
# Read the reference pairs: the first two columns of every CSV row become one
# (row[0], row[1]) tuple. The list is later fed to the graph as its edge list.
print('Loading references data...')
references = []
with open("data/references.csv") as refs_handle:
    references.extend(
        (record[0], record[1])
        for record in csv.reader(refs_handle, delimiter=',')
    )

# Build the per-movie record table, keyed by the id found in column 0.
# Columns read from the CSV: 0 = id, 1 = inputRefs, 2 = outputRefs, 3 = title,
# 4 = startYear ('\N' means unknown and is stored as 0), 6 = genres.
# Column 5 (endYear) is intentionally not loaded.
# All remaining fields start at neutral defaults and are filled in by later steps.
# The key order below is kept as-is because it determines the column order of
# DataFrames built from these records.
print('Loading movies data...')
moves_data = {}
with open("data/movies_with_references.csv") as movies_handle:
    for record in csv.reader(movies_handle, delimiter=','):
        movie_id = record[0]

        if record[4] != '\\N':
            start_year = int(record[4])
        else:
            start_year = 0

        moves_data[movie_id] = {
            'id': movie_id,
            'title': record[3],
            'startYear': start_year,
            'inputRefs': int(record[1]),
            'outputRefs': int(record[2]),
            'genres': record[6],
            'pRank': 0,
            'rating': 0.0,
            'country': "",
            'runtime': 0,
            'budget': 0,
            'gross': 0,
            'userReviews': 0,
            'criticReviews': 0,
            'pr*outputRefs': 0,
            'pr/runtime': 0,
            'pr/budget': 0,
            'pr*gross': 0,
        }

def _runtime_to_minutes(text):
    """Return the total number of minutes encoded in a runtime string.

    Searches ``text`` for ``<digits>h`` and ``<digits>min`` (presumably values
    shaped like "2h 35min" -- confirm against the CSV) and returns
    ``hours * 60 + minutes``. A part that is absent contributes 0.
    """
    total = 0
    # \d+ rather than a single \d: otherwise "35min" is read as 5 minutes and
    # "10h" as 0 hours, because only the digit adjacent to the unit is captured.
    hours = re.search(r'(\d+)h', text)
    if hours is not None:
        total += int(hours.group(1)) * 60
    minutes = re.search(r'(\d+)min', text)
    if minutes is not None:
        total += int(minutes.group(1))
    return total


def _review_count(text):
    """Convert a review-count string to an int.

    A value containing ``<number>K`` is multiplied by 1000, any other non-empty
    value is parsed with ``int``, and the empty string yields 0.
    """
    thousands = re.search(r'((\d|\.)+)K', text)
    if thousands is not None:
        # round() before int() so float error (a product landing just below
        # a whole number) cannot truncate the count by one.
        return int(round(float(thousands.group(1)) * 1000))
    return int(text) if text != "" else 0


def _dollar_amount(text):
    """Strip the leading '$' and every ',' and '.' from a money string.

    Returns 0 when ``text`` is not a '$'-prefixed value of length > 1.
    NOTE: a present amount is returned as a *string* of the remaining
    characters (unchanged from the previous behaviour); convert with int()
    at the point of use.
    """
    if len(text) > 1 and text[0] == '$':
        return text[1:].replace(",", "").replace(".", "")
    return 0


# Enrich the movie records with the scraped base data.
# Columns read: 0 = id, 1 = rating, 2 = comma-separated countries (first one kept),
# 3 = runtime, 4 = budget, 5 = gross, 6 = user reviews, 7 = critic reviews.
# newline="" is what the csv module documentation recommends for files handed to csv.reader.
with open("data/base_movies_data.csv", newline="") as m_file:
    csv_reader = csv.reader(m_file, delimiter=',')

    for row in csv_reader:
        movie = moves_data.get(row[0])
        if movie is None:
            # Unknown id: skip it, the same way the PageRank step skips ids
            # that are missing from moves_data, instead of raising KeyError.
            continue

        movie['rating'] = float(row[1]) if row[1] != "" else 0.0
        movie['country'] = row[2].split(',')[0]
        movie['runtime'] = _runtime_to_minutes(row[3])
        movie['budget'] = _dollar_amount(row[4])
        movie['gross'] = _dollar_amount(row[5])
        movie['userReviews'] = _review_count(row[6])
        movie['criticReviews'] = _review_count(row[7])


Loading references data...
Loading movies data...


In [3]:
# Build a directed graph from the reference pairs and score every node with
# PageRank (damping factor alpha=0.85).
print('Calculating Movie Rank...')
G = nx.DiGraph()
G.add_edges_from(references)
mr = nx.pagerank(G, alpha=0.85)

# Copy each score onto its movie record; graph nodes without a record are ignored.
for movId, score in mr.items():
    if movId in moves_data:
        moves_data[movId]["pRank"] = score
# The derived metrics below are disabled, so the 'pr*outputRefs', 'pr/runtime',
# 'pr/budget' and 'pr*gross' fields keep their initial value of 0.
#     moves_data[movId]['pr*outputRefs'] = (mr[movId]*moves_data[movId]['outputRefs']) if moves_data[movId]['outputRefs'] != 0 else 0
#     moves_data[movId]['pr/runtime'] = (mr[movId]/moves_data[movId]['runtime']) if moves_data[movId]['runtime'] != 0 and moves_data[movId]['runtime'] > 20 else 0
#     moves_data[movId]['pr/budget'] = (mr[movId]/int(moves_data[movId]['budget'])) if moves_data[movId]['budget'] != 0 and mr[movId] > 0.0001 else 0
#     moves_data[movId]['pr*gross'] = (mr[movId]*int(moves_data[movId]['gross'])) if moves_data[movId]['gross'] != 0 else 0



Calculating Movie Rank...


In [4]:

# Keep only movies with more than 10 incoming references.
filtered_movies_data = []
for movie in moves_data.values():
    if movie['inputRefs'] > 10:
        filtered_movies_data.append(movie)

# Order by PageRank (highest first) and prepend a 1-based position under the "#" key.
ranked_movies = sorted(filtered_movies_data, key=lambda item: item["pRank"], reverse=True)
sorted_filteres_movies_data = [
    {"#": position, **movie}
    for position, movie in enumerate(ranked_movies, start=1)
]

print('Total number of movies: %d' % len(moves_data))
print('Total number of filtered movies: %d' % len(sorted_filteres_movies_data))


# itables display options, then show the full table ordered on column 7 descending.
opt.lengthMenu = [20]
opt.maxBytes = 0
x = pd.DataFrame.from_dict(sorted_filteres_movies_data)
show(x, order=[7, "desc"])


Total number of movies: 217587
Total number of filtered movies: 7818


#,id,title,startYear,inputRefs,outputRefs,genres,pRank,rating,country,runtime,budget,gross,userReviews,criticReviews,pr*outputRefs,pr/runtime,pr/budget,pr*gross


In [119]:
from tabulate import tabulate

max_title_length = 34


def display_data(data, num):
    """Return compact display rows for the first ``num`` records of ``data``.

    Each row carries '#', 'title', 'year', 'inputRefs', 'pR', 'rating' and
    'country'. Titles longer than ``max_title_length`` characters are cut to
    that length and suffixed with "...".
    """
    rows = []
    for m in data[:num]:
        title = m['title']
        if len(title) > max_title_length:
            title = title[:max_title_length] + "..."
        rows.append({
            '#': m['#'],
            'title': title,
            'year': m['startYear'],
            'inputRefs': m['inputRefs'],
            'pR': m['pRank'],
            'rating': m['rating'],
            'country': m['country'],
        })
    return rows


# Top 100 overall, indexed by the global rank column.
top_100_rows = display_data(sorted_filteres_movies_data, 100)
sorted_filteres_movies_data_df = pd.DataFrame.from_dict(top_100_rows).set_index('#')
print("Movies sorted by PR")
print(tabulate(sorted_filteres_movies_data_df, headers='keys', tablefmt='psql'))

# Genre slices: 'genres' is a string, so membership is a substring test.
# These two tables keep the default DataFrame index next to the '#' column.
animation = [m for m in sorted_filteres_movies_data if 'Animation' in m['genres']]
print("Animated movies sorted by PR")
animation_table = pd.DataFrame.from_dict(display_data(animation, 20))
print(tabulate(animation_table, headers='keys', tablefmt='psql'))

adult = [m for m in sorted_filteres_movies_data if 'Adult' in m['genres']]
print("Adult movies sorted by PR")
adult_table = pd.DataFrame.from_dict(display_data(adult, 10))
print(tabulate(adult_table, headers='keys', tablefmt='psql'))


Movies sorted by PR
+-----+---------------------------------------+--------+-------------+-------------+----------+----------------+
|   # | title                                 |   year |   inputRefs |          pR |   rating | country        |
|-----+---------------------------------------+--------+-------------+-------------+----------+----------------|
|   1 | The Wizard of Oz                      |   1939 |        3405 | 0.00740026  |      8   | United States  |
|   2 | Citizen Kane                          |   1941 |        1163 | 0.00553099  |      8.3 | United States  |
|   3 | Star Wars: Episode IV - A New Hope    |   1977 |        5370 | 0.00455308  |      8.6 | United States  |
|   4 | King Kong                             |   1933 |        1039 | 0.00293679  |      7.9 | United States  |
|   5 | Metropolis                            |   1927 |         283 | 0.00281159  |      8.3 | Germany        |
|   6 | Star Trek: The Original Series        |   1966 |        2694 | 0.002

In [112]:
# Find the lowest-rated movie among the 100 movies with the highest PageRank.
# A rating of 0 is the "no rating" default, so such movies are ignored.
the_smallest_rating = 10
the_smallest_rating_elem = [{}]

rated_top_100 = [m for m in sorted_filteres_movies_data[:100] if m["rating"] > 0]

# The heading now names the quantity actually minimised (rating, not PR).
print("Movie with the smallest rating in the top 100 movies sorted by PR")
if rated_top_100:
    # min() returns the first minimal element, matching the previous
    # strict "<" scan, without relying on a hard-coded upper bound of 10.
    the_smallest_rating_elem[0] = min(rated_top_100, key=lambda m: m["rating"])
    the_smallest_rating = the_smallest_rating_elem[0]["rating"]
    print(tabulate(pd.DataFrame.from_dict(display_data(the_smallest_rating_elem, 1)).set_index('#'), headers='keys', tablefmt='psql'))
else:
    # Previously the empty placeholder dict reached display_data and raised KeyError('#').
    print("No rated movies found in the top 100")


Movie with the smallest PR in the top 100 movies sorted by PR
+-----+-----------------------+--------+-------------+-----------+----------+---------------+
|   # | title                 |   year |   inputRefs |        pR |   rating | country       |
|-----+-----------------------+--------+-------------+-----------+----------+---------------|
|  10 | The Birth of a Nation |   1915 |         102 | 0.0021417 |      6.3 | United States |
+-----+-----------------------+--------+-------------+-----------+----------+---------------+


In [115]:
# Group the ranked movies by decade (start year rounded down to a multiple of 10)
# and accumulate, per decade, the movie list, the PageRank sum and the movie count.
movies_data_by_decade = {}
sum_pr_by_decade = {}
num_movies_by_decade = {}

for entry in sorted_filteres_movies_data:
    year = entry["startYear"]
    if year == 0:
        # 0 marks an unknown start year.
        continue
    decade_start = (year // 10) * 10
    movies_data_by_decade.setdefault(decade_start, []).append(entry)
    sum_pr_by_decade[decade_start] = sum_pr_by_decade.get(decade_start, 0) + entry["pRank"]
    num_movies_by_decade[decade_start] = num_movies_by_decade.get(decade_start, 0) + 1

# Same grouping, with the decades in ascending order.
sorted_data_by_decade = {
    decade_start: movies_data_by_decade[decade_start]
    for decade_start in sorted(movies_data_by_decade)
}

pr_by_decades_df = []
for decade_start, pr_total in sum_pr_by_decade.items():
    pr_by_decades_df.append({
        "decade": decade_start,
        "numMovies": num_movies_by_decade[decade_start],
        "sumPr": pr_total,
    })

# Decades with the largest PageRank sum first.
pr_by_decades_df_sorted = list(pr_by_decades_df)
pr_by_decades_df_sorted.sort(key=lambda item: item["sumPr"], reverse=True)

print("Summirized PR by decade")
print(tabulate(pd.DataFrame.from_dict(pr_by_decades_df_sorted), headers='keys', tablefmt='psql'))

print("Top 10 movies by PR by decade")
for decade, data in sorted_data_by_decade.items():
    print(decade)
    sorted_decade_by_pr = sorted(data, key=lambda item: item["pRank"], reverse=True)
    decade_table = pd.DataFrame.from_dict(display_data(sorted_decade_by_pr, 10)).set_index('#')
    print(tabulate(decade_table, headers='keys', tablefmt='psql'))


Summirized PR by decade
+----+----------+-------------+-------------+
|    |   decade |   numMovies |       sumPr |
|----+----------+-------------+-------------|
|  0 |     1980 |        1341 | 0.0745303   |
|  1 |     1970 |         880 | 0.0646023   |
|  2 |     1960 |         642 | 0.0625806   |
|  3 |     1990 |        1404 | 0.0514167   |
|  4 |     1950 |         443 | 0.0426823   |
|  5 |     2000 |        1427 | 0.0357362   |
|  6 |     1930 |         183 | 0.0347147   |
|  7 |     1940 |         222 | 0.0286511   |
|  8 |     2010 |        1125 | 0.0202689   |
|  9 |     1920 |          56 | 0.0127906   |
| 10 |     1910 |          10 | 0.00298165  |
| 11 |     2020 |          83 | 0.00174769  |
| 12 |     1900 |           2 | 0.000436643 |
+----+----------+-------------+-------------+
Top 10 movies by PR by decade
1900
+------+-------------------------+--------+-------------+-------------+----------+---------------+
|    # | title                   |   year |   inputRefs |   

In [116]:
# Bucket the ranked movies by their 'country' field; records whose country is
# the empty string are left out.
movies_data_by_country = {}

for entry in sorted_filteres_movies_data:
    country_name = entry["country"]
    if country_name != "":
        movies_data_by_country.setdefault(country_name, []).append(entry)

# Print, per country (in first-seen order), its ten highest-PageRank movies.
print("Top 10 movies by PR by contries")
for country, data in movies_data_by_country.items():
    print(country)
    sorted_country_by_pr = sorted(data, key=lambda item: item["pRank"], reverse=True)
    country_table = pd.DataFrame.from_dict(display_data(sorted_country_by_pr, 10)).set_index('#')
    print(tabulate(country_table, headers='keys', tablefmt='psql'))


Top 10 movies by PR by contries
United States
+-----+------------------------------------+--------+-------------+------------+----------+---------------+
|   # | title                              |   year |   inputRefs |         pR |   rating | country       |
|-----+------------------------------------+--------+-------------+------------+----------+---------------|
|   1 | The Wizard of Oz                   |   1939 |        3405 | 0.00740026 |      8   | United States |
|   2 | Citizen Kane                       |   1941 |        1163 | 0.00553099 |      8.3 | United States |
|   3 | Star Wars: Episode IV - A New Hope |   1977 |        5370 | 0.00455308 |      8.6 | United States |
|   4 | King Kong                          |   1933 |        1039 | 0.00293679 |      7.9 | United States |
|   6 | Star Trek: The Original Series     |   1966 |        2694 | 0.0025909  |      8.3 | United States |
|   7 | Casablanca                         |   1942 |        1501 | 0.00243122 |      8.5 

In [117]:
def _ranked_desc_by(field):
    """Return the ranked movie list re-sorted on ``field``, largest value first."""
    return sorted(sorted_filteres_movies_data, key=lambda item: item[field], reverse=True)


sort_by_refs = _ranked_desc_by("inputRefs")
sort_by_user_reviews = _ranked_desc_by("userReviews")
sort_by_critic_reviews = _ranked_desc_by("criticReviews")


def display_data_reviews(data, num):
    """Return review-oriented display rows for the first ``num`` records of ``data``.

    Each row carries '#', 'title', 'year', 'rating', 'userReviews' and
    'criticReviews'. Titles longer than 35 characters are cut to 35 characters
    and suffixed with "...".
    """
    rows = []
    for m in data[:num]:
        title = m['title']
        if len(title) > 35:
            title = title[:35] + "..."
        rows.append({
            '#': m['#'],
            'title': title,
            'year': m['startYear'],
            'rating': m['rating'],
            'userReviews': m['userReviews'],
            'criticReviews': m['criticReviews'],
        })
    return rows


print("Top 20 by input refs")
refs_table = pd.DataFrame.from_dict(display_data(sort_by_refs, 20)).set_index('#')
print(tabulate(refs_table, headers='keys', tablefmt='psql'))
print("Top 20 by user reviews")
user_reviews_table = pd.DataFrame.from_dict(display_data_reviews(sort_by_user_reviews, 20)).set_index('#')
print(tabulate(user_reviews_table, headers='keys', tablefmt='psql'))
print("Top 20 by critic reviews")
critic_reviews_table = pd.DataFrame.from_dict(display_data_reviews(sort_by_critic_reviews, 20)).set_index('#')
print(tabulate(critic_reviews_table, headers='keys', tablefmt='psql'))



Top 20 by input refs
+-----+---------------------------------------+--------+-------------+-------------+----------+----------------+
|   # | title                                 |   year |   inputRefs |          pR |   rating | country        |
|-----+---------------------------------------+--------+-------------+-------------+----------+----------------|
|   3 | Star Wars: Episode IV - A New Hope    |   1977 |        5370 | 0.00455308  |      8.6 | United States  |
|   1 | The Wizard of Oz                      |   1939 |        3405 | 0.00740026  |      8   | United States  |
|   6 | Star Trek: The Original Series        |   1966 |        2694 | 0.0025909   |      8.3 | United States  |
|  16 | Star Wars: Episode V - The Empire ... |   1980 |        1835 | 0.00148389  |      8.7 | United States  |
|  13 | The Godfather                         |   1972 |        1785 | 0.00191543  |      9.2 | United States  |
|  14 | Jaws                                  |   1975 |        1578 | 0.00