In [1]:
import csv
import networkx
import itertools
from statistics import mean, mode
import matplotlib.pyplot as plt

In [2]:
# Read every row of the semicolon-delimited cast listing into memory.
# newline='' is what the csv module documentation asks for when handing it a
# file object, so that newlines embedded in quoted fields are parsed correctly.
with open('casts.csv', newline='') as csvDataFile:
    csvReader = csv.reader(csvDataFile, delimiter=';')
    # Each element of `data` is one CSV row as a list of strings.
    data = list(csvReader)

In [3]:
# Build two structures in a single pass over the CSV rows:
#   * dic_actors: row[1] -> list of row[2] values, in file order
#   * g: undirected graph holding one node per distinct row[2] value
# presumably column 1 is the film title and column 2 the actor name (the
# 'Pygmalion' lookup below returns a list of person names) -- verify against the CSV.
# NOTE(review): the centrality output further down lists an entry named 's a',
# which looks like a placeholder rather than a real actor -- confirm and
# consider filtering it out here.
dic_actors = {}
g = networkx.Graph()
for row in data:
    actor_name = row[2]
    g.add_node(actor_name)
    dic_actors.setdefault(row[1], []).append(actor_name)
# Spot check: display the cast collected for one film.
dic_actors['Pygmalion']

['Leslie Howard',
 'Wendy Hiller',
 'Wilfrid Lawson',
 'Scott Sunderland',
 'Marie Lohr',
 'David Tree',
 'Esme Percy',
 'Everley Gregg',
 'Jean Cadell',
 'Lily Bouwmeester',
 'Eduard Verkade',
 'Johann deMeester',
 'Jenny Ingo',
 'Gustaf Grundgens']

In [4]:
# Link every pair of actors listed for the same film. A film with fewer than
# two listed actors produces no pairs and therefore contributes no edges.
for cast in dic_actors.values():
    g.add_edges_from(itertools.combinations(cast, 2))

# Compute general statistics
### Number of nodes/edges

In [5]:
nodes = g.number_of_nodes()
edges = g.number_of_edges()
# Density = edges present / number of distinct unordered node pairs.
possible_edges = nodes * (nodes - 1) / 2
density = edges / possible_edges
number_of_components = networkx.number_connected_components(g)
# Print the summary as label/value pairs, one per line.
for label, value in (
    ("Number of nodes: ", nodes),
    ("Number of edges: ", edges),
    ("Density: ", density),
    ("Number of components: ", number_of_components),
):
    print(label, value)

Number of nodes:  16615
Number of edges:  156042
Density:  0.0011305686849167414
Number of components:  637


In [6]:
# Centrality measures to compute. Each result is stored on the nodes under the
# measure's function name (e.g. 'degree_centrality') and the highest-scoring
# nodes are printed.
centr = [networkx.degree_centrality, networkx.eigenvector_centrality]
for c in centr:
    centrality_by_actor = c(g)
    for actor, centrality_val in centrality_by_actor.items():
        # Graph.node was removed in networkx 2.4; Graph.nodes is the
        # supported way to reach a node's attribute dict.
        g.nodes[actor][c.__name__] = centrality_val

    # Highest centrality first.
    centrality_sort = sorted(centrality_by_actor.items(), key=lambda element: element[1], reverse=True)
    print("\n",c.__name__)
    # Slice instead of indexing range(10) so a graph with fewer than ten
    # nodes prints what it has rather than raising IndexError.
    for name, value in centrality_sort[:10]:
        print(name, ": ", value)


 degree_centrality
s a :  0.19934994582881907
Humphrey Bogart :  0.025941976646201997
James Stewart :  0.022511135187191524
Gary Cooper :  0.022270374383050438
John Gielgud :  0.022270374383050438
John Carradine :  0.022089803779944624
Peter Lorre :  0.021668472372697724
C.Aubrey Smith :  0.02028409774888648
Henry Fonda :  0.019622005537498495
Burt Lancaster :  0.018779342723004695

 eigenvector_centrality
s a :  0.32924414626843707
C.Aubrey Smith :  0.08663264933518978
John Carradine :  0.08522237742466378
James Stewart :  0.08332764191436194
John Gielgud :  0.08103633788469895
Peter Lorre :  0.07850585718647131
Gary Cooper :  0.07744182444301097
Basil Rathbone :  0.07490069815853961
Henry Fonda :  0.07489297835968156
Humphrey Bogart :  0.07423477650463556


# Describe top clusters/communities

In [7]:
# Map each node to a 1-based id of a k-clique community (k = 10) it belongs to.
# A node that appears in several communities keeps the id of the last one
# yielded, because later assignments overwrite earlier ones.
communities = {}
k_clique_groups = networkx.algorithms.community.k_clique_communities(g, 10)
for cid, community in enumerate(k_clique_groups, start=1):
    for node in community:
        communities[node] = cid

In [8]:
# Invert the node -> community-id mapping into community-id -> [nodes].
communities_actors = {}
for key, val in communities.items():
    communities_actors.setdefault(val, []).append(key)
# Largest communities first.
community_sorted = sorted(communities_actors.items(), key=lambda element: len(element[1]), reverse=True)
# Slice instead of indexing range(10): when fewer than ten communities were
# found this prints all of them rather than raising IndexError.
for _, members in community_sorted[:10]:
    print("\nActors in community: ", len(members))
    print("\n".join(members))


Actors in community:  33
Lela Mourad
Bruce Robinson
Antionio Pierfrederici
Natasha Peryy
Sir Godrey Teale
Esmeralda Ruspoli
Harry Hilliard
Maria Gasperini
George A. Lessey
Francesca Bertini
John McEnery
Mario Caserini
Virginia Hammond
Rosemarie Dexter
Conway Tearle
Roberto Bisacco
Leonard Whiting
Mary Malone
Gustav Serena
Julia M. Taylor
Beverly Bane
Gulio Garbinetti
Ibrahim Hamouda
Susan Shentall
Enzo Fiermonte
Bill Travers
Florence Lawrence
Meynier
Paul Panzer
Aldo Zollo
Nietta Zocchi
Paul Hardwick
td> Claire Danes<

Actors in community:  26
Louis Armstrong
George Baker
Anouska Hempel
Irvin Allen
Steve Plytas
Sylvana Henriques
Ingri Black
Dani Sheridan
Bernard Horsfall
Helena Ronee
Julie Ege
George Lazenby
Terry Mountain
Besie Love
Jenny Hanley
Ilse Steppat
Geoffrey Cheshire
Gabrielle Ferzetti
Brian Worth
Diana Rigg
James Bree
Mona Chong
Zara
Yuri Borienko
Virginia North
Catherina vonSchell

Actors in community:  25
Nelson Eddy
Heather Sears
J.Edward Bromberg
Chester Concklin
Lon Ch

# Describe "Kevin Bacon" numbers

In [9]:
def KB_numbers(actor):
    """Store and summarise shortest-path distances from ``actor`` in graph ``g``.

    For every node of the module-level graph ``g`` an attribute named
    ``<token>_number`` is written, where ``<token>`` is the second
    space-separated word of ``actor`` (or the whole name when it has only one
    word). Nodes reachable from ``actor`` get their hop count; all other nodes
    get ``-1``.

    Printed output: the actor's name, the ten closest other nodes, the ten
    farthest reachable nodes, and the mean and mode of the distances. The
    mean and mode are taken over every reachable node, including ``actor``
    itself at distance 0.

    Parameters
    ----------
    actor : str
        Name of a node in ``g`` to measure distances from.

    Returns
    -------
    None
    """
    print(actor)
    shortest_path = networkx.single_source_shortest_path_length(g, actor)

    # Fall back to the only word for single-word names instead of letting
    # split(" ")[1] raise IndexError.
    name_parts = actor.split(" ")
    attr_name = (name_parts[1] if len(name_parts) > 1 else name_parts[0]) + "_number"

    # Graph.node was removed in networkx 2.4; use Graph.nodes.
    # Mark every node as unreachable first, then overwrite the reachable ones.
    for node in g.nodes():
        g.nodes[node][attr_name] = -1
    for node, length in shortest_path.items():
        g.nodes[node][attr_name] = length
    length_list = list(shortest_path.values())

    # Order explicitly by distance (stable sort) so the two listings below do
    # not depend on the iteration order of the returned mapping.
    ranked = sorted(shortest_path.items(), key=lambda item: item[1])

    # The first entry is the source itself (distance 0), hence the 1: offset.
    # The nearest actors carry the LOWEST numbers; the original labels were
    # swapped.
    print("Top actors with the lowest number")
    for items in ranked[1:11]:
        print(items)
    print("Top actors with the highest number")
    for items in ranked[-10:]:
        print(items)
    print("Mean: ", mean(length_list))
    print("Mode: ", mode(length_list), '\n')

In [10]:
# Reference actors whose distance statistics are reported, in this order.
actors = ['Jack Nicholson', 'Humphrey Bogart']
for reference_actor in actors:
    KB_numbers(reference_actor)

Jack Nicholson
Top actors with the highest number
('Marlon Brando', 1)
('Randy Quaid', 1)
('Kathleen Lloyd', 1)
('Frederic Forrest', 1)
('Harry Dean Stanton', 1)
('John McLiam', 1)
('Karen Black', 1)
('Karen Anspach', 1)
('Lorna Thayer', 1)
('Billy "Green" Bush', 1)
Top actors with the lowest number
('Cecilia Roth', 5)
('Leonor Benedetto', 5)
('Silvia Pal', 5)
('Haren Chatterjee', 5)
('Haradhan Banerjee', 5)
('Paredes', 5)
('Antonia SanJuan', 5)
('Elisa Touati', 5)
('Marbel Verdu', 5)
('Maria deMederios', 5)
Mean:  2.579803478953658
Mode:  3 

Humphrey Bogart
Top actors with the highest number
('Leslie Howard', 1)
('Bette Davis', 1)
('Genevieve Tobin', 1)
('Dick Foran', 1)
('Joe Sawyer', 1)
('Porter Hall', 1)
('Charley Grapewin', 1)
('Paul Harvey', 1)
('s a', 1)
('Erin OBrienMoore', 1)
Top actors with the lowest number
('Haradhan Banerjee', 5)
('Robert Castle', 5)
('Barbara Dennek', 5)
('Jacqueline Lecomte', 5)
('Henri Piccoli', 5)
('Paredes', 6)
('Antonia SanJuan', 6)
('Elisa Touati',

* Every node is identified by an actor name and carries all of the calculated attributes.

In [11]:
# Show one (name, attribute-dict) pair as a sample of what each node carries.
# Graph.node was removed in networkx 2.4; nodes(data=True) yields the same
# pairs, and next(iter(...)) avoids building a list of every node just to
# read the first one.
next(iter(g.nodes(data=True)))

('Leslie Howard',
 {'Bogart_number': 1,
  'Nicholson_number': 2,
  'degree_centrality': 0.008727579150114361,
  'eigenvector_centrality': 0.039725355713338094})

In [12]:
# Export the full co-starring graph to "g_full.gexf" for use outside the notebook.
# NOTE(review): presumably the per-node attributes set in earlier cells are
# serialised as well -- confirm by inspecting the exported file.
networkx.write_gexf(g, "g_full.gexf")

## Visualisations from networkx.
* A graph of the actors from films that list exactly 3 actors.

In [None]:
# Keep only the films that list exactly three actors.
short_dic = {title: cast for title, cast in dic_actors.items() if len(cast) == 3}

# Small graph for plotting: it contains only the actors of those films, with
# an edge between every pair from the same film.
# The original cell also looped over `data` calling g.add_node(...), which
# touched the full graph `g` (not g_short) and only re-added nodes it already
# held; that misleading no-op has been removed.
g_short = networkx.Graph()
for cast in short_dic.values():
    g_short.add_edges_from(itertools.combinations(cast, 2))

In [None]:
# Clear the current figure so nothing drawn earlier ends up in the image.
plt.clf()
# Small markers and a tiny font keep the labelled graph legible at high DPI.
draw_options = {"with_labels": True, "node_size": 20, "font_size": 5}
networkx.draw_networkx(g_short, **draw_options)
plt.savefig("graph_short.png", dpi=1000)