In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# This analysis used Graph theory to explore complex relationships


<p> It litereally follows every step of the tutorial 
http://programminghistorian.github.io/ph-submissions/lessons/published/exploring-and-analyzing-network-data-with-python
</p>

In [43]:
df = pd.read_csv("Z:00_ETL/CustomerBehaviour/rawdata2.txt", sep = "\t", encoding = "ISO-8859-1")

In [44]:
print(df.shape)
df.tail()

(15695999, 9)


Unnamed: 0,series_or_movie_name,encrypted_customer_id,offer_group_desc,first_genre,entity_type,content_age,really_frist_stream,transaction_date_local,display_price
15695994,Royal Pains [dt./OV],AWHGX01JJFP1A,PRIME,comedy,TV Show,,2016-10-01,,
15695995,Cloverfield [dt./OV],AXNCSJSOYNU7Q,PRIME,action,Movie,,2015-11-14,,
15695996,Mamma Mia! - Der Film [dt./OV],AXNCSJSOYNU7Q,PRIME,comedy,Movie,,2017-12-23,,
15695997,Walhalla Rising,AY4FUNR962OYO,PRIME,adventure,Movie,,2014-12-21,,
15695998,The Zero Theorem [dt./OV],AYYHG5BNPF7M5,PRIME,comedy,Movie,,2016-12-08,,


In [45]:
df['encrypted_customer_id'].nunique()

184163

In [46]:
df['series_or_movie_name'].nunique()

36856

# Take sample to create toy sample

In [47]:
toy = df.sample(frac=0.0000005)

In [48]:
toy.shape
toy.head()

Unnamed: 0,series_or_movie_name,encrypted_customer_id,offer_group_desc,first_genre,entity_type,content_age,really_frist_stream,transaction_date_local,display_price
3237084,MÃ¤rchen aus 1001 Nacht,A13SG7MWO64BMR,PRIME,,Educational,,2018-02-11,,
3265432,ErlÃ¶se Uns Von Dem BÃ¶sen [dt./OV],AMTRBK7UL7LOJ,PRIME,horror,Movie,,2016-05-31,,
15090367,Roast of Charlie Sheen,A3DJP5WPHGPPTU,PRIME,,Movie,,2016-04-09,,
4773651,The Big Bang Theory,AU3F5QY7FFK5K,PRIME,comedy,TV Show,,2016-11-08,,
4168563,Chappie [Ultra HD],A175P34LI5EQPN,PRIME,action,Movie,,2017-01-21,,


In [49]:
toy['encrypted_customer_id'].nunique()

8

In [50]:
toy['encrypted_customer_id'].unique()

array(['A13SG7MWO64BMR', 'AMTRBK7UL7LOJ', 'A3DJP5WPHGPPTU',
       'AU3F5QY7FFK5K', 'A175P34LI5EQPN', 'ADBNTGD91UNT7',
       'A1VW6JCW93A34W', 'A12Q3AF8Q7TQ71'], dtype=object)

In [55]:
sample_df = df[df['encrypted_customer_id'].isin(toy['encrypted_customer_id'].unique())]

In [68]:
sample_df.head()

Unnamed: 0,series_or_movie_name,encrypted_customer_id,offer_group_desc,first_genre,entity_type,content_age,really_frist_stream,transaction_date_local,display_price
8560,14 Blades,A13SG7MWO64BMR,PRIME,action,Movie,,2015-01-03,,
8561,Hochzeitsnacht Im Geisterschloss,A13SG7MWO64BMR,PRIME,comedy,Movie,,2016-10-15,,
10422,Mama Muh und die KrÃ¤he [dt./OV],A12Q3AF8Q7TQ71,PRIME,animation,Movie,,2016-12-09,,
32381,Die kleine Raupe Nimmersatt und vier weitere l...,AMTRBK7UL7LOJ,PRIME,,TV Show,,2016-06-05,,
41778,Die Geier warten schon,A13SG7MWO64BMR,PRIME,western,Movie,,2014-12-05,,


In [57]:
sample_df['first_genre'].unique()

array(['action', 'comedy', 'animation', nan, 'western', 'drama', 'crime',
       'adventure', 'biography', 'horror', 'family', 'documentary',
       'adult', 'mystery', 'fantasy', 'history', 'music', 'romance',
       'sci_fi', 'thriller'], dtype=object)

In [59]:
sample_df['entity_type'].unique()

array(['Movie', 'TV Show', 'Other', 'Short Film', 'Review', 'Clip',
       'Educational', 'Default', nan, 'News and Commentary', 'Ambient',
       'Concert/Performance'], dtype=object)

In [60]:
sample_df['offer_group_desc'].unique()

array(['PRIME', 'RENTAL', 'PURCHASE'], dtype=object)

# Construct nodes and edges

In [136]:
nodereader = sample_df[['series_or_movie_name','offer_group_desc','first_genre','entity_type','content_age']]

In [137]:
edgereader = sample_df[['series_or_movie_name','encrypted_customer_id']]
edgereader.columns = ['source','target']

In [138]:
nodereader.head()

Unnamed: 0,series_or_movie_name,offer_group_desc,first_genre,entity_type,content_age
8560,14 Blades,PRIME,action,Movie,
8561,Hochzeitsnacht Im Geisterschloss,PRIME,comedy,Movie,
10422,Mama Muh und die KrÃ¤he [dt./OV],PRIME,animation,Movie,
32381,Die kleine Raupe Nimmersatt und vier weitere l...,PRIME,,TV Show,
41778,Die Geier warten schon,PRIME,western,Movie,


In [139]:
edgereader.head()

Unnamed: 0,source,target
8560,14 Blades,A13SG7MWO64BMR
8561,Hochzeitsnacht Im Geisterschloss,A13SG7MWO64BMR
10422,Mama Muh und die KrÃ¤he [dt./OV],A12Q3AF8Q7TQ71
32381,Die kleine Raupe Nimmersatt und vier weitere l...,AMTRBK7UL7LOJ
41778,Die Geier warten schon,A13SG7MWO64BMR


# Import networkx

In [140]:
import networkx as nx

from operator import itemgetter
import community

In [141]:
node_names = [i for i in nodereader.iloc[:,0]]

In [142]:
node_names[0:5]

['14 Blades',
 'Hochzeitsnacht Im Geisterschloss',
 'Mama Muh und die KrÃ¤he [dt./OV]',
 'Die kleine Raupe Nimmersatt und vier weitere lustige Abenteuer',
 'Die Geier warten schon']

In [143]:
edges = list(zip(edgereader.source, edgereader.target)) #create tuple for every pair from the dataset

In [144]:
print(len(node_names))
print(len(edges))

1684
1684


# Basics of NetworkX: Creating the Graph

In [134]:
G = nx.Graph() #creates empty graph, initiliasize a graph object

#Add list of nodes and edges
G.add_nodes_from(node_names)
G.add_edges_from(edges)

In [135]:
print(nx.info(G)) #print basic info about the newly create graph

Name: 
Type: Graph
Number of nodes: 1407
Number of edges: 1673
Average degree:   2.3781


# Adding attributes

In [171]:
#initialize empty dictionaries

offer_group_desc_dict = {}
first_genre_dict = {}
entity_type_dict = {}

In [172]:
#Make nodes generator, i.e. convert each row of dataframe into a list
nodes = nodereader.values.tolist() 

In [173]:
nodes[0:4]

[['14 Blades', 'PRIME', 'action', 'Movie', nan],
 ['Hochzeitsnacht Im Geisterschloss', 'PRIME', 'comedy', 'Movie', nan],
 ['Mama Muh und die KrÃ¤he [dt./OV]', 'PRIME', 'animation', 'Movie', nan],
 ['Die kleine Raupe Nimmersatt und vier weitere lustige Abenteuer',
  'PRIME',
  nan,
  'TV Show',
  nan]]

In [174]:
for node in nodes:
    offer_group_desc_dict[node[0]] = node[1]
    first_genre_dict[node[0]] = node[2]
    entity_type_dict[node[0]] = node[3]

<p> After having each attribute in a node, add attributes to Graph using set_node_attributes function
which takes 3 variables : the graph to which attributes are added , name of the attribute, dict of attributes </p>

In [178]:
nx.set_node_attributes(G,  offer_group_desc_dict, 'offer_group')
nx.set_node_attributes(G, first_genre_dict, 'genre')
nx.set_node_attributes(G, entity_type_dict, 'entity_type')

<p> Now all nodes have attributes, which can be accessed at any time </p>
<p> For example, print all genres of the nodes by looping through them and accessing genre attribute </p>

In [183]:
# for n in G.nodes():
#     print(n, G.node[n]['genre'])

# Metrics available in Networkx


<p> A good metric to begin with is network density. This is simply the ratio of actual edges in the network to all possible edges in the network. In an undirected network like this one, there could be a single edge between any two nodes, but as you saw in the visualization, only a few of those possible edges are actually present. Network density gives you a quick sense of how closely knit your network is.
</p>

In [181]:
density = nx.density(G)
print("Network density: ", density)

Network density:  0.0016914007487456035


<p>
In this case, the density of our network is quite low. On a scale of 0 to 1, not a very dense network, which comports with what you can see in the visualization.8 A 0 would mean that there are no connections at all, and a 1 would indicate that all possible edges are present (a perfectly connected network): this Quaker network is on the lower end of that scale, but still far from 0.
</p>


<p> A shortest path measurement is a bit more complex. It calculates the shortest possible series of nodes and edges that stand between any two nodes, something hard to see in large network visualizations. This measure is essentially finding friends-of-friends—if my mother knows someone that I don’t, then mom is the shortest path between me and that person.
</p>

# Export graph

In [182]:
nx.write_gexf(G, 'sample_movies.gexf')