<a href="https://colab.research.google.com/github/dp22acn/Data_Science_Project/blob/main/python_code_1_manual_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing Required Libraries

In [None]:
pip install spacy

In [None]:
!pip install pyvis


Importing Libraries

In [None]:
import pandas as pd
import networkx as nx
import numpy as np

In [None]:
# Colab-only helper module; mounting makes the Drive contents available under
# /content/drive, which is where the CSV read by the following cells lives.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Column labels imposed on the CSV. The first line of the file is skipped
# (skiprows=1) and no header is read from it (header=None), so these names
# are the only column names the frame gets.
# NOTE: 'enimies' is misspelled, but later cells index the frame with this exact
# key, so it has to stay unchanged unless every use is renamed together.
colnames = [
    'Character',
    'Alternate Names',
    'Role',
    'Faction',
    'Friends',
    'enimies',
]

df_edges = pd.read_csv(
    "/content/drive/MyDrive/Mac_data_processed_.csv",
    names=colnames,
    header=None,
    skiprows=1,
)

# Quick sanity check: (rows, columns), followed by a rich preview of the first rows.
print(df_edges.shape)
df_edges.head()

Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd

# Read the same CSV again, this time with pandas' default header handling so the
# column names come from the file itself.
file_path = "/content/drive/MyDrive/Mac_data_processed_.csv"
data = pd.read_csv(file_path)

# Preview of the leading rows.
print("First 5 rows of the dataset:", data.head(), sep="\n")

# info() writes its report directly to stdout and returns None, so it is called
# on its own line instead of being passed to print().
print("\nDataset Info:")
data.info()

# Descriptive statistics as produced by DataFrame.describe().
print("\nSummary Statistics:", data.describe(), sep="\n")

# Number of missing entries in each column.
print("\nMissing Values:", data.isna().sum(), sep="\n")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the boolean missing-value mask of `data`: one row per record,
# one column per field; the colour bar is suppressed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Data Heatmap")
plt.show()


In [None]:
# Restrict the frame to object/category columns and report their names.
categorical_frame = data.select_dtypes(include=['object', 'category'])
categorical_cols = categorical_frame.columns
print("Categorical Columns:", categorical_cols)

# Frequency table for every categorical column, one after another.
for col_name, col_values in categorical_frame.items():
    print(f"\nValue Counts for {col_name}:", col_values.value_counts(), sep="\n")


In [None]:
import pandas as pd
import networkx as nx
from collections import Counter
from google.colab import files  # Colab-only; a later export cell also relies on `files`


def _mention_counts(cell):
    """Count how often each name occurs in a comma-separated cell.

    Returns an empty Counter for non-string cells (e.g. NaN for a missing
    value). Entries are stripped of surrounding whitespace and blank entries
    (produced by stray or trailing commas) are dropped. Whole entries are
    compared, so a name is never counted because it happens to be a substring
    of a longer name.
    """
    if not isinstance(cell, str):
        return Counter()
    stripped = (part.strip() for part in cell.split(','))
    return Counter(name for name in stripped if name)


def _add_links(graph, source, cell, relationship):
    """Add one weighted edge from `source` to every distinct name listed in `cell`.

    The weight is the number of times the name is listed. If the edge already
    exists (same pair seen in an earlier row or in the other link column) only
    the weight is increased; the relationship label of the first edge is kept.
    """
    for target, mentions in _mention_counts(cell).items():
        if graph.has_edge(source, target):
            graph[source][target]['weight'] += mentions
        else:
            graph.add_edge(source, target, relationship=relationship, weight=mentions)


# Edge list loaded earlier in the notebook (columns named by `colnames`).
df = df_edges

# Directed graph: an edge points from the row's character to the listed name.
G = nx.DiGraph()

# First pass: one node per row, carrying faction and role. Missing values in
# either column are stored as 'Unknown' so every node has string attributes.
for _, row in df.iterrows():
    character = str(row['Character'])
    faction = str(row['Faction']) if pd.notnull(row['Faction']) else 'Unknown'
    role = str(row['Role']) if pd.notnull(row['Role']) else 'Unknown'
    G.add_node(character, faction=faction, role=role)

# Second pass: friendly edges first, then hostile ones. Names that never appear
# in the 'Character' column become nodes without faction/role attributes.
for _, row in df.iterrows():
    character = str(row['Character'])
    _add_links(G, character, row['Friends'], 'friendly')
    _add_links(G, character, row['enimies'], 'hostile')

# Scale weights into (0, 1] by dividing by the largest weight. Skipped when the
# graph has no edges, where max() would raise on an empty sequence.
# The loop variable is deliberately not called `data`, so the EDA DataFrame of
# that name is not overwritten.
edge_weights = nx.get_edge_attributes(G, 'weight')
if edge_weights:
    max_weight = max(edge_weights.values())
    for _, _, edge_attrs in G.edges(data=True):
        edge_attrs['weight'] = edge_attrs['weight'] / max_weight

# Save the network as a GEXF file and hand it to the browser for download.
gexf_file = "A_Mahabarata_network.gexf"
nx.write_gexf(G, gexf_file)
files.download(gexf_file)


In [None]:
# Density of the graph G built above; as the last expression of the cell,
# the value is shown as the cell's output.
nx.density(G)

In [None]:
# Each metric is computed on the directed graph G and printed right away, so
# the output of earlier metrics is visible even if a later one fails.

# 1. Degree Centrality
degree_centrality = nx.degree_centrality(G)
print(f"Degree Centrality: {degree_centrality}")

# 2. Betweenness Centrality
betweenness_centrality = nx.betweenness_centrality(G)
print(f"Betweenness Centrality: {betweenness_centrality}")

# 3. Closeness Centrality
closeness_centrality = nx.closeness_centrality(G)
print(f"Closeness Centrality: {closeness_centrality}")

# 4. Eigenvector Centrality
eigenvector_centrality = nx.eigenvector_centrality(G)
print(f"Eigenvector Centrality: {eigenvector_centrality}")

# 5. Clustering Coefficient (per node, then the graph-wide average)
clustering_coefficient = nx.clustering(G)
print(f"Clustering Coefficient: {clustering_coefficient}")
average_clustering = nx.average_clustering(G)
print(f"Average Clustering Coefficient: {average_clustering}")

# 6. Density
density = nx.density(G)
print(f"Density of the Graph: {density}")


In [None]:
def _top_five(scores):
    """Return the five (node, score) pairs with the highest score, highest first."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:5]


def _show_ranking(heading, ranking, value_label):
    """Print `heading`, then one 'Node: ..., <value_label>: ...' line per pair."""
    print(heading)
    for node, value in ranking:
        print(f"Node: {node}, {value_label}: {value}")


# 1. Degree Centrality
degree_centrality = nx.degree_centrality(G)
top_degree_centrality = _top_five(degree_centrality)
_show_ranking("Top 5 Nodes by Degree Centrality:", top_degree_centrality, "Centrality")

# 2. Betweenness Centrality
betweenness_centrality = nx.betweenness_centrality(G)
top_betweenness_centrality = _top_five(betweenness_centrality)
_show_ranking("\nTop 5 Nodes by Betweenness Centrality:", top_betweenness_centrality, "Centrality")

# 3. Closeness Centrality
closeness_centrality = nx.closeness_centrality(G)
top_closeness_centrality = _top_five(closeness_centrality)
_show_ranking("\nTop 5 Nodes by Closeness Centrality:", top_closeness_centrality, "Centrality")

# 4. Eigenvector Centrality
eigenvector_centrality = nx.eigenvector_centrality(G)
top_eigenvector_centrality = _top_five(eigenvector_centrality)
_show_ranking("\nTop 5 Nodes by Eigenvector Centrality:", top_eigenvector_centrality, "Centrality")

# 5. Clustering Coefficient
clustering_coefficient = nx.clustering(G)
top_clustering_coefficient = _top_five(clustering_coefficient)
_show_ranking("\nTop 5 Nodes by Clustering Coefficient:", top_clustering_coefficient, "Coefficient")

# 6. Density
density = nx.density(G)
print("\nDensity of the Graph:", density)

# 7. Average Clustering Coefficient
average_clustering = nx.average_clustering(G)
print("Average Clustering Coefficient:", average_clustering)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1-5. One bar chart per per-node metric computed in the previous cells.
# Each entry: (title, {node: score} dict, bar colour, y-axis label).
per_node_metrics = [
    ('Degree Centrality', degree_centrality, 'blue', 'Centrality'),
    ('Betweenness Centrality', betweenness_centrality, 'green', 'Centrality'),
    ('Closeness Centrality', closeness_centrality, 'orange', 'Centrality'),
    ('Eigenvector Centrality', eigenvector_centrality, 'purple', 'Centrality'),
    ('Clustering Coefficient', clustering_coefficient, 'cyan', 'Coefficient'),
]

for title, scores, colour, y_label in per_node_metrics:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Bars are placed by position; the dict view is turned into a list explicitly.
    ax.bar(range(len(scores)), list(scores.values()), color=colour, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Nodes')
    ax.set_ylabel(y_label)
    # Keep one tick per node but hide the labels (node names are not shown).
    ax.set_xticks(range(len(scores)))
    ax.tick_params(axis='x', labelbottom=False)
    plt.show()

# 6. Average Clustering Coefficient and Density.
# These are two independent graph-level values, not parts of a whole, so they
# are drawn as side-by-side bars annotated with their actual values. A pie
# chart would only show each value as a share of their sum.
metrics = ['Average Clustering Coefficient', 'Density']
values = [average_clustering, density]

fig, ax = plt.subplots(figsize=(6, 6))
bars = ax.bar(metrics, values, color=['pink', 'lightgreen'])
for bar, value in zip(bars, values):
    ax.text(bar.get_x() + bar.get_width() / 2, value, f"{value:.3f}",
            ha='center', va='bottom')
ax.set_title('Graph Metrics')
ax.set_ylabel('Value')
plt.show()


In [None]:
# Triadic census of the directed graph G.
triadic_census = nx.triadic_census(G)
print("Triadic Census for Directed Graph:", triadic_census)

# Triangle counting is done on an undirected view of the network.
G_undirected = G.to_undirected()

# Number of triangles each node takes part in.
triangle_counts = nx.triangles(G_undirected)
print("Triangle Counts per Node:", triangle_counts)

# Every triangle is counted once at each of its three corners.
total_triangles = sum(triangle_counts.values()) // 3
print("Total Number of Triangles:", total_triangles)

# Clustering on the undirected graph. Stored under separate names so the
# values computed on the directed graph (`clustering_coefficient`,
# `average_clustering`), which the plotting cell reads, are not overwritten
# when cells are re-run out of order.
clustering_coefficient_undirected = nx.clustering(G_undirected)
print("Clustering Coefficient per Node:", clustering_coefficient_undirected)

average_clustering_undirected = nx.average_clustering(G_undirected)
print("Average Clustering Coefficient:", average_clustering_undirected)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Split the triadic census dictionary into parallel lists for plotting.
triad_types = list(triadic_census.keys())
triad_counts = list(triadic_census.values())

fig, ax = plt.subplots(figsize=(10, 6))
# `palette` is tied to `hue`: recent seaborn releases deprecate passing a
# palette without a hue variable. The x variable doubles as hue so each triad
# type keeps its own colour; dodge=False keeps the bars centred on their tick.
sns.barplot(
    x=triad_types,
    y=triad_counts,
    hue=triad_types,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the x-axis labels; drop it if one was drawn
# (whether seaborn draws it depends on the installed version).
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

ax.set_title("Triadic Census Distribution")
ax.set_xlabel("Triad Type")
ax.set_ylabel("Count")
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Imported here as well so this cell does not depend on the graph-building
# cell having imported `files` first.
from google.colab import files

triads_gexf_file = "A_Mahabarata_network_with_triads.gexf"

# G has not changed since the first export, so writing it as-is would produce
# the same network under a new name. Attach the per-node triangle count
# (computed on the undirected view, as in the triad analysis) to a copy, so the
# exported file actually carries triad information and G itself is untouched.
# NOTE(review): assumes per-node triangle counts are the triad data this export
# is meant to contain — confirm.
G_with_triads = G.copy()
triangles_per_node = nx.triangles(G.to_undirected())
nx.set_node_attributes(
    G_with_triads,
    {node: int(count) for node, count in triangles_per_node.items()},
    'triangles',
)

nx.write_gexf(G_with_triads, triads_gexf_file)
files.download(triads_gexf_file)


In [None]:
# Relationship counts for Draupadi, intended as node-size inputs.
# All three figures are based on edges leaving the node: on a DiGraph,
# neighbors() yields successors only, and out_edges() lists outgoing edges.
draupadi_node = "Draupadi"

# 1. Every outgoing link, regardless of type.
all_relations = sum(1 for _ in G.neighbors(draupadi_node))

# Relationship label of each outgoing edge, gathered in a single pass.
outgoing_labels = [
    attrs['relationship']
    for _, _, attrs in G.out_edges(draupadi_node, data=True)
]

# 2. Outgoing links labelled 'hostile'.
hostile_relations = outgoing_labels.count('hostile')

# 3. Outgoing links labelled 'friendly'.
friendly_relations = outgoing_labels.count('friendly')

# Output the results
print(f"Draupadi's total relations (all): {all_relations}")
print(f"Draupadi's hostile relations: {hostile_relations}")
print(f"Draupadi's friendly relations: {friendly_relations}")