In [2]:
%matplotlib inline
import networkx as nx
import pandas as pd

In [3]:
import networkx as nx
from networkx.algorithms import approximation
import matplotlib.pyplot as plt
from collections import Counter
import random
from itertools import combinations, groupby

import statistics

def local_summaries(G, directed=False):
    """Compute per-node (local) summaries of a graph.

    Parameters
    ----------
    G : graph object
        Passed to the networkx betweenness, eigenvector and closeness
        centrality routines and queried for node degrees.
    directed : bool, default False
        When True, report in-degree and out-degree separately; otherwise
        report the plain degree.

    Returns
    -------
    zip
        A one-shot iterator of tuples, one per node:
        ``(betweenness, eigenvector, closeness, degree)`` when ``directed`` is
        False, or ``(betweenness, eigenvector, closeness, in_degree,
        out_degree)`` when it is True.
    """
    betweenness_centrality = nx.centrality.betweenness_centrality(G)
    eigenvector_centrality = nx.centrality.eigenvector_centrality(G)
    closeness_centrality = nx.centrality.closeness_centrality(G)
    # The columns are zipped positionally; this assumes the centrality
    # dictionaries iterate in the same node order as G.nodes.
    centralities = (
        betweenness_centrality.values(),
        eigenvector_centrality.values(),
        closeness_centrality.values(),
    )
    if directed:
        in_degrees = [G.in_degree(n) for n in G.nodes]
        # Fixed: this column previously repeated the in-degree.
        out_degrees = [G.out_degree(n) for n in G.nodes]
        return zip(*centralities, in_degrees, out_degrees)
    degrees = [G.degree(n) for n in G.nodes]
    return zip(*centralities, degrees)

def global_summaries(G):
    """Print whole-graph summaries of ``G``.

    Reports the diameter, the approximate average clustering coefficient, the
    node and edge counts, the number of connected components and the size of
    the largest connected component. Nothing is returned.

    Parameters
    ----------
    G : graph object
        Passed to the networkx diameter, approximate average-clustering and
        connected-component routines.
    """
    try:
        diameter = nx.algorithms.distance_measures.diameter(G)
    except nx.NetworkXError:
        # Only the library's own error is treated as "not connected"; a bare
        # except would also swallow KeyboardInterrupt during this slow call.
        diameter = "Found infinite path length because the graph is not connected !"
    clustering_coefficient = nx.algorithms.approximation.clustering_coefficient.average_clustering(G)
    # Fixed: the node and edge counts were swapped.
    number_of_nodes = G.number_of_nodes()
    number_of_edges = G.number_of_edges()
    number_of_connected_components = nx.number_connected_components(G)
    largest_connected_component = max(
        (len(component) for component in nx.connected_components(G)),
        default=0,
    )
    print("##### Global Summaries #####")
    print("Diameter : ",diameter)
    # The coefficient was computed but never reported before.
    print("Average Clustering Coefficient (approx.) : ",clustering_coefficient)
    print("Number of Nodes : ",number_of_nodes)
    print("Number of Edges : ",number_of_edges)
    print("Number of Connected Components : ",number_of_connected_components)
    print("Size of the Largest Connected Compopnent : ",largest_connected_component)
    
def _plot_degree_histogram(degree_sequence, xlabel, title):
    """Draw a bar chart of how many nodes have each degree value."""
    counts = Counter(degree_sequence)
    fig, ax = plt.subplots()
    ax.bar(list(counts.keys()), list(counts.values()))
    # x carries the degree value, y the number of nodes having that degree.
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of nodes')
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


def explore_and_summarize_network(edgelist, vertices, subgraph, directed=False):
    """
    Concisely summarizes any induced subgraph of the input network.

    Draws the graph, plots its degree histogram(s) and prints the global
    summaries.

    Parameters
    ----------
    edgelist : iterable of edges
        Edges of the full network; ``edge[0]`` and ``edge[1]`` are the
        endpoints.
    vertices : iterable of nodes
        Nodes of the full network. Used only when ``subgraph`` is None.
    subgraph : iterable of nodes or None
        When given, only these nodes and the edges with both endpoints among
        them are kept (the induced subgraph).
    directed : bool, default False
        When True, build a directed graph and plot in-degree and out-degree
        histograms instead of a single degree histogram.
    """
    G = nx.DiGraph() if directed else nx.Graph()
    if subgraph is not None:
        nodes = list(subgraph)
        members = set(nodes)  # constant-time membership test per edge endpoint
        G.add_nodes_from(nodes)
        G.add_edges_from(
            edge for edge in edgelist
            if edge[0] in members and edge[1] in members
        )
    else:
        G.add_nodes_from(vertices)
        G.add_edges_from(edgelist)

    # (a)
    nx.draw(G,pos=nx.spring_layout(G),node_color='maroon',
        node_size=20,
        edge_color="gray",
        width=0.5)
    plt.show()
    # (b)
    # Degrees are read straight from the graph; computing every centrality
    # only to keep the degree column was wasted work.
    if directed:
        _plot_degree_histogram([d for _, d in G.in_degree()],
                               'In-degree', r'Histogram of in_degrees')
        _plot_degree_histogram([d for _, d in G.out_degree()],
                               'Out-degree', r'Histogram of out_degrees')
    else:
        _plot_degree_histogram([d for _, d in G.degree()],
                               'Degree', r'Histogram of degrees')
    # (c)

    # Print Global Summaries
    # global_summaries uses connected-component routines, which networkx only
    # provides for undirected graphs, so a directed graph is summarised through
    # its undirected projection.
    global_summaries(G.to_undirected() if directed else G)

In [4]:
import os
import glob
import networkx as nx

def load_edges_from_folder(folder_path):
    """Collect the edges of every ``.edges`` file under ``folder_path``.

    The folder and all of its subfolders are searched; each matching file is
    parsed with ``load_edges`` and the results are appended to one list, which
    is returned.
    """
    pattern = os.path.join(folder_path, '**', '*.edges')
    collected = []
    for path in glob.glob(pattern, recursive=True):
        collected += load_edges(path)
    return collected

def load_edges(filename):
    """Read an edge list file into a list of ``(int, int)`` tuples.

    Each line is split on whitespace and its first two tokens are parsed as
    integers; any further tokens are ignored. Blank lines are skipped
    silently. Lines with fewer than two tokens or with non-integer tokens are
    reported on stdout and skipped.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of tuple
        One ``(source, target)`` pair per valid line, in file order.
    """
    edges = []
    with open(filename, 'r') as file:
        for line in file:
            nodes = line.split()
            if not nodes:
                continue  # blank line: nothing to report
            try:
                edge = (int(nodes[0]), int(nodes[1]))
            except (ValueError, IndexError):
                # IndexError covers a line with a single token, which
                # previously aborted the whole load.
                print(f"Skipping malformed line in {filename}: {line.strip()}")
                continue
            edges.append(edge)
    return edges

# Usage
folder_path = './twitter'
all_edges = load_edges_from_folder(folder_path)
if not all_edges:
    # An empty edge list otherwise passes silently and yields an empty graph
    # with no PageRank scores, which is easy to miss further down.
    print(f"Warning: no edges were loaded from {folder_path!r}; "
          "check that the folder exists and contains .edges files.")
# Create a NetworkX graph with all edges
graph = nx.DiGraph()  # Directed graph for Twitter
graph.add_edges_from(all_edges)

# Compute node degrees and PageRank
degrees = dict(graph.degree())
pagerank = nx.pagerank(graph)

# Print graph information
print("Total edges in the graph:", graph.number_of_edges())
print("Total nodes in the graph:", graph.number_of_nodes())

# Print a sample of the PageRank scores (first 10 items)
print("\nSample PageRank scores:")
for node, score in list(pagerank.items())[:10]:
    print(f"Node {node}: {score}")
# explore_and_summarize_network(edgelist = graph.edges(), vertices=graph.nodes(), subgraph=None)


Total edges in the graph: 0
Total nodes in the graph: 0

Sample PageRank scores:


In [5]:
def load_circles_from_folder(folder_path, qualify_names=False):
    """Load every ``.circles`` file under ``folder_path`` into one dictionary.

    The folder and all of its subfolders are searched. Files are processed in
    sorted path order so the result is reproducible across runs.

    Parameters
    ----------
    folder_path : str
        Root folder to search.
    qualify_names : bool, default False
        When False, circle names are used as keys unchanged, so a circle name
        that occurs in several files keeps only the members from the last
        file in sorted order. When True, every key is prefixed with the file's
        base name (without extension) as ``"<file>:<circle>"`` so circles from
        different files never overwrite each other.

    Returns
    -------
    dict
        Maps circle name to the value produced by ``load_circles`` for it.
    """
    all_circles = {}
    # Recursively search for all .circles files in the folder and subfolders
    circles_files = sorted(
        glob.glob(os.path.join(folder_path, '**', '*.circles'), recursive=True)
    )

    # Iterate over each file and load circles
    for circles_file in circles_files:
        circles = load_circles(circles_file)
        if qualify_names:
            owner = os.path.splitext(os.path.basename(circles_file))[0]
            circles = {f"{owner}:{name}": members for name, members in circles.items()}
        # Update the overall dictionary of circles (later files win on equal keys)
        all_circles.update(circles)

    return all_circles

def load_circles(filename):
    """Read a circles file into a ``{circle_name: [member, ...]}`` dictionary.

    Each non-blank line is split on whitespace: the first token is the circle
    name and the remaining tokens are parsed as integer member ids. Blank
    lines are skipped. A name that appears on several lines keeps the members
    of its last line.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps circle name (str) to a list of int member ids.

    Raises
    ------
    ValueError
        If a member token is not an integer.
    """
    circles = {}
    with open(filename, 'r') as file:
        for line in file:
            circle_data = line.split()
            if not circle_data:
                continue  # blank line: previously an IndexError
            circle_name, *member_tokens = circle_data
            circles[circle_name] = [int(token) for token in member_tokens]
    return circles

# Usage
folder_path = './twitter'
all_circles = load_circles_from_folder(folder_path)

# Show each loaded circle together with its member ids
for circle_name in all_circles:
    members = all_circles[circle_name]
    print(f"Circle: {circle_name}, Members: {members}")

In [7]:
# Function to load all `.feat` files from a folder
def load_features_from_folder(folder_path):
    """Load every ``.feat`` file under ``folder_path`` into one DataFrame.

    The folder and all of its subfolders are searched. Each file is read as a
    space-separated table without a header row, and its first column is named
    ``user_id``. Files are read in sorted path order and stacked row-wise with
    a fresh integer index; columns missing from a file are filled with NaN by
    the concatenation.

    Parameters
    ----------
    folder_path : str
        Root folder to search.

    Returns
    -------
    pandas.DataFrame
        The combined table, or an empty frame with only a ``user_id`` column
        when no ``.feat`` file is found.
    """
    feat_files = sorted(
        glob.glob(os.path.join(folder_path, '**', '*.feat'), recursive=True)
    )
    if not feat_files:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=["user_id"])

    all_features = [
        pd.read_csv(feat_file, sep=' ', header=None).rename(columns={0: "user_id"})
        for feat_file in feat_files
    ]

    # Concatenate all the features into a single DataFrame
    combined_features = pd.concat(all_features, ignore_index=True)
    return combined_features

# Function to load all `.egofeat` files from a folder
def load_ego_features_from_folder(folder_path):
    """Load every ``.egofeat`` file under ``folder_path`` into one DataFrame.

    The folder and all of its subfolders are searched. Each file is read as a
    space-separated table without a header row. Files are read in sorted path
    order and stacked row-wise with a fresh integer index.

    Parameters
    ----------
    folder_path : str
        Root folder to search.

    Returns
    -------
    pandas.DataFrame
        The combined table, or an empty frame when no ``.egofeat`` file is
        found.
    """
    ego_feat_files = sorted(
        glob.glob(os.path.join(folder_path, '**', '*.egofeat'), recursive=True)
    )
    if not ego_feat_files:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame()

    all_ego_features = [
        pd.read_csv(ego_feat_file, sep=' ', header=None)
        for ego_feat_file in ego_feat_files
    ]

    # Concatenate all ego features into a single DataFrame
    combined_ego_features = pd.concat(all_ego_features, ignore_index=True)
    return combined_ego_features

# Function to load all `.featnames` files from a folder
def load_featnames_from_folder(folder_path):
    """Load every ``.featnames`` file under ``folder_path`` into one dictionary.

    The folder and all of its subfolders are searched, in sorted path order.
    Each non-blank line is split at its first space into an integer index and
    a name. Blank lines are skipped silently; lines without a space or with a
    non-integer index are reported on stdout and skipped.

    Indices are shared across files: when the same index occurs in several
    files, the name from the last file in sorted order is kept.

    Parameters
    ----------
    folder_path : str
        Root folder to search.

    Returns
    -------
    dict
        Maps int feature index to its name (str).
    """
    all_featnames = {}
    featnames_files = sorted(
        glob.glob(os.path.join(folder_path, '**', '*.featnames'), recursive=True)
    )

    for featnames_file in featnames_files:
        with open(featnames_file, 'r') as file:
            for line in file:
                stripped = line.strip()
                if not stripped:
                    continue  # blank line: previously failed tuple unpacking
                parts = stripped.split(' ', 1)
                try:
                    idx, name = parts
                    all_featnames[int(idx)] = name
                except ValueError:
                    # Covers both a missing name and a non-integer index.
                    print(f"Skipping malformed line in {featnames_file}: {stripped}")

    return all_featnames

# Usage
folder_path = './twitter'

# Per-user feature table built from the `.feat` files
all_features = load_features_from_folder(folder_path)
print("Features DataFrame:", all_features, sep="\n")

# Ego-user feature table built from the `.egofeat` files
all_ego_features = load_ego_features_from_folder(folder_path)
print("Ego Features DataFrame:", all_ego_features, sep="\n")

# Index -> name lookup built from the `.featnames` files
all_featnames = load_featnames_from_folder(folder_path)
print("Feature Names Dictionary:", all_featnames, sep="\n")

Features DataFrame:
          user_id  1    2    3    4    5    6    7    8    9  ...  2262  2263  \
0        51828900  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
1        18584875  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
2       217546128  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
3       234169190  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
4        39157827  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
...           ... ..  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   
133852   13691782  0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...   NaN   NaN   
133853   21436960  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
133854  104989762  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   
133855   15144542  0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...   NaN   NaN   
133856   37964895  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   NaN   NaN   

       

Preprocess Data

In [8]:
# Replace missing values (NaN) with 0 in both feature tables.
all_features = all_features.fillna(0)
all_ego_features = all_ego_features.fillna(0)
print(all_features)
# print(all_ego_features)


          user_id  1    2    3    4    5    6    7    8    9  ...  2262  2263  \
0        51828900  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
1        18584875  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2       217546128  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
3       234169190  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
4        39157827  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
...           ... ..  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   
133852   13691782  0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...   0.0   0.0   
133853   21436960  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
133854  104989762  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
133855   15144542  0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...   0.0   0.0   
133856   37964895  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

        2264  2265  2266  2

Feature normalization

In [39]:
# Generate the list of hashtags for each user and store them in a separate DataFrame
# hashtags_used = all_features.apply(
#     lambda row: [all_featnames.get(i) for i in row.index[1:] if row[i] == 1],
#     axis=1
# )

# # Create a new DataFrame with the user_id and hashtags_used columns
# hashtags_df = pd.DataFrame({
#     'user_id': all_features['user_id'],
#     'hashtags_used': hashtags_used
# })

# Optional: If you don't need the binary feature columns, set all_features to hashtags_df
# all_features = hashtags_df

# Print the resulting DataFrame with hashtags used


KeyboardInterrupt: 

In [12]:
# Read the stored per-user hashtag table and preview its first five rows.
hashtags_df = pd.read_csv("user_hashtags.csv")
print(hashtags_df.head(n=5))


     user_id                                      hashtags_used
0   51828900  ['#PAXEast', '#UMGClassic', '#runescape', '@9p...
1   18584875                                   ['@JCannon_nV:']
2  217546128  ['#Prototype2', '@BOMBHOR', '@Chipotle', '@Gea...
3  234169190  ['#Ireallydontcareaboutwhattheinternetthinks',...
4   39157827                          ['@DrChiz', '@FaZeFakie']


In [10]:
# Per-node degree and PageRank lookups, keyed by graph node.
degrees = {node: degree for node, degree in graph.degree()}
pagerank = nx.pagerank(graph)

In [11]:
# Attach the graph metrics to each feature row.
#
# `degrees` and `pagerank` are keyed by graph node, so the lookup has to go
# through the row's `user_id` rather than its positional index (0..N-1), which
# is what was mapped before.
# NOTE(review): this assumes the node ids in the .edges files are the same
# identifiers as the first column of the .feat files - confirm.
# Users that do not appear in the graph get NaN for both metrics.
#
# `assign` overwrites the three columns if they already exist, so re-running
# this cell does not append duplicate columns.
all_features = all_features.assign(
    node_id=all_features['user_id'],
    degree=all_features['user_id'].map(degrees),
    pagerank=all_features['user_id'].map(pagerank),
)

print(all_features)

          user_id  1    2    3    4    5    6    7    8    9  ...  2265  2266  \
0        51828900  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
1        18584875  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2       217546128  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
3       234169190  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
4        39157827  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
...           ... ..  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   
133852   13691782  0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...   0.0   0.0   
133853   21436960  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
133854  104989762  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
133855   15144542  0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...   0.0   0.0   
133856   37964895  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

        2267  2268  2269  2