<a href="https://colab.research.google.com/github/gracek904/twitter-io/blob/main/hashtag_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import ast
import re
import string
from collections import Counter
from itertools import combinations

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib_venn import venn2
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from plotly.subplots import make_subplots

In [None]:
# Request the NLTK data packages, suppressing the download progress output
for resource in ('punkt', 'stopwords', 'words'):
    nltk.download(resource, quiet=True)

In [None]:
# Read the two CSV exports into DataFrames (Russia first, then Iran)
russia_df, iran_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/russia.csv', '/content/iran.csv')
)

In [None]:
# Keep only the rows whose 'tweet_language' value is 'en' (English)
russia_is_english = russia_df['tweet_language'] == 'en'
russia_df = russia_df[russia_is_english]

iran_is_english = iran_df['tweet_language'] == 'en'
iran_df = iran_df[iran_is_english]

In [None]:
# Function to extract and clean hashtags from each dataset
def extract_hashtags(hashtags_column):
    """
    Flatten a column of per-tweet hashtag entries into one list of lowercase tags.

    Each entry is handled according to its type:

    * ``list`` -- used as-is.
    * ``str`` -- parsed as a Python list literal (e.g. ``"['a', 'b']"``); if it
      is not a list literal, ``#tag`` tokens are pulled out with a regex instead.
      Empty strings and the literal ``'[]'`` are skipped.
    * anything else (NaN, None, numbers, ...) -- skipped.

    Args:
        hashtags_column: Iterable of entries, typically a pandas Series.

    Returns:
        list[str]: Every hashtag found, lowercased, in encounter order and with
        duplicates kept (so it can be fed straight into ``Counter``). Empty or
        non-string items inside a list are dropped.
    """
    all_hashtags = []

    for entry in hashtags_column:
        # Dispatch on type first: calling pd.isna() on a list yields an array,
        # whose truth value is ambiguous and raises for multi-element lists.
        if isinstance(entry, list):
            hashtags = entry
        elif isinstance(entry, str):
            if entry in ('', '[]'):
                continue
            # SECURITY: the column comes from an external CSV, so it must never
            # be passed to eval(); literal_eval only accepts plain literals.
            try:
                parsed = ast.literal_eval(entry)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None

            if isinstance(parsed, list):
                hashtags = parsed
            else:
                # Not a list literal: fall back to scanning for '#tag' tokens
                hashtags = re.findall(r'#(\w+)', entry)
        else:
            # Missing values (NaN/None) and any other unsupported type
            continue

        # Lowercase so that counts are case-insensitive; skip items that are
        # empty or not strings (they have no .lower()).
        all_hashtags.extend(
            tag.lower() for tag in hashtags if isinstance(tag, str) and tag
        )

    return all_hashtags

In [None]:
# Run the hashtag extraction over the 'hashtags' column of each dataset
russia_hashtags, iran_hashtags = (
    extract_hashtags(frame['hashtags']) for frame in (russia_df, iran_df)
)

In [None]:
# Tally how often each hashtag occurs in each dataset
russia_hashtag_counts = Counter(russia_hashtags)
iran_hashtag_counts = Counter(iran_hashtags)

# Keep the 50 most frequent hashtags per dataset as (hashtag, count) pairs
top_russia_hashtags = russia_hashtag_counts.most_common(50)
top_iran_hashtags = iran_hashtag_counts.most_common(50)

# Print one ranked listing per dataset
for heading, ranking in (
    ("\nTop 50 Russia hashtags:", top_russia_hashtags),
    ("\nTop 50 Iran hashtags:", top_iran_hashtags),
):
    print(heading)
    for hashtag, count in ranking:
        print(f"#{hashtag}: {count}")

In [None]:
# Tall figure so that all 50 hashtag labels fit in each panel
plt.figure(figsize=(16, 30))

# Split the (hashtag, count) pairs into parallel label/value lists
russia_tags = [pair[0] for pair in top_russia_hashtags]
russia_counts = [pair[1] for pair in top_russia_hashtags]
iran_tags = [pair[0] for pair in top_iran_hashtags]
iran_counts = [pair[1] for pair in top_iran_hashtags]

# Left panel: Russia, right panel: Iran
panels = (
    (russia_tags, russia_counts, '#3498db', 'Top 50 Hashtags in Russian Dataset'),
    (iran_tags, iran_counts, '#e74c3c', 'Top 50 Hashtags in Iranian Dataset'),
)
for position, (tags, counts, bar_color, panel_title) in enumerate(panels, start=1):
    panel_ax = plt.subplot(1, 2, position)
    panel_ax.barh(tags, counts, color=bar_color)
    panel_ax.set_xlabel('Frequency', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)
    # Flip the y-axis so the highest frequency is drawn at the top
    panel_ax.invert_yaxis()
    # Slightly smaller font for the hashtag labels
    panel_ax.tick_params(axis='y', labelsize=9)

# Extra padding keeps the two panels from crowding each other
plt.tight_layout(pad=3.0)
plt.savefig('top_hashtags_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Interactive Plotly version of the top-hashtag comparison.
# The subplot titles and figure height are derived from the data so they always
# match the number of bars drawn (the top lists hold up to 50 entries, not 10).
russia_bar_count = len(top_russia_hashtags)
iran_bar_count = len(top_iran_hashtags)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        f"Top {russia_bar_count} Russia Hashtags",
        f"Top {iran_bar_count} Iran Hashtags",
    ),
    specs=[[{"type": "bar"}, {"type": "bar"}]],
)

# Russia hashtags (left panel)
fig.add_trace(
    go.Bar(
        x=[count for _, count in top_russia_hashtags],
        y=[tag for tag, _ in top_russia_hashtags],
        orientation='h',
        marker=dict(color='blue'),
        name='Russia'
    ),
    row=1, col=1
)

# Iran hashtags (right panel)
fig.add_trace(
    go.Bar(
        x=[count for _, count in top_iran_hashtags],
        y=[tag for tag, _ in top_iran_hashtags],
        orientation='h',
        marker=dict(color='red'),
        name='Iran'
    ),
    row=1, col=2
)

# Horizontal bars are stacked bottom-up in data order; reverse the category
# axes so the most frequent hashtag sits at the top, as in the static chart.
fig.update_yaxes(autorange="reversed")

fig.update_layout(
    title_text="Top Hashtags Comparison: Russia vs Iran Information Operations",
    # Roughly 20px per bar, never smaller than the original 500px
    height=max(500, 150 + 20 * max(russia_bar_count, iran_bar_count)),
    width=1000
)

fig.write_html("top_hashtags_comparison_interactive.html")
fig.show()

In [None]:
# Hashtags that appear among the 50 most frequent of both datasets
russia_set = {tag for tag, _ in russia_hashtag_counts.most_common(50)}
iran_set = {tag for tag, _ in iran_hashtag_counts.most_common(50)}
common_hashtags = russia_set & iran_set

print(f"\nNumber of common hashtags in top 50: {len(common_hashtags)}")
print("Common hashtags:", sorted(common_hashtags))

In [None]:
# Venn diagram showing how the two top-50 hashtag sets overlap
plt.figure(figsize=(10, 6))
venn2(
    subsets=[russia_set, iran_set],
    set_labels=('Russia (Top 50)', 'Iran (Top 50)'),
)
plt.title('Common Hashtags in Top 50 Most Frequent')
plt.savefig('common_hashtags_venn.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Grouped bar chart comparing per-dataset counts of the shared hashtags
if common_hashtags:
    # One (combined, russia, iran, tag) record per shared hashtag. Sorting the
    # tuples in descending order ranks by combined frequency first.
    combined_counts = sorted(
        (
            (
                russia_hashtag_counts[tag] + iran_hashtag_counts[tag],
                russia_hashtag_counts[tag],
                iran_hashtag_counts[tag],
                tag,
            )
            for tag in common_hashtags
        ),
        reverse=True,
    )

    # Keep at most the 15 highest-ranked shared hashtags
    top_combined = combined_counts[:15]

    # Unpack the records back into parallel lists for plotting
    common_tags = [record[3] for record in top_combined]
    russia_common_counts = [record[1] for record in top_combined]
    iran_common_counts = [record[2] for record in top_combined]

    # Bar positions: each hashtag gets a slot, with the two bars offset around it
    x = np.arange(len(common_tags))
    width = 0.35

    fig = plt.figure(figsize=(14, 8))
    ax = plt.gca()

    rects1 = ax.bar(x - width/2, russia_common_counts, width, label='Russia', color='#3498db')
    rects2 = ax.bar(x + width/2, iran_common_counts, width, label='Iran', color='#e74c3c')

    ax.set_ylabel('Frequency')
    ax.set_title('Comparison of Common Hashtags Between Russia and Iran Datasets')
    ax.set_xticks(x)
    ax.set_xticklabels(common_tags, rotation=45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.savefig('common_hashtags_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Create a normalized stacked bar chart to show relative usage.
# common_tags, russia_common_counts and iran_common_counts are only assigned
# when the two top-50 sets overlap, so the chart is skipped otherwise instead
# of failing with a NameError.
if common_hashtags:
    normalized_data = pd.DataFrame({
        'hashtag': common_tags,
        'Russia': russia_common_counts,
        'Iran': iran_common_counts
    })

    # Calculate total and each dataset's percentage share of that total
    normalized_data['Total'] = normalized_data['Russia'] + normalized_data['Iran']
    normalized_data['Russia_pct'] = normalized_data['Russia'] / normalized_data['Total'] * 100
    normalized_data['Iran_pct'] = normalized_data['Iran'] / normalized_data['Total'] * 100

    # Sort by total frequency
    normalized_data = normalized_data.sort_values('Total', ascending=False)

    # Plot the percentages; the Iran segment is stacked on top of the Russia one
    plt.figure(figsize=(14, 8))

    plt.bar(normalized_data['hashtag'], normalized_data['Russia_pct'], color='#3498db', label='Russia')
    plt.bar(normalized_data['hashtag'], normalized_data['Iran_pct'], bottom=normalized_data['Russia_pct'],
            color='#e74c3c', label='Iran')

    plt.xlabel('Hashtag')
    plt.ylabel('Percentage')
    plt.title('Relative Usage of Common Hashtags')
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.savefig('common_hashtags_relative_usage.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Create network graph of co-occurring hashtags (optional advanced analysis)
def extract_co_occurring_hashtags(df, hashtags_column, top_n=30):
    """Count pairs of hashtags that appear together in the same tweet.

    Entries are interpreted the same way as in ``extract_hashtags``: an actual
    list is used as-is, a string is parsed as a Python list literal and falls
    back to a ``#tag`` regex scan, and anything else (NaN, None, ...) is skipped.
    Tags are lowercased and de-duplicated per tweet before pairing, so a pair is
    counted at most once per tweet.

    Args:
        df: Table indexable by column name (e.g. a pandas DataFrame).
        hashtags_column: Name of the column holding the hashtag entries.
        top_n: Maximum number of pairs to return.

    Returns:
        list[tuple[tuple[str, str], int]]: ``((tag_a, tag_b), count)`` items in
        descending count order, with each pair sorted alphabetically.
    """
    co_occurrences = Counter()

    for entry in df[hashtags_column]:
        # Dispatch on type first: pd.isna() on a list yields an array whose
        # truth value is ambiguous.
        if isinstance(entry, list):
            raw_tags = entry
        elif isinstance(entry, str):
            if entry in ('', '[]'):
                continue
            # SECURITY: the column comes from an external CSV, so it must never
            # be passed to eval(); literal_eval only accepts plain literals.
            try:
                parsed = ast.literal_eval(entry)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None

            if isinstance(parsed, list):
                raw_tags = parsed
            else:
                raw_tags = re.findall(r'#(\w+)', entry)
        else:
            continue

        # Unique lowercase tags of this tweet; skip empty or non-string items
        hashtags = {tag.lower() for tag in raw_tags if isinstance(tag, str) and tag}

        # combinations() over the sorted tags yields each unordered pair once,
        # already in alphabetical order; tweets with < 2 tags yield nothing.
        co_occurrences.update(combinations(sorted(hashtags), 2))

    # Get top co-occurring pairs
    top_pairs = sorted(co_occurrences.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return top_pairs

# Optional advanced analysis: the network graphs below require the additional networkx library

import networkx as nx

# Top co-occurring hashtag pairs for each dataset
russia_co_occur, iran_co_occur = (
    extract_co_occurring_hashtags(frame, 'hashtags')
    for frame in (russia_df, iran_df)
)

# Create network graphs
def create_network_graph(co_occurrences, title):
    """Draw, save and show a weighted hashtag co-occurrence network.

    Args:
        co_occurrences: Iterable of ``((hashtag_a, hashtag_b), weight)`` items;
            every pair becomes an edge carrying that weight.
        title: Plot title. It is also lowercased, with spaces turned into
            underscores, to form the name of the PNG file that is written.
    """
    graph = nx.Graph()

    # One weighted edge per hashtag pair
    for (tag_a, tag_b), pair_weight in co_occurrences:
        graph.add_edge(tag_a, tag_b, weight=pair_weight)

    # Node size is proportional to degree centrality
    centrality = nx.degree_centrality(graph)
    node_sizes = [5000 * centrality[node] for node in graph.nodes()]

    # Edge thickness is proportional to the co-occurrence weight
    edge_widths = [edge_weight / 5 for _, _, edge_weight in graph.edges(data='weight')]

    # Draw graph
    plt.figure(figsize=(12, 10))
    layout = nx.spring_layout(graph, k=0.3, iterations=50)
    nx.draw_networkx_nodes(graph, layout, node_size=node_sizes, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(graph, layout, width=edge_widths, alpha=0.5, edge_color='gray')
    nx.draw_networkx_labels(graph, layout, font_size=10)

    plt.title(title)
    plt.axis('off')
    plt.tight_layout()

    file_stem = title.replace(' ', '_').lower()
    plt.savefig(f"{file_stem}.png", dpi=300, bbox_inches='tight')
    plt.show()

# Render one co-occurrence network per dataset
for pair_counts, graph_title in (
    (russia_co_occur, "Russia Hashtag Co-occurrence Network"),
    (iran_co_occur, "Iran Hashtag Co-occurrence Network"),
):
    create_network_graph(pair_counts, graph_title)