Scrape an online social media site for data. Use Python to scrape information
from Twitter. Exploratory data analysis and visualization of social media data.

In [None]:
from googleapiclient.discovery import build
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set up API key and channel IDs
import os
from getpass import getpass

# Never store the key in the notebook: read it from the environment and fall
# back to an interactive prompt. Any key that was previously committed in this
# cell should be treated as leaked and rotated in the Google Cloud console.
api_key = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")
channel_ids = [
    'UCMiJRAwDNSNzuYeN2uWa0pA',  # mrwhoistheboss
    'UCdPui8EYr_sX6q1xNXCRPXg',  # storror
    'UCj22tfcQrWG7EMEKS0qLeEg'   # Carryminati
]

# Initialize YouTube API client
youtube = build('youtube', 'v3', developerKey=api_key)

# Function to get channel statistics
def get_channel_stats(youtube, channel_ids):
    """Fetch title and headline statistics for a set of YouTube channels.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_ids: Iterable of channel ID strings.

    Returns:
        A list with one dict per channel found, holding the keys
        ``channel_name``, ``Subscribers``, ``views`` and ``Total_videos``.
        Values are passed through exactly as the API returned them; a
        statistic the response does not contain is reported as ``None``.
        The list is empty when no IDs are given or no channel matches.
    """
    channel_ids = list(channel_ids)
    if not channel_ids:
        # Nothing to look up; avoid sending a request with an empty id filter.
        return []

    request = youtube.channels().list(
        part='snippet,contentDetails,statistics',
        id=','.join(channel_ids)
    )
    response = request.execute()

    all_data = []
    # 'items' may be missing from the response when nothing matched.
    for item in response.get('items', []):
        stats = item.get('statistics', {})
        all_data.append(dict(
            channel_name=item['snippet']['title'],
            # Individual counters (e.g. subscriberCount) can be absent.
            Subscribers=stats.get('subscriberCount'),
            views=stats.get('viewCount'),
            Total_videos=stats.get('videoCount'),
        ))

    return all_data

# Calling the function and creating DataFrame
channel_statistics = get_channel_stats(youtube, channel_ids)
channel_data = pd.DataFrame(channel_statistics)

# Convert data types (the statistics are fetched as strings)
metric_columns = ['Subscribers', 'views', 'Total_videos']
channel_data[metric_columns] = channel_data[metric_columns].apply(pd.to_numeric)

# Plotting: one subplot per metric. Drawing all three bar plots on the same
# axes makes them overwrite each other, so only the last would be readable.
fig, axes = plt.subplots(1, len(metric_columns), figsize=(18, 5))
for ax, metric in zip(axes, metric_columns):
    sns.barplot(x='channel_name', y=metric, data=channel_data, ax=ax)
    ax.set_title(f'{metric} by channel')
    ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


Develop Content (text, emoticons, image, audio, video) based social media
analytics model for business. (e.g., Content Based Analysis: Topic, Issue, Trend,
sentiment/opinion analysis, audio, video, image analytics)

In [None]:
# Uninstall conflicting versions
!pip uninstall -y numpy scipy gensim textblob

# Reinstall compatible versions
!pip install numpy==1.26.4 scipy==1.13.1 gensim==4.3.3 textblob


In [None]:
# Smoke test: importing each package raises immediately if one of them is
# missing or cannot be loaded.
import numpy
import scipy
import gensim
import textblob


# Only reached when every import above succeeded.
print("All imports working correctly!")


All imports working correctly!


In [None]:
import nltk

# Fetch every NLTK resource the later cells rely on, in a fixed order.
nltk_resources = (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',                     # needed for the lemmatizer
    'averaged_perceptron_tagger',  # sometimes needed for TextBlob
    'brown',                       # optional, used by TextBlob in some functions
)
for resource_name in nltk_resources:
    nltk.download(resource_name)

# Kept as the cell's last expression so its return value is still displayed;
# presumably added to resolve a missing-tokenizer-resource error.
nltk.download('punkt_tab')


In [None]:
import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from textblob import TextBlob

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load your data
path = "/content/drive/MyDrive/Datasets/google.csv"
df = pd.read_csv(path)

# Extract non-empty reviews.
# dropna() only removes missing cells, so additionally:
#   * cast to str, because a non-string cell would crash text.lower() / TextBlob
#   * skip whitespace-only entries, which carry no content to analyse
reviews = [
    text for text in df['Reviews'].dropna().astype(str)
    if text.strip()
]

# Display first few reviews
print("Sample Reviews:\n", reviews[:5])

# Preprocessing function
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Build the English stopword set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess(text):
    """Turn one review into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are English stopwords are dropped; the remaining
    tokens are lemmatized with WordNet.

    Args:
        text: The review text (a string).

    Returns:
        List of lemmatized token strings, in their original order.
    """
    # The stopword list and lemmatizer do not depend on the input, so they are
    # created on first use only instead of once per review.
    stop_words, lemmatizer = _preprocess_resources()
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

# Topic modeling
def topic_modeling(reviews, num_topics=3, num_words=3, passes=10, random_state=42):
    """Fit an LDA topic model on the reviews and return its topics.

    Args:
        reviews: Iterable of review strings.
        num_topics: Number of latent topics to fit.
        num_words: Number of top words reported per topic.
        passes: Number of training passes over the corpus.
        random_state: Seed handed to the LDA model so repeated runs give the
            same topics.

    Returns:
        The result of ``LdaModel.print_topics(num_words=num_words)``, or an
        empty list when preprocessing leaves no tokens at all.
    """
    processed_reviews = [preprocess(review) for review in reviews]
    # With no tokens the dictionary is empty and LDA cannot be trained.
    if not any(processed_reviews):
        return []

    dictionary = corpora.Dictionary(processed_reviews)
    corpus = [dictionary.doc2bow(tokens) for tokens in processed_reviews]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    return lda_model.print_topics(num_words=num_words)

# Sentiment analysis
def sentiment_analysis(text):
    """Classify text by the sign of its TextBlob polarity score.

    Returns:
        "Positive" for a score above zero, "Negative" for a score below zero,
        and "Neutral" otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Analyze reviews
def analyze_reviews(reviews, verbose=True):
    """Run sentiment analysis on every review and topic modeling on all of them.

    Args:
        reviews: Iterable of review strings.
        verbose: When True (the default), print each review with its
            sentiment as it is processed.

    Returns:
        Tuple ``(df_result, topics)``: ``df_result`` is a DataFrame with the
        columns 'Review' and 'Sentiment', indexed from 1; ``topics`` is
        whatever ``topic_modeling`` returns for the same reviews.
    """
    # Materialise once: the reviews are consumed here and again by the
    # topic model.
    reviews = list(reviews)

    # Collect rows in a plain list and build the frame once; assigning rows
    # one at a time through .loc re-allocates the frame on every insert.
    records = []
    for idx, review in enumerate(reviews, start=1):
        sentiment = sentiment_analysis(review)
        if verbose:
            print(f"\nReview {idx}: {review}")
            print(f"Sentiment: {sentiment}")
        records.append({'Review': review, 'Sentiment': sentiment})

    df_result = pd.DataFrame(
        records,
        columns=['Review', 'Sentiment'],
        index=pd.RangeIndex(1, len(records) + 1),
    )
    topics = topic_modeling(reviews)
    return df_result, topics

# Main execution
if __name__ == "__main__":
    review_df, topics = analyze_reviews(reviews)

    # Per-review sentiment table
    print("\nSentiment Analysis Summary:")
    print(review_df)

    # LDA topics, numbered from 1
    print("\nTopics Identified by LDA:")
    for topic_number, topic in enumerate(topics, start=1):
        print(f"Topic {topic_number}: {topic}")


Develop a structure-based social media analytics model for any business
(e.g., structure-based models: community detection, influence analysis).

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import community
import community as community_louvain

# Read the edge list; each row links an "id" to a "friend_id"
path = "/content/drive/MyDrive/Datasets/fake_social_data_100.csv"
df = pd.read_csv(path)

# Preview the first rows of the edge list
print(df.head())

# Build the graph: source column "id", target column "friend_id"
fb_graph = nx.from_pandas_edgelist(df, "id", "friend_id")

# Show every node, then every edge
print(fb_graph.nodes())
print(fb_graph.edges())

# Insert one extra edge by hand and list the nodes again
fb_graph.add_edge(123, 2154)
print(fb_graph.nodes())
# Girvan-Newman community detection: take the first two results produced by
# the generator
comp = community.girvan_newman(fb_graph)
first_level_communities = next(comp)
second_level_communities = next(comp)

# Sort members inside each community, then sort the communities themselves
first_community_list = sorted(sorted(group) for group in first_level_communities)
second_community_list = sorted(sorted(group) for group in second_level_communities)

print(f"First level communities: {first_community_list}")
print(f"Second level communities: {second_community_list}")
# Community detection using Louvain method.
# Both the Louvain partition and the spring layout are randomised; fixing the
# seeds makes the communities and the figure reproducible across runs.
partition = community_louvain.best_partition(fb_graph, random_state=42)

# Plot the communities detected by the Louvain method.
# `pos` is reused by the later plots so all figures share one layout.
pos = nx.spring_layout(fb_graph, seed=42)
cmap = plt.get_cmap('viridis')
colors = [partition[node] for node in fb_graph.nodes()]
plt.figure(figsize=(12, 12))
nx.draw(fb_graph, pos, node_color=colors, with_labels=True, cmap=cmap, node_size=50,
        font_size=8)
plt.title("Louvain communities")
plt.show()
# Degree centrality, listed from highest to lowest
degree_centrality = nx.degree_centrality(fb_graph)
sorted_degree_centrality = sorted(
    degree_centrality.items(), key=lambda pair: pair[1], reverse=True
)
print("Degree Centrality:", sorted_degree_centrality)

# Betweenness centrality (normalized, endpoints included), highest first
betCent = nx.betweenness_centrality(fb_graph, normalized=True, endpoints=True)
sorted_betCent = sorted(betCent.items(), key=lambda pair: pair[1], reverse=True)
print("Betweenness Centrality:", sorted_betCent)

# Draw the graph: colour follows node degree, size follows betweenness
node_color = [20000.0 * fb_graph.degree(node) for node in fb_graph]
node_size = [score * 10000 for score in betCent.values()]
plt.figure(figsize=(20, 20))
nx.draw_networkx(
    fb_graph,
    pos=pos,
    with_labels=False,
    node_color=node_color,
    node_size=node_size,
)
plt.axis("off")
plt.show()

# Closeness centrality; only the eight highest-ranked nodes are printed
closeness_centrality = nx.closeness_centrality(fb_graph)
sorted_closeness_centrality = sorted(
    closeness_centrality.items(), key=lambda pair: pair[1], reverse=True
)
print("Closeness Centrality:", sorted_closeness_centrality[:8])

# Draw the graph with node size following closeness
node_size = [score * 50 for score in closeness_centrality.values()]

plt.figure(figsize=(15, 8))
nx.draw_networkx(
    fb_graph,
    pos=pos,
    node_size=node_size,
    with_labels=False,
    width=0.15,
)
plt.axis("off")
plt.show()
# Report whether the graph contains any bridge at all
print(nx.has_bridges(fb_graph))

# Count the bridges
bridges = [*nx.bridges(fb_graph)]
print("Number of bridges:", len(bridges))

# Count the local bridges (span values are not requested)
local_bridges = [*nx.local_bridges(fb_graph, with_span=False)]
print("Number of local bridges:", len(local_bridges))

# Draw the whole graph thinly, then overlay the local bridges in green
plt.figure(figsize=(15, 5))
nx.draw_networkx(fb_graph, pos=pos, node_size=10, with_labels=False, width=0.15)
nx.draw_networkx_edges(
    fb_graph,
    pos,
    edgelist=local_bridges,
    width=0.5,
    edge_color="green",
)
plt.axis("off")
plt.show()

# Average clustering coefficient of the graph
print("Average clustering coefficient:", nx.average_clustering(fb_graph))

Use Graph Neural Networks on the datasets (Planetoid Cora Dataset)/ Jazz
Musicians Network

In [None]:
!pip install torch
!pip install torch-geometric
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
dataset = Planetoid(root='./', name='Cora', transform=T.NormalizeFeatures())

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Assuming dataset is already loaded
# Take the first graph object of the dataset; the training and evaluation
# code in this cell reads its x, edge_index, y and train/val/test masks.
data = dataset[0]

# Define the GCN model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing per-node log-probabilities.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of the hidden layer.
        out_channels: Number of output classes.
        dropout: Probability of zeroing a hidden activation during training.
            Defaults to 0.5, the rate ``F.dropout`` uses when none is given,
            so existing callers behave as before.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return log-softmax class scores (over dim 1) for every node.

        Args:
            x: Node feature tensor passed to the first convolution.
            edge_index: Graph connectivity passed to both convolutions.
        """
        x = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize the model and optimizer.
# Seed PyTorch first so weight initialisation (and dropout during training)
# is repeatable from one run to the next.
torch.manual_seed(42)

# 16 hidden channels; input/output sizes come from the dataset.
model = GCN(dataset.num_features, 16, dataset.num_classes)
print(model)

# Adam with learning rate 0.01 and L2 weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print(optimizer)

# Training function
def train():
    """Run one optimisation step on the training nodes and return the loss.

    Uses the module-level ``model``, ``optimizer`` and ``data``. The loss is
    the negative log-likelihood over the nodes selected by ``data.train_mask``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    train_nodes = data.train_mask
    log_probs = model(data.x, data.edge_index)
    step_loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Test function
def test():
    """Evaluate the module-level ``model`` on the train, validation and test masks.

    Returns:
        A list ``[train_acc, val_acc, test_acc]`` where each entry is the
        fraction of nodes under that mask whose predicted class equals
        ``data.y``.
    """
    model.eval()
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time without changing the predictions.
    with torch.no_grad():
        pred = model(data.x, data.edge_index).argmax(dim=1)

    accs = []
    for mask in (data.train_mask, data.val_mask, data.test_mask):
        correct = (pred[mask] == data.y[mask]).sum().item()
        accs.append(correct / mask.sum().item())
    return accs

# Training loop
for epoch in range(200):
    loss = train()
    # Accuracies are only reported every 10th epoch, so only evaluate then;
    # evaluation does not update the model, so skipping it leaves training
    # unchanged.
    if epoch % 10 == 0:
        train_acc, val_acc, test_acc = test()
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, '
              f'Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')

# Final test accuracy: reuse test() instead of repeating its logic inline.
_, _, acc = test()
print(f'Test Accuracy: {acc:.4f}')


Pr 10) gephi code


In [None]:
import pandas as pd
import networkx as nx

# Load the CSV data (adjust path if needed)
df = pd.read_csv('/content/drive/MyDrive/Datasets/twitter_user_interactions.csv')

# Initialize a directed graph
G = nx.DiGraph()

# Add nodes with user data.
# Columns are read with .tolist() (plain Python scalars) instead of
# iterrows(), which builds a Series per row and may change value dtypes.
node_attributes = ['tweets', 'retweets', 'likes', 'mentions', 'followers']
users = df['user'].tolist()
attribute_columns = [df[name].tolist() for name in node_attributes]
for user, *values in zip(users, *attribute_columns):
    G.add_node(user, **dict(zip(node_attributes, values)))

# Create dummy edges: each user mentions the next user in the list.
# Pairing by position avoids df.loc[i, ...], which only works when the frame
# has a default 0..n-1 index. zip stops at the shortest input, so the last
# user gets no outgoing edge.
mention_counts = df['mentions'].tolist()
for source, target, weight in zip(users, users[1:], mention_counts):
    G.add_edge(source, target, weight=weight)

# ✅ Save the graph to a GEXF file in Colab
output_path = '/content/twitter_network.gexf'
nx.write_gexf(G, output_path)

print(f"Graph successfully saved to: {output_path}")

# The browser download helper only exists inside Google Colab; skip it
# elsewhere instead of failing after the file has already been written.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download.")
else:
    files.download(output_path)

Graph successfully saved to: /content/twitter_network.gexf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>