In [None]:
!pip install pandas textblob sentence-transformers

In [1]:
import json
import pandas as pd

# Load the dataset
with open('reddit_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# Convert to a pandas DataFrame for easier manipulation
posts = pd.DataFrame(data)
print(posts.head())  # Inspect the first few rows

  subreddit         keyword  \
0      eggs  farm made eggs   
1      eggs  farm made eggs   
2      eggs  farm made eggs   
3      eggs  farm made eggs   
4      eggs  farm made eggs   

                                               title  \
0       How do make your inventory on your egg farm?   
1  Eggs are consistently greyish for the last 2 o...   
2                            What is this in my egg?   
3  [Homemade] Happy Farms eggs on garlic bread ma...   
4                  My children’s new favorite thing…   

                                             content  \
0  hi how's your egg business?? I have an Excel f...   
1  Hello, i live in Montana US. We have never had...   
2  Ive never bought this brand before but they se...   
3                                                      
4  Recently blew my children’s minds making eggie...   

                                      url  post_id  \
0    https://i.redd.it/qowddai1m7ke1.jpeg  1itofr6   
1    https://i.redd.it/wecifuuy

In [2]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

# Apply cleaning to titles and comments
posts['title'] = posts['title'].apply(clean_text)
posts['content'] = posts['content'].apply(clean_text)

# Clean comment text
for i, post in posts.iterrows():
    for comment in post['comments']:
        comment['text'] = clean_text(comment['text'])

# Check if the output is correct
print(posts.head())

  subreddit         keyword  \
0      eggs  farm made eggs   
1      eggs  farm made eggs   
2      eggs  farm made eggs   
3      eggs  farm made eggs   
4      eggs  farm made eggs   

                                               title  \
0        how do make your inventory on your egg farm   
1  eggs are consistently greyish for the last 2 o...   
2                             what is this in my egg   
3  homemade happy farms eggs on garlic bread made...   
4                    my childrens new favorite thing   

                                             content  \
0  hi hows your egg business i have an excel form...   
1  hello i live in montana us we have never had a...   
2  ive never bought this brand before but they se...   
3                                                      
4  recently blew my childrens minds making eggies...   

                                      url  post_id  \
0    https://i.redd.it/qowddai1m7ke1.jpeg  1itofr6   
1    https://i.redd.it/wecifuuy

In [3]:
from textblob import TextBlob

def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Returns a score between -1 (negative) and 1 (positive)

# Calculate sentiment for post titles and content
posts['title_sentiment'] = posts['title'].apply(get_sentiment)
posts['content_sentiment'] = posts['content'].apply(get_sentiment)

# Calculate sentiment for comments
for i, post in posts.iterrows():
    for comment in post['comments']:
        comment['sentiment'] = get_sentiment(comment['text'])

# Check if the sentiment score is added correctly
print(posts[['title', 'title_sentiment', 'content', 'content_sentiment']].head())
for i, post in posts.iterrows():
    for comment in post['comments']:
        print(comment['text'], comment['sentiment'])

                                               title  title_sentiment  \
0        how do make your inventory on your egg farm         0.000000   
1  eggs are consistently greyish for the last 2 o...         0.125000   
2                             what is this in my egg         0.000000   
3  homemade happy farms eggs on garlic bread made...         0.800000   
4                    my childrens new favorite thing         0.318182   

                                             content  content_sentiment  
0  hi hows your egg business i have an excel form...          -0.122500  
1  hello i live in montana us we have never had a...          -0.028288  
2  ive never bought this brand before but they se...           0.322222  
3                                                              0.000000  
4  recently blew my childrens minds making eggies...           0.050000  
all eggs no matter where you get them from simple math is you are the common denominator  whether it be your pan oil 

In [4]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')


def get_semantic_score(text1, text2):
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)
    return util.pytorch_cos_sim(embedding1, embedding2).item()

# Calculate semantic scores for each comment relative to the post title
for i, post in posts.iterrows():
    post_title = post['title']
    for comment in post['comments']:
        comment_text = comment['text']
        comment['semantic_score'] = get_semantic_score(post_title, comment_text)

# Check if the semantic score is added correctly
for i, post in posts.iterrows():
    for comment in post['comments']:
        print(comment['text'], comment['semantic_score'])

  from tqdm.autonotebook import tqdm, trange


KeyboardInterrupt: 

In [5]:
# Define the engagement score calculation function
def calculate_engagement_score(post):
    # Check if the required fields exist
    if 'num_comments' not in post or 'upvote_ratio' not in post:
        raise ValueError("Post data is missing required fields.")

    # Normalize num_comments
    max_comments = posts['num_comments'].max()
    normalized_comments = post['num_comments'] / max_comments if max_comments else 0

    # Average sentiment of comments
    comment_sentiments = [comment.get('sentiment', 0) for comment in post.get('comments', [])]
    avg_comment_sentiment = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0

    # Average semantic score of comments
    comment_semantic_scores = [comment.get('semantic_score', 0) for comment in post.get('comments', [])]
    avg_semantic_score = sum(comment_semantic_scores) / len(comment_semantic_scores) if comment_semantic_scores else 0

    # Assign weights (adjust based on importance)
    w1, w2, w3, w4 = 0.4, 0.3, 0.2, 0.1

    engagement_score = (
        w1 * post.get('upvote_ratio', 0) +
        w2 * normalized_comments +
        w3 * avg_comment_sentiment +
        w4 * avg_semantic_score
    )

    # Check the output
    if not isinstance(engagement_score, (int, float)):
        raise TypeError("Engagement score calculation did not return a numeric value.")

    return engagement_score

# Ensure comments are in a list format
if 'comments' in posts.columns:
    posts['comments'] = posts['comments'].apply(lambda x: x if isinstance(x, list) else [])

# Calculate engagement scores for each post
posts['engagement_score'] = posts.apply(calculate_engagement_score, axis=1)

# Add engagement score to the rest of the data in the dataframe
posts['rst'] = posts['engagement_score']

# Sort posts by engagement score in descending order and print the top 100
top_100_posts = posts.nlargest(100, 'engagement_score')
print(top_100_posts[['title', 'engagement_score']])

                                                 title  engagement_score
335        whats your go to cheese for an egg sandwich          0.780000
336  got these from a local the eggs have some crud...          0.636689
332                         i hate peeling boiled eggs          0.630859
329  been on the single ingredient diet for 2 month...          0.584649
339  am i wrong for sending the plate back because ...          0.570811
..                                                 ...               ...
355  somewhere under the egg is ham hash brown pick...          0.414682
395               tried poached eggs on grilled cheese          0.414551
351  im an egg eater you can see here ive put some ...          0.414402
407          second attempt at making soft boiled eggs          0.414402
381                      just a beautifully cooked egg          0.413398

[100 rows x 2 columns]


In [6]:
# Save the processed data to a new JSON file
posts.to_json('processed_reddit_dataset.json', orient='records', indent=4)

facebook data preprocessing


In [4]:
# Install required packages
!pip install nltk emoji pandas numpy

# Import and download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords') 
nltk.download('vader_lexicon')



Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/590.6 kB ? eta -:--:--
   ---------------------------------------- 590.6/590.6 kB 2.6 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.1



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


True

In [8]:
# Facebook Data Preprocessing

# Text cleaning and preprocessing
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import emoji

# Load the Facebook data with explicit UTF-8 encoding
with open('dataset_facebook-posts-scraper_2025-02-25_06-25-33-542.json', 'r', encoding='utf-8') as f:
    fb_data = pd.read_json(f)

# Text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    
    # Remove special characters but keep hashtags
    text = re.sub(r'[^\w\s#]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Extract hashtags function
def extract_hashtags(text):
    hashtags = re.findall(r'#(\w+)', text)
    return [tag.lower() for tag in hashtags]

# Process Facebook posts
fb_data['clean_text'] = fb_data['text'].apply(clean_text)
fb_data['text_length'] = fb_data['clean_text'].str.len()
fb_data['word_count'] = fb_data['clean_text'].apply(lambda x: len(str(x).split()))
fb_data['hashtags'] = fb_data['text'].apply(extract_hashtags)

# Sentiment analysis
sia = SentimentIntensityAnalyzer()
fb_data['sentiment_scores'] = fb_data['clean_text'].apply(lambda x: sia.polarity_scores(str(x)))
fb_data['sentiment'] = fb_data['sentiment_scores'].apply(lambda x: x['compound'])

# Calculate engagement metrics
fb_data['total_engagement'] = fb_data['shares'] + fb_data['likes']

# Engagement categorization
def get_engagement_category(row):
    total = row['total_engagement']
    if total < 10:
        return 'low'
    elif total < 50:
        return 'medium' 
    else:
        return 'high'

fb_data['engagement_category'] = fb_data.apply(get_engagement_category, axis=1)

# Save processed data
fb_data.to_json('processed_facebook_dataset.json', orient='records', indent=4, force_ascii=False)

print("Facebook data preprocessing complete!")


Facebook data preprocessing complete!


Instagram data processing 


In [10]:
# Load Instagram data
with open('dataset_instagram-scraper-sparrow_2025-02-25_05-48-10-463.json', 'r', encoding='utf-8') as f:
    ig_data = pd.read_json(f)

# Clean caption text
ig_data['clean_caption'] = ig_data['caption'].apply(clean_text)

# Process comments
def process_comments(comments):
    if not comments or not isinstance(comments, list):
        return []
    
    processed = []
    for comment in comments:
        if isinstance(comment, dict) and 'text' in comment:
            clean = clean_text(comment['text'])
            sentiment = sia.polarity_scores(clean)['compound']
            processed.append({
                'clean_text': clean,
                'sentiment': sentiment,
                'likes': comment.get('likesCount', 0),
                'timestamp': comment.get('timestamp')
            })
    return processed

ig_data['processed_comments'] = ig_data['latestComments'].apply(process_comments)

# Calculate comment sentiment stats
def get_comment_stats(comments):
    if not comments:
        return {
            'avg_sentiment': 0,
            'positive_count': 0,
            'negative_count': 0,
            'neutral_count': 0,
            'avg_likes': 0
        }
        
    sentiments = [c['sentiment'] for c in comments]
    likes = [c['likes'] for c in comments]
    
    return {
        'avg_sentiment': np.mean(sentiments),
        'positive_count': sum(1 for s in sentiments if s > 0.05),
        'negative_count': sum(1 for s in sentiments if s < -0.05),
        'neutral_count': sum(1 for s in sentiments if -0.05 <= s <= 0.05),
        'avg_likes': np.mean(likes)
    }

ig_data['comment_stats'] = ig_data['processed_comments'].apply(get_comment_stats)

# Extract hashtag frequencies
def get_hashtag_stats(hashtags):
    if not hashtags or not isinstance(hashtags, list):
        return {'count': 0, 'unique': 0}
    return {
        'count': len(hashtags),
        'unique': len(set(hashtags))
    }

ig_data['hashtag_stats'] = ig_data['hashtags'].apply(get_hashtag_stats)

# Calculate engagement metrics
ig_data['engagement_score'] = ig_data.apply(
    lambda x: (x['likesCount'] + x['commentsCount']*2)/100, axis=1
)

ig_data['engagement_category'] = ig_data['engagement_score'].apply(
    lambda x: 'high' if x > 5 else ('medium' if x > 1 else 'low')
)

# Perform sentiment analysis on captions
ig_data['caption_sentiment'] = ig_data['clean_caption'].apply(
    lambda x: sia.polarity_scores(str(x))['compound']
)

# Save processed data
processed_data = {
    'posts': ig_data.to_dict('records'),
    'metadata': {
        'total_posts': len(ig_data),
        'avg_likes': ig_data['likesCount'].mean(),
        'avg_comments': ig_data['commentsCount'].mean(),
        'total_hashtags': ig_data['hashtags'].apply(len).sum()
    }
}

with open('processed_instagram_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, indent=4, ensure_ascii=False)

print("Instagram data preprocessing complete!")


NameError: name 'json' is not defined