<a href="https://colab.research.google.com/github/fyas101/AI-Journal-INST326/blob/main/Reddit_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def categorize_by_tone(post_text):
    """Classify the tone of a post with simple keyword heuristics.

    Rules are evaluated in priority order and the first match wins:
    sarcastic, angry, humorous, uncertain, neutral, informative.

    Cues that begin with a letter or digit only match at the start of a
    word, so ``'sure'`` does not fire inside "ensure" and ``'hate'`` does
    not fire inside "whatever".  Uncertainty phrases are masked before the
    sarcasm check; otherwise the sarcasm cue ``'sure'`` would always shadow
    the uncertainty cue ``'not sure'``.

    Args:
        post_text: The raw text of the post.

    Returns:
        One of ``'sarcastic'``, ``'angry'``, ``'humorous'``, ``'uncertain'``,
        ``'neutral'`` (text contains a question mark) or ``'informative'``.

    Raises:
        TypeError: If ``post_text`` is not a string.
        ValueError: If ``post_text`` is empty or only whitespace.
    """
    import re  # local import so the function does not depend on cell order

    if not isinstance(post_text, str):
        raise TypeError("post_text must be a string")

    clean_text = post_text.strip()
    if not clean_text:
        raise ValueError("post_text cannot be empty")

    lower_text = clean_text.lower()

    # Basic heuristic keywords and cues
    anger_cues = ['!', 'angry', 'hate', 'worst', 'terrible']
    sarcasm_cues = ['yeah right', 'sure', 'totally']
    humor_cues = ['lol', 'funny', 'haha']
    uncertainty_cues = ['maybe', 'not sure', 'idk', 'perhaps']

    def has_cue(text, cues):
        """Return True if any cue occurs in text (word cues at a word start)."""
        for cue in cues:
            pattern = re.escape(cue)
            # Punctuation cues such as '!' may follow a letter directly, so
            # the word-start restriction only applies to alphanumeric cues.
            if cue[0].isalnum():
                pattern = r"(?<!\w)" + pattern
            if re.search(pattern, text):
                return True
        return False

    # Hide uncertainty phrases from the sarcasm check ("not sure" != "sure").
    sarcasm_text = lower_text
    for cue in uncertainty_cues:
        sarcasm_text = sarcasm_text.replace(cue, " ")

    # Rule-based detection
    if has_cue(sarcasm_text, sarcasm_cues):
        return 'sarcastic'
    # Anger needs both an anger cue and all-caps "shouting".
    if has_cue(lower_text, anger_cues) and clean_text.isupper():
        return 'angry'
    if has_cue(lower_text, humor_cues):
        return 'humorous'
    if has_cue(lower_text, uncertainty_cues):
        return 'uncertain'
    if '?' in clean_text:
        return 'neutral'
    return 'informative'


In [None]:
def total_metadata_type(posts_metadata):
    """Count how often each value occurs for every metadata key.

    Args:
        posts_metadata: Non-empty list of dictionaries, one per post.

    Returns:
        A dictionary mapping each key seen in any post to a dictionary of
        ``str(value) -> occurrence count``.

    Raises:
        TypeError: If ``posts_metadata`` is not a list or holds a non-dict.
        ValueError: If ``posts_metadata`` is empty.
    """
    if not isinstance(posts_metadata, list):
        raise TypeError("posts_metadata must be a list of dictionaries")

    if not posts_metadata:
        raise ValueError("posts_metadata cannot be empty")

    for entry in posts_metadata:
        if not isinstance(entry, dict):
            raise TypeError("Each element in posts_metadata must be a dictionary")

    tallies = {}
    for entry in posts_metadata:
        for field, raw_value in entry.items():
            per_field = tallies.setdefault(field, {})
            # Values are stringified so unhashable values can be counted too.
            label = str(raw_value)
            per_field[label] = per_field.get(label, 0) + 1

    return tallies

In [None]:
def track_users(posts_data):
    """Build a per-user activity summary from a list of posts.

    Args:
        posts_data: Non-empty list of post dictionaries. Each post must
            contain the keys ``username``, ``tone``, ``category``,
            ``is_disinformation``, ``upvotes`` and ``comments``.

    Returns:
        A dictionary keyed by username. Each value holds ``total_posts``,
        ``disinformation_posts``, ``tones_used`` and ``categories_posted``
        (both count dictionaries), plus ``avg_upvotes`` and ``avg_comments``
        rounded to two decimals.

    Raises:
        TypeError: If ``posts_data`` is not a list.
        ValueError: If ``posts_data`` is empty or a post lacks a required key.
    """
    if not isinstance(posts_data, list):
        raise TypeError("posts_data must be a list of dictionaries")

    if not posts_data:
        raise ValueError("posts_data cannot be empty")

    required_keys = {"username", "tone", "category", "is_disinformation", "upvotes", "comments"}
    for entry in posts_data:
        if not required_keys.issubset(entry.keys()):
            raise ValueError(f"Each post must include keys: {required_keys}")

    per_user = {}
    for entry in posts_data:
        record = per_user.setdefault(
            entry["username"],
            {
                "total_posts": 0,
                "total_upvotes": 0,
                "total_comments": 0,
                "disinformation_posts": 0,
                "tones_used": {},
                "categories_posted": {},
            },
        )

        record["total_posts"] += 1
        record["total_upvotes"] += entry["upvotes"]
        record["total_comments"] += entry["comments"]

        if entry["is_disinformation"]:
            record["disinformation_posts"] += 1

        tone_counts = record["tones_used"]
        tone_counts[entry["tone"]] = tone_counts.get(entry["tone"], 0) + 1

        category_counts = record["categories_posted"]
        category_counts[entry["category"]] = category_counts.get(entry["category"], 0) + 1

    # Replace the raw totals with per-post averages.
    for record in per_user.values():
        post_count = record["total_posts"]
        record["avg_upvotes"] = round(record.pop("total_upvotes") / post_count, 2)
        record["avg_comments"] = round(record.pop("total_comments") / post_count, 2)

    return per_user

In [None]:
def sorts_weekly_top_10(posts_data):
    """Group posts by ISO week and keep the 10 most engaging posts per week.

    Engagement is ``upvotes + comments``. The input posts are not modified;
    each returned post is a shallow copy with an added ``"engagement"`` key.

    Args:
        posts_data: Non-empty list of post dictionaries. Each post must
            contain ``timestamp`` (an ISO-8601 string), ``upvotes`` and
            ``comments``.

    Returns:
        A dictionary mapping week labels of the form ``"YYYY-W##"`` to a
        list of at most 10 post copies, sorted by engagement (highest first).

    Raises:
        TypeError: If ``posts_data`` is not a list or a post is not a dict.
        ValueError: If ``posts_data`` is empty, a post lacks a required key,
            or a timestamp cannot be parsed.
    """
    # Local imports: nothing visible at file level brings these into scope.
    from collections import defaultdict
    from datetime import datetime

    if not isinstance(posts_data, list):
        raise TypeError("posts_data must be a list of dictionaries")

    if not posts_data:
        raise ValueError("posts_data cannot be empty")

    for post in posts_data:
        if not isinstance(post, dict):
            raise TypeError("Each post must be a dictionary")
        if "timestamp" not in post or "upvotes" not in post or "comments" not in post:
            raise ValueError("Each post must include 'timestamp', 'upvotes', and 'comments' keys")

    # Group posts by ISO week number (YYYY-W##)
    weekly_posts = defaultdict(list)

    for post in posts_data:
        try:
            dt = datetime.fromisoformat(post["timestamp"])
        except (TypeError, ValueError) as exc:
            # TypeError covers non-string timestamps such as None.
            raise ValueError(
                f"Invalid timestamp format in post: {post.get('title', 'unknown')}"
            ) from exc

        # Tuple unpacking works on every Python 3 version; the named
        # .year/.week attributes only exist from 3.9 onwards.
        iso_year, iso_week, _ = dt.isocalendar()
        week_label = f"{iso_year}-W{iso_week:02d}"

        # Copy so the caller's dictionaries are left untouched.
        post_copy = dict(post)
        post_copy["engagement"] = post["upvotes"] + post["comments"]
        weekly_posts[week_label].append(post_copy)

    # Sort and take top 10 per week
    return {
        week: sorted(posts, key=lambda p: p["engagement"], reverse=True)[:10]
        for week, posts in weekly_posts.items()
    }

In [None]:
def top_posters_list(posts_data, top_n=10):
    """Rank users by the engagement their posts received.

    Engagement is ``upvotes + comments``. Users are ordered by total
    engagement, then by number of posts, both descending.

    Args:
        posts_data: Non-empty list of post dictionaries containing
            ``username``, ``upvotes``, ``comments`` and ``is_disinformation``.
        top_n: Maximum number of users to return.

    Returns:
        A list of dictionaries with ``username``, ``total_posts``,
        ``total_engagement``, ``avg_engagement`` (two decimals) and
        ``disinformation_posts``.

    Raises:
        TypeError: If ``posts_data`` is not a list.
        ValueError: If ``posts_data`` is empty or a post lacks a required key.
    """
    if not isinstance(posts_data, list):
        raise TypeError("posts_data must be a list of dictionaries")

    if not posts_data:
        raise ValueError("posts_data cannot be empty")

    required_keys = {"username", "upvotes", "comments", "is_disinformation"}
    for entry in posts_data:
        if not required_keys.issubset(entry.keys()):
            raise ValueError(f"Each post must include keys: {required_keys}")

    tallies = {}
    for entry in posts_data:
        tally = tallies.setdefault(
            entry["username"],
            {"total_posts": 0, "total_engagement": 0, "disinformation_posts": 0},
        )
        tally["total_posts"] += 1
        tally["total_engagement"] += entry["upvotes"] + entry["comments"]
        if entry["is_disinformation"]:
            tally["disinformation_posts"] += 1

    leaderboard = [
        {
            "username": name,
            "total_posts": tally["total_posts"],
            "total_engagement": tally["total_engagement"],
            "avg_engagement": round(tally["total_engagement"] / tally["total_posts"], 2),
            "disinformation_posts": tally["disinformation_posts"],
        }
        for name, tally in tallies.items()
    ]

    # Primary key: total engagement; tie-breaker: number of posts.
    ranked = sorted(
        leaderboard,
        key=lambda row: (row["total_engagement"], row["total_posts"]),
        reverse=True,
    )
    return ranked[:top_n]

In [None]:
def total_interactions_this_week(posts_data, current_date=None):
    """Total the upvotes and comments of posts from the last seven days.

    The window is ``[current_date - 7 days, current_date]``, inclusive.

    Args:
        posts_data: List of post dictionaries containing ``upvotes``,
            ``comments`` and ``created_utc`` (an ISO-8601 string).
        current_date: End of the window as a ``datetime``. Defaults to the
            current UTC time as a naive datetime.

    Returns:
        A dictionary with ``total_posts``, ``total_upvotes``,
        ``total_comments``, ``total_interactions``, ``start_date`` and
        ``end_date`` (ISO dates). For an empty ``posts_data`` all counts
        are 0 and both dates are ``None``.

    Raises:
        TypeError: If ``posts_data`` is not a list or a post lacks a
            required key.
        ValueError: If a ``created_utc`` value cannot be parsed.
    """
    # Local imports: nothing visible at file level brings these into scope.
    from datetime import datetime, timedelta, timezone

    if not isinstance(posts_data, list):
        raise TypeError("posts_data must be a list of dictionaries")

    if not posts_data:
        return {
            "total_posts": 0,
            "total_upvotes": 0,
            "total_comments": 0,
            "total_interactions": 0,
            "start_date": None,
            "end_date": None
        }

    required_keys = {"upvotes", "comments", "created_utc"}
    if not all(required_keys.issubset(post.keys()) for post in posts_data):
        raise TypeError(f"Each post must include keys: {required_keys}")

    if current_date is None:
        # Naive UTC "now"; same value as the deprecated datetime.utcnow().
        current_date = datetime.now(timezone.utc).replace(tzinfo=None)

    week_start = current_date - timedelta(days=7)
    reference_is_aware = getattr(current_date, "tzinfo", None) is not None

    total_upvotes = 0
    total_comments = 0
    post_count = 0

    for post in posts_data:
        try:
            post_date = datetime.fromisoformat(post["created_utc"])
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Invalid date format for post: {post.get('title', 'Unknown')}"
            ) from exc

        # Naive and aware datetimes cannot be compared, so bring the post
        # timestamp to the same kind as the reference date. Naive values
        # are treated as UTC, as the "created_utc" key name suggests.
        post_is_aware = post_date.tzinfo is not None
        if post_is_aware and not reference_is_aware:
            post_date = post_date.astimezone(timezone.utc).replace(tzinfo=None)
        elif reference_is_aware and not post_is_aware:
            post_date = post_date.replace(tzinfo=timezone.utc)

        if week_start <= post_date <= current_date:
            post_count += 1
            total_upvotes += post["upvotes"]
            total_comments += post["comments"]

    return {
        "total_posts": post_count,
        "total_upvotes": total_upvotes,
        "total_comments": total_comments,
        "total_interactions": total_upvotes + total_comments,
        "start_date": week_start.date().isoformat(),
        "end_date": current_date.date().isoformat()
    }

In [None]:
def track_top_5_posts_of_the_semester(posts_data):
    """Return the five posts with the most interactions.

    Interactions are ``upvotes + comments``. The caller's post dictionaries
    are not modified: each returned post is a shallow copy with an added
    ``"total_interactions"`` key.

    Args:
        posts_data: List of post dictionaries containing ``title``,
            ``upvotes``, ``comments`` and ``created_utc``.

    Returns:
        A list of at most five post copies, sorted by ``total_interactions``
        in descending order. An empty input yields an empty list.

    Raises:
        TypeError: If ``posts_data`` is not a list, a post is not a
            dictionary, or a post lacks a required key.
    """
    if not isinstance(posts_data, list):
        raise TypeError("posts_data must be a list of dictionaries.")

    required_keys = {"title", "upvotes", "comments", "created_utc"}
    for post in posts_data:
        if not isinstance(post, dict):
            raise TypeError("Each post must be a dictionary.")
        if not required_keys.issubset(post.keys()):
            raise TypeError(f"Each post must include the keys: {required_keys}")

    # Score copies rather than the originals so the input is left untouched.
    scored_posts = []
    for post in posts_data:
        scored = dict(post)
        scored["total_interactions"] = post["upvotes"] + post["comments"]
        scored_posts.append(scored)

    # Sort posts by total interactions (descending) and keep the top 5.
    scored_posts.sort(key=lambda p: p["total_interactions"], reverse=True)
    return scored_posts[:5]

In [None]:
def interaction_rate(post_data):
    """Compute a post's interactions as a percentage of its views.

    Args:
        post_data: Dictionary containing ``upvotes``, ``comments`` and
            ``views``.

    Returns:
        ``(upvotes + comments) / views * 100`` rounded to two decimals.

    Raises:
        TypeError: If ``post_data`` is not a dictionary or lacks a key.
        ValueError: If ``views`` is zero or negative.
    """
    if not isinstance(post_data, dict):
        raise TypeError("post_data must be a dictionary.")

    required_keys = {"upvotes", "comments", "views"}
    has_all_keys = required_keys.issubset(post_data.keys())
    if not has_all_keys:
        raise TypeError(f"post_data must include the keys: {required_keys}")

    view_count = post_data["views"]
    if view_count <= 0:
        raise ValueError("views must be greater than zero to calculate interaction rate.")

    engaged = post_data["upvotes"] + post_data["comments"]
    return round((engaged / view_count) * 100, 2)

In [None]:
import re
def clean_post_text(text):
    """Normalize post text to lowercase words separated by single spaces.

    URLs, ``@``/``#`` tokens and every character that is not an ASCII letter
    or whitespace are removed.

    Args:
        text: The raw post text.

    Returns:
        The cleaned, lowercased text with surrounding whitespace stripped.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string.")

    # Order matters: URLs and @/# tokens must go before punctuation is
    # stripped, otherwise their prefixes would no longer be recognizable.
    removal_patterns = (
        r"http\S+|www\S+|https\S+",  # URLs
        r"[@#]\w+",                  # mentions and hashtags
        r"[^a-zA-Z\s]",              # anything that is not a letter or space
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    collapsed = re.sub(r"\s+", " ", cleaned.lower())
    return collapsed.strip()

In [None]:
def check_duplicate(new_post, existing_posts, similarity_threshold=0.9):
    """Report whether a post is a near-duplicate of any existing post.

    Posts are compared by the overlap of their word sets (shared words
    divided by all distinct words) after URLs and non-letter characters are
    removed and the text is lowercased.

    Args:
        new_post: Text of the post to check.
        existing_posts: List of texts to compare against.
        similarity_threshold: Minimum overlap ratio, between 0 and 1, for a
            post to count as a duplicate.

    Returns:
        True if any existing post reaches the threshold, otherwise False.
        Existing posts with no words after cleaning are ignored.

    Raises:
        TypeError: If ``new_post`` is not a string or ``existing_posts`` is
            not a list of strings.
        ValueError: If ``similarity_threshold`` is outside ``[0, 1]``.
    """
    if not isinstance(new_post, str):
        raise TypeError("new_post must be a string.")

    is_string_list = isinstance(existing_posts, list) and all(
        isinstance(candidate, str) for candidate in existing_posts
    )
    if not is_string_list:
        raise TypeError("existing_posts must be a list of strings.")

    if not (0 <= similarity_threshold <= 1):
        raise ValueError("similarity_threshold must be between 0 and 1.")

    def _word_set(raw):
        """Return the set of lowercase words left after cleaning ``raw``."""
        without_urls = re.sub(r"http\S+|www\S+", "", raw)
        letters_only = re.sub(r"[^a-zA-Z\s]", "", without_urls).lower()
        normalized = re.sub(r"\s+", " ", letters_only).strip()
        return set(normalized.split())

    candidate_words = _word_set(new_post)

    for existing in existing_posts:
        known_words = _word_set(existing)
        if not known_words:
            continue

        shared = candidate_words & known_words
        combined = candidate_words | known_words
        if len(shared) / len(combined) >= similarity_threshold:
            return True

    return False

In [None]:
def detect_misinformation(post_text, keyword_list=None):
    """Flag a post that contains keywords associated with misinformation.

    Matching is a case-insensitive substring search on the post text with
    URLs removed. Each keyword is tried against the text both with and
    without punctuation, so keywords that contain punctuation themselves
    (``"confirmed??"``, ``"can't believe"``) can match, while plain keywords
    still match text whose punctuation was stripped.

    Args:
        post_text: The raw post text.
        keyword_list: Optional list of keywords. ``None`` or an empty list
            selects the built-in default keywords.

    Returns:
        A dictionary with ``is_misinformation`` (bool) and
        ``matched_keywords`` (the matching keywords as given, in list order).

    Raises:
        TypeError: If ``post_text`` is not a string or ``keyword_list`` is
            neither ``None`` nor a list of strings.
    """
    if not isinstance(post_text, str):
        raise TypeError("post_text must be a string.")
    if keyword_list is not None and not (isinstance(keyword_list, list) and all(isinstance(k, str) for k in keyword_list)):
        raise TypeError("keyword_list must be a list of strings or None.")

    # Default keywords often associated with misinformation
    default_keywords = [
        "rumor", "unconfirmed", "heard", "confirmed??", "sources say", "reportedly", "breaking", "shocking",
        "can't believe", "conspiracy", "fake news", "scam", "hoax", "allegedly"
    ]

    keywords = keyword_list if keyword_list else default_keywords

    # Two views of the text: lowercased without URLs, and the same with
    # everything except letters and whitespace removed.
    lowered_text = re.sub(r"http\S+|www\S+|https\S+", "", post_text.lower())
    letters_only_text = re.sub(r"[^a-zA-Z\s]", "", lowered_text)

    matched = []
    for kw in keywords:
        needle = kw.lower()
        # A blank keyword is a substring of every text; ignore it.
        if not needle.strip():
            continue
        if needle in lowered_text or needle in letters_only_text:
            matched.append(kw)

    return {
        "is_misinformation": bool(matched),
        "matched_keywords": matched
    }

In [None]:
def generate_weekly_report(posts_data, week_start=None, week_end=None):
    """Build a summary report for the posts created within one week.

    Relies on ``detect_misinformation``, ``top_posters_list`` and
    ``sorts_weekly_top_10`` defined elsewhere in this file.

    Args:
        posts_data: List of post dictionaries. Posts are selected by their
            ``created_utc`` ISO-8601 timestamp; selected posts must also
            provide ``upvotes`` and ``comments`` (and the keys required by
            ``top_posters_list``).
        week_start: Start of the window as a ``datetime``. Defaults to seven
            days before ``week_end``.
        week_end: End of the window as a ``datetime``. Defaults to the
            current UTC time as a naive datetime.

    Returns:
        A dictionary with the window dates, post/upvote/comment/interaction
        totals, ``misinformation_count``, ``misinformation_rate`` (percent),
        ``top_posters`` (list) and ``top_posts`` (dictionary mapping week
        labels to lists of posts). For a week without posts the totals are
        0, ``top_posters`` is ``[]`` and ``top_posts`` is ``{}``.

    Raises:
        TypeError: If ``posts_data`` is not a list.
    """
    # Local import: nothing visible at file level brings these into scope.
    from datetime import datetime, timedelta, timezone

    if not isinstance(posts_data, list):
        raise TypeError("posts_data must be a list of dictionaries.")

    # Set default week range if not provided
    if week_end is None:
        # Naive UTC "now"; same value as the deprecated datetime.utcnow().
        week_end = datetime.now(timezone.utc).replace(tzinfo=None)
    if week_start is None:
        week_start = week_end - timedelta(days=7)

    # Filter posts in the week
    weekly_posts = []
    for post in posts_data:
        try:
            post_date = datetime.fromisoformat(post["created_utc"])
            in_window = week_start <= post_date <= week_end
        except (KeyError, TypeError, ValueError):
            # Best effort: skip posts with a missing, malformed or
            # incomparable timestamp instead of failing the whole report.
            continue
        if in_window:
            weekly_posts.append(post)

    # Metrics
    total_upvotes = sum(p["upvotes"] for p in weekly_posts)
    total_comments = sum(p["comments"] for p in weekly_posts)
    total_interactions = total_upvotes + total_comments
    total_posts = len(weekly_posts)

    # Misinformation detection
    misinformation_count = sum(
        detect_misinformation(p.get("text", ""))["is_misinformation"]
        for p in weekly_posts
    )
    misinformation_rate = round((misinformation_count / total_posts) * 100, 2) if total_posts > 0 else 0

    if weekly_posts:
        # Top posters
        top_posters = top_posters_list(weekly_posts, top_n=5)

        # Top posts. sorts_weekly_top_10 groups by a "timestamp" key, so
        # hand it copies that expose "created_utc" under that name (an
        # existing "timestamp" value is kept).
        timestamped_posts = [
            {**p, "timestamp": p.get("timestamp", p["created_utc"])}
            for p in weekly_posts
        ]
        top_posts = sorts_weekly_top_10(timestamped_posts)
    else:
        # Both helpers reject empty input; an empty week is a valid report.
        top_posters = []
        top_posts = {}

    report = {
        "week_start": week_start.date().isoformat(),
        "week_end": week_end.date().isoformat(),
        "total_posts": total_posts,
        "total_upvotes": total_upvotes,
        "total_comments": total_comments,
        "total_interactions": total_interactions,
        "misinformation_count": misinformation_count,
        "misinformation_rate": misinformation_rate,
        "top_posters": top_posters,
        "top_posts": top_posts
    }

    return report

In [None]:
def post_type(post_data):
    """Classify a post as ``"Pictures"``, ``"Words"`` or ``"Other"``.

    A post counts as a picture when its ``post_hint`` is ``"image"`` or its
    ``url`` ends with a known image extension; a query string or fragment
    after the extension is ignored. Otherwise a post with non-blank
    ``selftext`` counts as words.

    Args:
        post_data: Post dictionary; ``post_hint``, ``url`` and ``selftext``
            are all optional.

    Returns:
        ``"Pictures"``, ``"Words"`` or ``"Other"``.

    Raises:
        TypeError: If ``post_data`` is not a dictionary.
    """
    if not isinstance(post_data, dict):
        raise TypeError("post_data must be a dictionary.")

    # If Reddit metadata explicitly says this is an image
    if post_data.get("post_hint") == "image":
        return "Pictures"

    # If URL ends with an image extension. The key may be present with a
    # non-string value such as None, so check the type before using it.
    url = post_data.get("url", "")
    if isinstance(url, str):
        lowered_url = url.lower()
        # Also test the URL with any "?query" or "#fragment" cut off.
        url_without_suffix = lowered_url.split("?", 1)[0].split("#", 1)[0]
        image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp")
        if lowered_url.endswith(image_extensions) or url_without_suffix.endswith(image_extensions):
            return "Pictures"

    # If text is present
    selftext = post_data.get("selftext", "")
    if isinstance(selftext, str) and selftext.strip():
        return "Words"

    return "Other"

In [None]:
def summarize_semester_trends(weekly_reports):
    """Aggregate weekly reports into semester-level averages and trends.

    Args:
        weekly_reports: List of report dictionaries. The keys
            ``total_posts``, ``total_interactions`` and
            ``misinformation_rate`` default to 0 when absent. ``top_posts``
            may be a list of post dictionaries or a dictionary mapping week
            labels to such lists.

    Returns:
        An empty dictionary for an empty input. Otherwise a dictionary with
        ``average_posts_per_week``, ``average_interactions_per_week`` and
        ``average_misinformation_rate`` (two decimals), plus
        ``category_trends`` and ``most_common_post_types``: counts of the
        ``category`` and ``content_type`` values found in the top posts,
        most frequent first.

    Raises:
        TypeError: If ``weekly_reports`` is not a list of dictionaries.
    """
    # Local imports: nothing visible at file level brings these into scope.
    from collections import Counter
    from statistics import mean

    if not isinstance(weekly_reports, list) or not all(isinstance(r, dict) for r in weekly_reports):
        raise TypeError("weekly_reports must be a list of dictionaries.")

    num_weeks = len(weekly_reports)
    if num_weeks == 0:
        return {}

    total_posts_list = [week.get("total_posts", 0) for week in weekly_reports]
    total_interactions_list = [week.get("total_interactions", 0) for week in weekly_reports]
    misinformation_rate_list = [week.get("misinformation_rate", 0) for week in weekly_reports]

    # Average metrics
    average_posts_per_week = mean(total_posts_list)
    average_interactions_per_week = mean(total_interactions_list)
    average_misinformation_rate = mean(misinformation_rate_list)

    # Trends for categories and post types
    category_counter = Counter()
    post_type_counter = Counter()

    for week in weekly_reports:
        top_posts = week.get("top_posts", [])

        if isinstance(top_posts, dict):
            # Week-label -> posts mapping: flatten it, since iterating the
            # mapping directly would yield the labels instead of the posts.
            posts = []
            for group in top_posts.values():
                if isinstance(group, (list, tuple)):
                    posts.extend(group)
        else:
            posts = top_posts or []

        for post in posts:
            if not isinstance(post, dict):
                continue
            if "category" in post:
                category_counter[post["category"]] += 1
            if "content_type" in post:
                post_type_counter[post["content_type"]] += 1

    category_trends = dict(category_counter.most_common())
    most_common_post_types = dict(post_type_counter.most_common())

    return {
        "average_posts_per_week": round(average_posts_per_week, 2),
        "average_interactions_per_week": round(average_interactions_per_week, 2),
        "average_misinformation_rate": round(average_misinformation_rate, 2),
        "category_trends": category_trends,
        "most_common_post_types": most_common_post_types
    }

In [None]:
def compare_engagement(posts_data, group_by="category"):
    """Compare average engagement across groups of posts.

    Engagement is ``upvotes + comments``; a missing count is treated as 0.

    Args:
        posts_data: List of post dictionaries.
        group_by: Key whose value assigns each post to a group.

    Returns:
        A dictionary mapping each group to its average engagement (two
        decimals), ordered from highest to lowest average.

    Raises:
        TypeError: If ``posts_data`` is not a list of dictionaries.
        ValueError: If a post does not contain the ``group_by`` key.
    """
    is_list_of_dicts = isinstance(posts_data, list) and all(
        isinstance(entry, dict) for entry in posts_data
    )
    if not is_list_of_dicts:
        raise TypeError("posts_data must be a list of dictionaries.")

    # group -> [engagement total, post count]
    running = {}
    for entry in posts_data:
        if group_by not in entry:
            raise ValueError(f"Each post must contain the key '{group_by}'.")

        label = entry[group_by]
        score = entry.get("upvotes", 0) + entry.get("comments", 0)

        bucket = running.setdefault(label, [0, 0])
        bucket[0] = bucket[0] + score
        bucket[1] += 1

    averages = {
        label: round(total / count, 2)
        for label, (total, count) in running.items()
    }

    ranked_pairs = sorted(averages.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked_pairs)

In [None]:
def extract_category_keywords(categories):
    """Map each requested category to its list of example keywords.

    Category names are looked up case-insensitively and with surrounding
    whitespace ignored; the returned dictionary uses the names exactly as
    they were passed in.

    Args:
        categories: Iterable of category names.

    Returns:
        A dictionary mapping each requested category to a new list of
        keywords. Unknown categories map to an empty list.
    """
    # Known categories with their example keywords.
    category_keywords = {
        "humor": ["lol", "lmao", "haha", "joke"],
        "random": ["idk", "random"],
        "news": ["alert", "news", "update", "announcement", "diamondback", "report"],
        "academics": ["class", "classes", "exam", "professor", "grade", "gpa", "study", "midterm", "final", "project"],
        "advice": ["recommend", "tips", "help", "should I", "question"],
        "social": ["party", "hangout", "movie", "homecoming", "game"],
    }

    selected = {}
    for cat in categories:
        lookup_key = cat.strip().lower() if isinstance(cat, str) else cat
        # Copy the list so callers cannot alter another entry's keywords.
        selected[cat] = list(category_keywords.get(lookup_key, []))
    return selected
def normalize_word(word):
    """Return ``word`` lowercased with surrounding whitespace removed."""
    trimmed = word.strip()
    return trimmed.lower()

In [None]:
def analyze_post_lengths_by_category(posts, category_keywords):
    """Group posts by detected category and report their word counts.

    A post is assigned to the category with the most keywords occurring in
    it as whole words or phrases (case-insensitive). On a tie the category
    listed first in ``category_keywords`` wins. A post that matches no
    keyword is filed under ``"uncategorized"``.

    Args:
        posts: Iterable of post texts. Entries that are not strings, or
            that are blank, are skipped.
        category_keywords: Dictionary mapping each category name to an
            iterable of keywords.

    Returns:
        A dictionary mapping each category that received at least one post
        to ``{"posts": [(post, word_count), ...], "average_length": float}``,
        where the average is rounded to two decimals.

    Raises:
        TypeError: If ``posts`` is a single string or ``category_keywords``
            is not a dictionary.
    """
    if isinstance(posts, str):
        # Iterating a string would treat every character as a post.
        raise TypeError("posts must be an iterable of strings, not a single string")
    if not isinstance(category_keywords, dict):
        raise TypeError("category_keywords must be a dictionary")

    # Compile one whole-word pattern per keyword, once, outside the loop.
    patterns_by_category = {
        category: [
            re.compile(r"(?<!\w)" + re.escape(keyword.strip().lower()) + r"(?!\w)")
            for keyword in keywords
            if isinstance(keyword, str) and keyword.strip()
        ]
        for category, keywords in category_keywords.items()
    }

    def _detect_category(post):
        """Return the category whose keywords match ``post`` most often."""
        lowered = post.lower()
        best_category, best_hits = "uncategorized", 0
        for category, patterns in patterns_by_category.items():
            hits = sum(1 for pattern in patterns if pattern.search(lowered))
            if hits > best_hits:
                best_category, best_hits = category, hits
        return best_category

    categorized = {}
    for post in posts:
        if not isinstance(post, str) or not post.strip():
            continue
        # Detects the category of the post based on keywords
        category = _detect_category(post)
        # Calculates how many words the post contains
        word_count = len(post.split())
        # Stores post and word count
        categorized.setdefault(category, []).append((post, word_count))

    results = {}
    for cat, items in categorized.items():
        lengths = [length for _, length in items]
        results[cat] = {
            "posts": items,
            "average_length": round(sum(lengths) / len(lengths), 2),
        }
    return results