In [None]:
pip install praw langdetect pandas



In [None]:
import os
import re
import time

import pandas as pd
import praw
from langdetect import detect

In [None]:
# ---- SETUP ----
# Reddit API credentials are read from the environment so that no secret is
# stored in the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError here.
# SECURITY: any client id/secret that was ever committed inside this notebook
# should be treated as leaked and regenerated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    # Optional override; the default has no stray whitespace after "/u/".
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'kannada_scraper by /u/boseeeee_12345'),
)


In [None]:
def is_kannada(text, threshold=0.5):
    """Return True when ``text`` appears to be written in Kannada.

    Two checks are tried in order:

    1. ``detect(text)`` classifies the text as ``'kn'``.
    2. Fallback: the fraction of characters that fall inside the Kannada
       Unicode block (U+0C80-U+0CFF) is strictly greater than ``threshold``.

    Args:
        text: The string to classify. Empty (or ``None``) text is never
            considered Kannada.
        threshold: Exclusive lower bound on the fraction of Kannada characters,
            measured against the full length of ``text`` (whitespace and
            punctuation included), for the fallback check to accept.

    Returns:
        bool: True if either check identifies the text as Kannada.
    """
    # Guard first: with empty text the ratio below would divide by zero.
    if not text:
        return False

    try:
        if detect(text) == 'kn':
            return True
    except Exception:
        # Best effort: any detector failure falls through to the
        # character-ratio check. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    kannada_count = sum(1 for char in text if '\u0C80' <= char <= '\u0CFF')
    return kannada_count / len(text) > threshold

In [None]:
def clean_text(text):
    """Remove links from ``text`` and collapse whitespace to single spaces.

    Args:
        text: Raw string, e.g. a post title, post body or comment body.

    Returns:
        str: ``text`` with every ``http...`` token removed, each run of
        whitespace replaced by one space, and leading/trailing whitespace
        stripped.
    """
    # Links are removed before whitespace is normalized; doing it the other way
    # round leaves a double space wherever a link sat between two words.
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

In [None]:
def _format_utc(epoch_seconds):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' in UTC."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(epoch_seconds))


def scrape_kannada_posts_and_comments(subreddit_names=('india',), limit=300, keywords=None):
    """Collect Kannada posts and their Kannada comments from subreddits.

    For every subreddit the function iterates ``subreddit.new(limit=limit)``.
    A submission is kept when its cleaned self-text is non-empty, contains at
    least one of ``keywords`` (when keywords are given) and passes
    ``is_kannada``. For each kept submission, every comment whose cleaned body
    is non-empty and passes ``is_kannada`` is kept as well.

    Args:
        subreddit_names: Iterable of subreddit names. A single name passed as a
            plain string is treated as a one-element list instead of being
            iterated character by character.
        limit: Value forwarded to ``subreddit.new(limit=...)`` for each
            subreddit.
        keywords: Optional iterable of substrings; when truthy, a submission is
            skipped unless its cleaned body contains at least one of them.
            Comments are not keyword-filtered.

    Returns:
        pandas.DataFrame: One row per kept post or comment with the columns
        ``type``, ``parent_id``, ``post_id``, ``title``, ``text``, ``url``,
        ``timestamp``, ``score``, ``num_comments`` and ``subreddit``. The
        columns are present even when nothing was scraped.

    Any exception raised while processing a subreddit is printed and that
    subreddit is abandoned; rows collected before the failure are kept.
    """
    columns = [
        'type', 'parent_id', 'post_id', 'title', 'text',
        'url', 'timestamp', 'score', 'num_comments', 'subreddit',
    ]

    # A bare string would otherwise be iterated one character at a time.
    if isinstance(subreddit_names, str):
        subreddit_names = [subreddit_names]

    data = []

    for subreddit_name in subreddit_names:
        try:
            subreddit = reddit.subreddit(subreddit_name)

            # Check if subreddit exists
            # NOTE(review): in the run recorded in this notebook, missing or
            # forbidden subreddits were reported by the except handler below
            # (404/403 responses), not by this check - confirm it is needed.
            if not subreddit.display_name:
                print(f"Warning: Subreddit {subreddit_name} does not exist or is inaccessible.")
                continue

            for submission in subreddit.new(limit=limit):
                title = clean_text(submission.title)
                body = clean_text(submission.selftext)

                if not body:
                    continue

                if keywords and not any(k in body for k in keywords):
                    continue

                if not is_kannada(body):
                    continue

                data.append({
                    'type': 'post',
                    'parent_id': '',
                    'post_id': submission.id,
                    'title': title,
                    'text': body,
                    'url': submission.url,
                    'timestamp': _format_utc(submission.created_utc),
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'subreddit': subreddit_name
                })

                # Extract comments. replace_more(limit=0) is called before
                # flattening the tree with .list(); presumably it drops the
                # unexpanded "load more comments" placeholders - see PRAW docs.
                submission.comments.replace_more(limit=0)
                for comment in submission.comments.list():
                    comment_body = clean_text(comment.body)
                    if not comment_body or not is_kannada(comment_body):
                        continue
                    data.append({
                        'type': 'comment',
                        'parent_id': comment.parent_id,
                        'post_id': submission.id,
                        'title': '',
                        'text': comment_body,
                        'url': submission.url,
                        'timestamp': _format_utc(comment.created_utc),
                        'score': comment.score,
                        'num_comments': '',
                        'subreddit': subreddit_name
                    })
        except Exception as e:
            print(f"Error while processing subreddit {subreddit_name}: {e}")
            continue  # Skip this subreddit and move on to the next

    # Explicit columns keep the schema stable when no rows were collected.
    return pd.DataFrame(data, columns=columns)

In [None]:
# Enhanced Kannada keywords list (Add more keywords as needed)
# Each entry is matched as a plain substring of the cleaned post body, so a
# keyword only helps if it is spelled the way it appears in real posts.
kannada_keywords = [
    # General Language/State
    'ಕನ್ನಡ', 'ಕರ್ನಾಟಕ', 'ಕರ್ನಾಟಕದ', 'ಕರ್ನಾಟಕದಲ್ಲಿ',

    # Cities
    'ಬೆಂಗಳೂರು', 'ಮೈಸೂರು', 'ಧಾರವಾಡ', 'ಗುಲ್ಬರ್ಗಾ', 'ಹುಬ್ಬಳ್ಳಿ', 'ದಾವಣಗೆರೆ', 'ಮಂಗಳೂರು', 'ಬಳ್ಳಾರಿ', 'ತುಮಕೂರು',

    # Politics and Government
    'ಚುನಾವಣೆ', 'ಸರ್ಕಾರ', 'ಮತದಾನ', 'ವಿಧಾನಸಭೆ', 'ಸಂಸತ್', 'ಪ್ರಧಾನಮಂತ್ರಿ', 'ಮುಖ್ಯಮಂತ್ರಿ', 'ರಾಜಕೀಯ',

    # Culture and Identity
    'ಜಾನಪದ', 'ಭಾಷೆ', 'ಸಾಹಿತ್ಯ', 'ಕಾವ್ಯ', 'ಚಟುವಟಿಕೆ', 'ಹಬ್ಬ', 'ಉತ್ಸವ', 'ನಾಟಕ',

    # Entertainment
    'ಚಲನಚಿತ್ರ', 'ಸಿನಿಮಾ', 'ಹೀರೋ', 'ಹೀರೋಯಿನ್', 'ಗಾನ', 'ಸಂಗೀತ', 'ಅಭಿನಯ', 'ಡ್ರಾಮಾ',

    # Infrastructure and Daily Life
    'ಮೆಟ್ರೋ', 'ಬಸ್', 'ರಸ್ತೆ', 'ಸಮಸ್ಯೆ', 'ವಿದ್ಯುತ್', 'ನೀರು', 'ಅರ್ಜಿ', 'ಆಸ್ಪತ್ರೆ', 'ಪೊಲೀಸ್', 'ಶಿಕ್ಷಣ',

    # Education and Youth
    'ವಿದ್ಯಾರ್ಥಿ', 'ಶಾಲೆ', 'ವಿಶ್ವವಿದ್ಯಾಲಯ', 'ಪದವಿ', 'ಪರೀಕ್ಷೆ', 'ಗುರು', 'ಉನ್ನತ ಶಿಕ್ಷಣ',

    # Issues/Problems
    # 'ಪ್ರತಿಭಟನೆ' is the standard spelling of "protest"; the earlier variant
    # 'ಪ್ರತಿಬಟನೆ' is kept so existing matches are not lost.
    'ಭ್ರಷ್ಟಾಚಾರ', 'ಹಿಂಸೆ', 'ಕಗ್ಗುಲು', 'ಕಸ', 'ದೂರು', 'ತೊಂದರೆ', 'ಅಸಮಾಧಾನ', 'ಪ್ರತಿಬಟನೆ', 'ಪ್ರತಿಭಟನೆ',

    # Festivals and Religion ('ಹಬ್ಬ' is already listed under Culture and Identity)
    'ದಸರಾ', 'ದೀಪಾವಳಿ', 'ಉಗಾದಿ', 'ರಾಮನವಮಿ', 'ಈದ್', 'ಕ್ರಿಸ್ಮಸ್', 'ಪೂಜೆ',

    # Technology & Internet
    # 'ಮೊಬೈಲ್' is the common spelling of "mobile"; 'ಮೋಬೈಲ್' is kept as a variant.
    'ಮೆಸೇಜ್', 'ಮೋಬೈಲ್', 'ಮೊಬೈಲ್', 'ಆ್ಯಪ್', 'ಇಂಟರ್ನೆಟ್', 'ಅನ್ಲೈನ್', 'ಅಪ್‌ಡೇಟ್'
]

# List of subreddits to scrape (can include Kannada-related ones)
# NOTE(review): in the run recorded in this notebook, 'blr' and 'karnatakanews'
# returned HTTP 404, 'indianpolitics' returned HTTP 403 and 'kannadaculture'
# redirected to the subreddit search - consider pruning them after confirming.
subreddits = [
    'india', 'kannada', 'bangalore', 'karnataka', 'southindia', 'bengaluru', 'blr',
    'karnatakanews', 'ruralindia', 'indianpolitics', 'kannadaculture'
]

In [None]:
# Scrape every configured subreddit, keeping only keyword-matching Kannada posts
# (and their Kannada comments). Arguments, in order: subreddit names, the
# per-subreddit `limit` handed to subreddit.new(), and the keyword filter.
df_kannada = scrape_kannada_posts_and_comments(subreddits, 2000, kannada_keywords)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Error while processing subreddit blr: received 404 HTTP response
Error while processing subreddit karnatakanews: received 404 HTTP response


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Error while processing subreddit indianpolitics: received 403 HTTP response
Error while processing subreddit kannadaculture: Redirect to /subreddits/search


In [None]:
# Sanity check: report how many rows (posts plus comments) came back
print(f"Total entries scraped: {df_kannada.shape[0]}")

Total entries scraped: 603


In [None]:
# Persist the scraped rows without the index column. The file is written as
# UTF-8 with a byte-order mark ('utf-8-sig') - presumably so spreadsheet tools
# recognise the encoding of the Kannada text; confirm if the BOM is required.
output_path = 'kannada_reddit_posts_and_comments.csv'
df_kannada.to_csv(path_or_buf=output_path, index=False, encoding='utf-8-sig')
print(f"✅ File saved as {output_path}")



✅ File saved as kannada_reddit_posts_and_comments.csv


In [None]:
import pandas as pd
import re

# Function to clean text and remove non-Kannada words
def remove_english_words(text):
    """Strip everything from ``text`` that is not Kannada script.

    Characters in the Kannada Unicode block (U+0C80-U+0CFF) are kept, along
    with whitespace. Zero-width joiner/non-joiner characters (U+200D/U+200C)
    are kept only when they directly follow a Kannada character, because they
    are part of the spelling of some Kannada words; anywhere else they are
    removed. Latin letters, digits, punctuation and any other script are
    dropped.

    Args:
        text: The string to clean.

    Returns:
        str: The Kannada-only text with each run of whitespace collapsed to a
        single space and no leading or trailing whitespace. May be empty.
    """
    # Drop joiners that do not follow a Kannada character. This runs on the
    # untouched text so a joiner cannot become attached to a Kannada character
    # merely because the characters between them were removed.
    text = re.sub(r'(?<![\u0C80-\u0CFF])[\u200C\u200D]', '', text)
    # Keep only Kannada characters, in-word joiners and whitespace.
    kannada_text = re.sub(r'[^\u0C80-\u0CFF\u200C\u200D\s]', '', text)  # Kannada Unicode range: \u0C80-\u0CFF
    # Removed tokens leave gaps behind; collapse them to single spaces.
    return re.sub(r'\s+', ' ', kannada_text).strip()

# Load the existing CSV with the scraped data.
# 'utf-8-sig' matches the encoding the scraped CSV is written with, so a leading
# byte-order mark can never end up inside the first column name.
input_path = 'kannada_reddit_posts_and_comments.csv'
df_kannada = pd.read_csv(input_path, encoding='utf-8-sig')

# Filter for rows where 'text' is not empty, then clean the 'text' column by
# removing non-Kannada words (English words).
# .assign() builds a new frame from the filtered rows instead of assigning a
# column into a filtered slice, which can raise pandas' SettingWithCopyWarning
# whenever the filter actually drops rows.
df_kannada = (
    df_kannada[df_kannada['text'].notna()]
    .assign(text=lambda frame: frame['text'].apply(remove_english_words))
)

# Keep only the 'text' column
df_kannada_cleaned = df_kannada[['text']]

# Save the cleaned data to a new CSV
output_path_cleaned = 'cleaned_kannada_text.csv'
df_kannada_cleaned.to_csv(output_path_cleaned,encoding='utf-8-sig', index=False)

print(f"✅ Cleaned CSV saved as {output_path_cleaned}")


✅ Cleaned CSV saved as cleaned_kannada_text.csv
