# Scraping Code: Reddit and Instagram
### Jagruta Advani, Dhruv Arora, Tara Mary Joseph, Gayathree Gopi, John Hwang, Greeshma Gopinathan

# Reddit Scraper

- The Reddit scraper is configurable: you choose which subreddits to scrape and the maximum number of posts to fetch from each.
- To scrape specific posts, supply their URLs together with a per-post limit on the number of comments.
- Primary and secondary keyword lists are supported, so that only posts containing relevant information are scraped.
- If a post is only partially relevant (it matches just one of the keyword lists), keyword matching is applied to its comments and only the matching comments are scraped.
- Subreddit posts can also be sorted before scraping (by relevance, hot, top, or new), and a time filter (all, year, month, week, day, hour) can be applied.

In [107]:
import praw
import csv
import time
from datetime import datetime

def _format_timestamp(epoch_seconds):
    """Render a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' (local time of the machine running the scraper)."""
    return datetime.fromtimestamp(epoch_seconds).strftime('%Y-%m-%d %H:%M:%S')


def _contains_any(lowered_text, keywords):
    """Return True if any keyword occurs, case-insensitively, as a substring of the already lower-cased text."""
    return any(keyword.lower() in lowered_text for keyword in keywords)


def _build_post_row(submission, subreddit_name):
    """Build the CSV row (type 'Post') describing a submission."""
    return {
        "subreddit": subreddit_name,
        "post_title": submission.title,
        "post_url": submission.url,
        "post_body": submission.selftext,
        "post_score": submission.score,
        "number_of_comments": submission.num_comments,
        "post_date": _format_timestamp(submission.created_utc),
        "type": "Post"
    }


def _build_comment_row(submission, comment, subreddit_name):
    """Build the CSV row (type 'Comment') describing one comment of a submission."""
    return {
        "subreddit": subreddit_name,
        "post_title": submission.title,
        "post_url": submission.url,
        "comment_author": str(comment.author),
        "comment_body": comment.body,
        "comment_score": comment.score,
        "comment_url": f"https://www.reddit.com{comment.permalink}",
        "comment_date": _format_timestamp(comment.created_utc),
        "type": "Comment"
    }


def _fetch_submissions(subreddit, sort_by, time_filter, limit, query):
    """Return the submission listing of a subreddit for the requested sort order ('new' if unsupported)."""
    if sort_by == 'relevance':
        return subreddit.search(query=query, limit=limit)
    if sort_by == 'hot':
        return subreddit.hot(limit=limit)
    if sort_by == 'top':
        return subreddit.top(time_filter=time_filter, limit=limit)
    if sort_by != 'new':
        print(f"Unsupported sort_by value: {sort_by}. Defaulting to 'new'.")
    return subreddit.new(limit=limit)


def _scrape_submission(submission, subreddit_name, primary_keywords, secondary_keywords,
                       scraped_data, counters, comment_limit=None,
                       post_label="Posts Scraped", comment_label="Comments Scraped"):
    """Apply the keyword rules to one submission, appending matching rows to scraped_data and updating counters."""
    # Combine title and selftext for comprehensive keyword search
    post_text = submission.title.lower()
    if submission.selftext:
        post_text += ' ' + submission.selftext.lower()

    has_primary = _contains_any(post_text, primary_keywords)
    has_secondary = _contains_any(post_text, secondary_keywords)
    if not (has_primary or has_secondary):
        return  # Irrelevant post: nothing is scraped and no comments are fetched.

    # Individually requested posts carry no subreddit name; resolve it only once it is needed.
    if subreddit_name is None:
        subreddit_name = submission.subreddit.display_name

    # Case 1 (both keyword lists match the post): keep the post and every comment.
    # Case 2 (only one list matches): the post row itself is NOT stored, only matching comments.
    fully_relevant = has_primary and has_secondary
    if fully_relevant:
        scraped_data.append(_build_post_row(submission, subreddit_name))
    else:
        post_label = "Partial Post Scraped"
        comment_label = "Filtered Comments Scraped"
    # The counter also includes partially relevant posts, although no row is written for them.
    counters["posts_scraped"] += 1
    print(f"{post_label}: {counters['posts_scraped']}")

    # limit=0 drops every "load more comments" placeholder, so only already-loaded comments remain.
    submission.comments.replace_more(limit=0)
    # Slicing with None keeps the full list; the limit is applied before keyword filtering.
    for comment in submission.comments.list()[:comment_limit]:
        if not fully_relevant:
            comment_text = comment.body.lower()
            # A comment on a partially relevant post must mention both a primary and a secondary keyword.
            if not (_contains_any(comment_text, primary_keywords) and
                    _contains_any(comment_text, secondary_keywords)):
                continue
        scraped_data.append(_build_comment_row(submission, comment, subreddit_name))
        counters["comments_scraped"] += 1
        print(f"{comment_label}: {counters['comments_scraped']}")


def scrape_subreddit_for_keywords(
    subreddits_dict, primary_keywords, secondary_keywords, 
    posts_dict=None, sort_by='new', time_filter='all'
):
    """
    Scrape Reddit for specified subreddits and posts based on keyword filtering.

    Relies on a module-level ``reddit`` client (presumably an authenticated ``praw.Reddit``
    instance -- it is not created in this function). Results are written to
    'reddit_scraped_data.csv' in the working directory; nothing is returned.

    Keyword rules (substring match, case-insensitive, on title + selftext):
      * post matches a primary AND a secondary keyword -> the post and all its comments are stored;
      * post matches only one of the lists -> only comments that themselves contain both a
        primary and a secondary keyword are stored (the post row is not);
      * otherwise the post is skipped.

    :param subreddits_dict: Dictionary where key is the subreddit name and value is the entry limit.
    :param primary_keywords: List of primary keywords to search for.
    :param secondary_keywords: List of secondary keywords to search for.
    :param posts_dict: Optional dictionary where key is post URL and value is the maximum number of
        comments considered for that post.
    :param sort_by: Sort criteria for posts. Options are 'relevance', 'hot', 'top', 'new'.
        Any other value falls back to 'new'.
    :param time_filter: Time filter for sorting when using 'top'. Options are 'all', 'year', 'month', 'week', 'day', 'hour'.
    """

    # List to hold the scraped data
    scraped_data = []
    # Running totals; "posts_checked" only counts submissions listed from subreddits.
    counters = {"posts_checked": 0, "posts_scraped": 0, "comments_scraped": 0}

    # Set to keep track of processed submissions to avoid duplicates
    processed_submissions = set()

    # Process each subreddit from the dictionary
    for subreddit_name, entry_limit in subreddits_dict.items():
        subreddit = reddit.subreddit(subreddit_name)
        print(f"Scraping subreddit: {subreddit_name} with limit: {entry_limit}, sort by: {sort_by}, time filter: {time_filter}")

        submissions = _fetch_submissions(
            subreddit, sort_by, time_filter, entry_limit,
            query=" ".join(primary_keywords + secondary_keywords),
        )

        for submission in submissions:
            # Avoid processing the same submission multiple times
            if submission.id in processed_submissions:
                print(f"Submission {submission.id} already processed. Skipping.")
                continue
            processed_submissions.add(submission.id)
            counters["posts_checked"] += 1

            _scrape_submission(
                submission, subreddit_name, primary_keywords, secondary_keywords,
                scraped_data, counters,
            )

            # Respect Reddit's API rate limits
            time.sleep(1)

    # Process individual posts from posts_dict if provided
    if posts_dict:
        for post_url, entry_limit in posts_dict.items():
            try:
                print(f"Processing post URL: {post_url} with limit: {entry_limit}")
                submission = reddit.submission(url=post_url)

                # Check if the submission is valid
                if submission is None:
                    print(f"Submission not found for URL: {post_url}")
                    continue

                # Avoid processing the same submission multiple times
                if submission.id in processed_submissions:
                    print(f"Submission {submission.id} already processed. Skipping.")
                    continue
                processed_submissions.add(submission.id)

                _scrape_submission(
                    submission, None, primary_keywords, secondary_keywords,
                    scraped_data, counters,
                    comment_limit=entry_limit,
                    post_label="Additional Post Scraped",
                    comment_label="Additional Comments Scraped",
                )

                # Respect Reddit's API rate limits
                time.sleep(1)

            except Exception as e:
                # Best effort: one bad URL must not abort the remaining posts.
                print(f"Error processing post {post_url}: {e}")
                # If scraping failed and there was only one URL, handle gracefully
                if len(posts_dict) == 1:
                    print("Only one post URL was provided, and it failed to scrape. Skipping this URL.")

    # Save the scraped data to a CSV file if there are any scraped entries
    if scraped_data:
        with open('reddit_scraped_data.csv', mode='w', newline='', encoding='utf-8') as file:
            # Post rows and comment rows share one header; columns a row lacks are left empty.
            fieldnames = ["subreddit", "post_title", "post_url", "post_body", "post_score", "number_of_comments",
                          "post_date", "comment_author", "comment_body", "comment_score", "comment_url",
                          "comment_date", "type"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(scraped_data)

        print(f"Scraped {len(scraped_data)} entries saved to 'reddit_scraped_data.csv'.")
        print(f"Total Posts Checked: {counters['posts_checked']}")
        print(f"Total Posts Scraped: {counters['posts_scraped']}")
        print(f"Total Comments Scraped: {counters['comments_scraped']}")
    else:
        print("No data was scraped. The CSV file will not be created.")



In [95]:
# import pandas as pd
# df1=pd.read_csv('reddit_scraped_data_01.csv')
# df2=pd.read_csv('reddit_scraped_data_02.csv')

In [96]:
# df1.shape,df2.shape

((36, 13), (307, 13))

In [98]:
# df_union = pd.concat([df1, df2]).drop_duplicates(subset='comment_body')
# df_union.shape

(320, 13)

In [99]:
# df_union.to_csv('isro_nasa.csv')

In [108]:

# A single space is a substring of virtually every post title, so with these
# values every post matches both keyword lists and is scraped in full.
primary_keywords = [" "]
secondary_keywords = [' ']

# Scrape posts and comments from the specified subreddit
# (subreddit name -> maximum number of posts to list from it)
subreddits_dict = {
    "NASA": 10
}

# Individually selected posts (post URL -> maximum number of comments considered).
# Each URL must appear only once: a repeated key in a dict literal silently
# overwrites the earlier entry.
posts_dict = {
    'https://www.reddit.com/r/india/comments/160s1jx/why_is_isro_so_terrible_at_presentation/': 50,
    'https://www.reddit.com/r/IndianModerate/comments/x6mar1/would_you_leave_working_at_isro_for_nasa/': 50,
    'https://www.reddit.com/r/AstronautHopefuls/comments/1e3zaip/what_sets_nasa_apart_from_other_space_agencies/': 50,
    'https://www.reddit.com/r/space/comments/14g79g0/india_joins_artemis_accords_to_launch_isronasa/': 50,
    'https://www.reddit.com/r/IndianDankMemes/comments/po2pox/isro_is_the_best_space_agency_it_cost_less_and_it/': 50,
    'https://www.reddit.com/r/india/comments/15i6bbc/why_are_isro_scientists_not_really_scientists/': 50,
    'https://www.reddit.com/r/IndiaSpeaks/comments/161z79i/indians_working_in_nasa_right_now_after/': 50,
    'https://www.reddit.com/r/india/comments/rm43qu/isros_budget_vs_launches_over_the_years/': 50,
    'https://www.reddit.com/r/space/comments/1bv1oiv/has_space_exploration_moved_too_slowly/': 50,
    'https://www.reddit.com/r/space/comments/167r9xl/india_wants_to_fly_its_own_astronauts_to_the_moon/': 50,
    'https://www.reddit.com/r/space/comments/1b17wzz/isro_gaganyaan_mission_names_of_four_astronauts/': 50,
    'https://www.reddit.com/r/scienceisdope/comments/168qbja/why_people_are_still_thinking_isro_is_wasting/': 50,
    'https://www.reddit.com/r/SpaceXLounge/comments/1dqhj0e/isro_to_launch_first_satellite_on_spacex_falcon_9/': 50,
    'https://www.reddit.com/r/space/comments/1eqdqce/scientists_slam_indefensible_axing_of_nasas_450m/': 50,
    'https://www.reddit.com/r/space/comments/15z2m32/chandrayaan3_has_landed/': 50,
    'https://www.reddit.com/r/nasa/comments/1b5gyr1/why_doesnt_nasa_build_its_own_camera/': 50,
    'https://www.reddit.com/r/space/comments/1dpwvmg/nasa_will_pay_spacex_nearly_1_billion_to_deorbit/': 50,
    'https://www.reddit.com/r/space/comments/1cujjgg/indias_ambitious_2nd_mars_mission_to_include_a/': 50,
    'https://www.reddit.com/r/Btechtards/comments/175x9ca/isro_director_on_why_there_are_very_few_iitians/': 50,
    'https://www.reddit.com/r/askasia/comments/16vjww8/why_is_indias_space_program_more_successful_than/': 50,
    'https://www.reddit.com/r/nasa/comments/14s99lh/india_a_growing_space_power_is_forging_closer/': 50,
    'https://www.reddit.com/r/indiadiscussion/comments/1ekim6n/manuvadi_nasa/': 50,
    'https://www.reddit.com/r/ISRO/comments/kt5igi/how_isro_is_viewed_among_indian_space_nerds/': 50,
    'https://www.reddit.com/r/agedlikemilk/comments/15z6tlx/this_new_york_times_racist_cartoon_on_india_and/': 50,
    'https://www.reddit.com/r/indiadiscussion/comments/122ww0z/worldnews_cant_even_be_positive_about_isro_how/': 50,
    'https://www.reddit.com/r/bakchodi/comments/d0oklz/hes_is_the_son_of_farmer_he_didnt_choose_nasa_hes/': 50,
    'https://www.reddit.com/r/nasa/comments/6q8hej/why_are_there_so_many_nasa_is_fake_comments_on/': 50,
    'https://www.reddit.com/r/IndiaSpeaks/comments/n64o1y/i_emailed_isro_about_a_suggestion_to_start/': 50,
    'https://www.reddit.com/r/ISRO/comments/14yxn8r/powerful_nasaisro_earth_observing_satellite_nisar/': 50,
    'https://www.reddit.com/r/space/comments/ijya8f/the_story_of_finding_and_confirming_water_on_the/': 50,
    'https://www.reddit.com/r/space/comments/uqzubu/behind_the_scenes_of_indias_moon_mission/': 50,
    'https://www.reddit.com/r/nasa/comments/qcayi3/voice_your_opinion_about_nasa/': 50,
    'https://www.reddit.com/r/ISRO/comments/163gk6t/opinion_on_the_upcoming_budgets_for_isro/': 50,
    'https://www.reddit.com/r/ISRO/comments/1ftiicx/why_is_isros_social_media_so_bad/': 50,
    'https://www.reddit.com/r/ISRO/comments/18fht7i/nasa_to_train_indian_astronauts_for_a_weeklong/': 50,
    'https://www.reddit.com/r/ISRO/comments/14zdlzt/why_does_isro_not_use_rocket_cams_like_nasa_or/': 50,
    'https://www.reddit.com/r/ISRO/comments/17f64j6/how_the_upcoming_nasaled_gateway_orbital_habitat/': 50,
    'https://www.reddit.com/r/india/comments/xse9v/if_india_has_so_many_nasa_scientistswhere_are/': 50,
    'https://www.reddit.com/r/india/comments/2h5104/nasa_makes_exception_in_policy_to_hire_indian/': 50,
    'https://www.reddit.com/r/india/comments/1761zxt/iitians_not_joining_isro_60_students_walked_out/': 50,
    'https://www.reddit.com/r/india/comments/14zbceu/isros_chandrayaan3_successfully_launched/': 50,
    'https://www.reddit.com/r/india/comments/162k1hk/how_much_are_isro_scientists_and_engineers_being/': 50,
    'https://www.reddit.com/r/india/comments/160d4x8/why_arent_isro_jobs_aspirational_in_india/': 50,
    'https://www.reddit.com/r/india/comments/d0o84g/isro_loses_contact_with_chandrayaan2_lander_full/': 50,
    'https://www.reddit.com/r/india/comments/y9sj8s/rip_isro/': 50,
    'https://www.reddit.com/r/india/comments/5uch4q/if_we_can_establish_such_a_highly_competent/': 50,
    'https://www.reddit.com/r/worldnews/comments/24qy3n/shoestring_theory_indias_pioneering_budget_space/': 50,
    'https://www.reddit.com/r/india/comments/d76kex/isro_chief_k_sivan_chandrayaan2_orbiter_is_doing/': 50,
    'https://www.reddit.com/r/spaceporn/comments/4kpbx7/indias_new_space_shuttle_800_x_1200/': 50,
    'https://www.reddit.com/r/spaceporn/comments/15z7zlm/first_image_form_the_chandrayaan3_lander_after/': 50,
    'https://www.reddit.com/r/india/comments/15z2lnl/chandrayaan3s_lander_makes_soft_landing_on_the/': 50,
    'https://www.reddit.com/r/ISRO/comments/17agwa2/there_is_no_capability_in_india_to_manufacture/': 50,
    'https://www.reddit.com/r/NoStupidQuestions/comments/1610bxm/why_does_india_have_a_space_program/': 50,
    'https://www.reddit.com/r/space/comments/15tk53v/isros_chandrayaan3_moon_lander_separates_from/': 50,
    'https://www.reddit.com/r/ISRO/comments/185z621/isro_lunar_roadmap/': 50,
    'https://www.reddit.com/r/ISRO/comments/1fs6hqc/america_should_welcome_indias_rise_as_a_space/': 50,
    'https://www.reddit.com/r/ISRO/comments/ega63d/why_hasnt_nasa_released_the_vikram_lander_flybys/': 50,
    'https://www.reddit.com/r/space/comments/160q9qr/chandrayaan3_rover_ramping_down_from_the_lander/': 50,
    'https://www.reddit.com/r/space/comments/169lztm/indias_vikram_lander_successfully_underwent_a_hop/': 50,
    'https://www.reddit.com/r/space/comments/179z1tm/india_aims_to_send_astronaut_to_the_moon_by_2040/': 50,
    'https://www.reddit.com/r/space/comments/14m7evt/artemis_accords_expert_explains_why_india_is_a/': 50,
    'https://www.reddit.com/r/space/comments/1fk067y/india_considers_joining_russia_china_to_build/': 50,
    'https://www.reddit.com/r/space/comments/xb9f6b/indias_chandrayaan_moon_mission_placed_words_most/': 50,
    'https://www.reddit.com/r/space/comments/1fjx6o8/india_approves_4_major_space_programmes_including/': 50,
    'https://www.reddit.com/r/space/comments/yyfai0/historic_isro_launches_indias_first_private/': 50,
    'https://www.reddit.com/r/space/comments/15wglo5/meanwhile_isro_credit_indianexpress/': 50,
    'https://www.reddit.com/r/ISRO/comments/191qjl6/why_doesnt_isro_build_large_rockets/': 50,
    'https://www.reddit.com/r/Damnthatsinteresting/comments/15z3lwd/reaction_from_isros_control_room_as_indias/': 50,
    'https://www.reddit.com/r/Btechtards/comments/1e97odk/why_no_one_here_is_aspiring_to_join_isro/': 50,
    'https://www.reddit.com/r/india/comments/1615tot/should_space_exploration_be_low_priority_for_a/': 50,
    'https://www.reddit.com/r/hinduism/comments/1e9innz/isro_scientists_in_collaboration_with_nasa_create/': 50,
    'https://www.reddit.com/r/india/comments/d0ytzc/nasa_on_twitter_space_is_hard_we_commend_isro_s/': 50,
    'https://www.reddit.com/r/IndiaSpeaks/comments/cjjlxn/outrageous_to_blast_isro_chairman_k_sivan_for/': 50,
    'https://www.reddit.com/r/india/comments/6bb8ii/nasa_nasa_isro_to_jointly_inspect_oldest/': 50,
    'https://www.reddit.com/r/worldnews/comments/5u5b3d/indian_space_research_organisation_isro/': 50,
    'https://www.reddit.com/r/IndiaSpeaks/comments/1d0weed/nasa_will_train_indian_astronauts_launch_nisar/': 50,
    'https://www.reddit.com/r/ISRO/comments/t7r5uq/unignited_has_india_fallen_behind_in_the_space/': 50,
    'https://www.reddit.com/r/ISRO/comments/cnius7/india_is_a_maturing_space_power_but_can_rival_the/': 50,
    'https://www.reddit.com/r/ISRO/comments/160txsv/discussion_why_we_dont_have_a_powerful_deep_space/': 50,
    'https://www.reddit.com/r/worldnews/comments/2i3jfj/india_us_agree_to_joint_exploration_of_mars/': 50,
    'https://www.reddit.com/r/aviation/comments/130m3a5/what_do_u_guys_think_of_this_crazy_concept_by/': 50,
    'https://www.reddit.com/r/space/comments/1619fxq/why_was_pragyan_rover_not_designed_differently/': 50,
    'https://www.reddit.com/r/ISRO/comments/1e7stvc/what_happen_to_isro_launch_plan/': 50,
    'https://www.reddit.com/r/IndiaSpeaks/comments/c9s7kk/5_years_later_our_mangalyaan_is_still_alive_and/': 50,
}

# Run the scraper; results are written to 'reddit_scraped_data.csv'.
scrape_subreddit_for_keywords(
    subreddits_dict=subreddits_dict,
    primary_keywords=primary_keywords,
    secondary_keywords=secondary_keywords,
    posts_dict=posts_dict,
    sort_by='new',
    time_filter='all',
)


Scraping subreddit: NASA with limit: 0, sort by: new, time filter: all
Processing post URL: https://www.reddit.com/r/india/comments/160s1jx/why_is_isro_so_terrible_at_presentation/ with limit: 50
Additional Post Scraped: 1
Additional Comments Scraped: 1
Additional Comments Scraped: 2
Additional Comments Scraped: 3
Additional Comments Scraped: 4
Additional Comments Scraped: 5
Additional Comments Scraped: 6
Additional Comments Scraped: 7
Additional Comments Scraped: 8
Additional Comments Scraped: 9
Additional Comments Scraped: 10
Additional Comments Scraped: 11
Additional Comments Scraped: 12
Additional Comments Scraped: 13
Additional Comments Scraped: 14
Additional Comments Scraped: 15
Additional Comments Scraped: 16
Additional Comments Scraped: 17
Additional Comments Scraped: 18
Additional Comments Scraped: 19
Additional Comments Scraped: 20
Additional Comments Scraped: 21
Additional Comments Scraped: 22
Additional Comments Scraped: 23
Additional Comments Scraped: 24
Additional Comment

# Instagram Scraping - NASA and ISRO

In [None]:
import os
import sys
import pandas as pd
from datetime import datetime as dt
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import collections
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
import scipy.stats as ss
import matplotlib.pyplot as plt
import math
import re
from nltk.corpus import stopwords
from sklearn.manifold import MDS
# Plain-Selenium Chrome options: headless, sandbox disabled, /dev/shm usage disabled.
# NOTE(review): the login cell further down builds its driver from a separate
# `options = uc.ChromeOptions()` object, so this object looks unused in the
# cells visible here -- confirm before relying on the headless flag.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd
from collections import Counter 
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
import seaborn as sns
import string
import operator
import io
import csv
import re
import string
import re
import decimal
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from itertools import combinations
import shutil
from tempfile import NamedTemporaryFile
import random
import undetected_chromedriver as uc
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
import undetected_chromedriver as uc
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Open Instagram and enter credentials

In [None]:
# Initialize the undetected Chrome driver with options
# Option set for the undetected Chrome driver.
options = uc.ChromeOptions()
for browser_flag in (
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--window-size=1920,1080',
    '--disable-blink-features=AutomationControlled',  # Helps avoid detection
):
    options.add_argument(browser_flag)

# Launch Chrome through undetected-chromedriver.
driver = uc.Chrome(options=options)

try:
    # Open the login page and give it a fixed 10 s to render.
    driver.get("https://www.instagram.com/accounts/login/")
    time.sleep(10)

    # All three controls live under the same login form container.
    login_form_xpath = '//*[@id="loginForm"]/div'
    username_field = driver.find_element(By.XPATH, login_form_xpath + '/div[1]/div/label/input')
    password_field = driver.find_element(By.XPATH, login_form_xpath + '/div[2]/div/label/input')
    login_button = driver.find_element(By.XPATH, login_form_xpath + '/div[3]/button/div')

    # Fill in the credentials (replace the placeholders with a real account).
    username_field.send_keys("YOUR_ACCOUNT_USERNAME") # enter your instagram id
    password_field.send_keys("YOUR_ACCOUNT_PASSWORD") # enter your instagram password

    # Submit the form, then wait a fixed 10 s for the login to complete.
    login_button.click()
    time.sleep(10)
except Exception as e:
    # Best effort: report the failure and leave the browser open for inspection.
    print(f"Error occurred: {e}")

In [None]:
# Explicit-wait helper used by this cell and the following ones. It is not
# created anywhere earlier in the notebook, so without this line the first
# `wait.until(...)` call raises NameError. The 20 s timeout is a conservative
# choice -- adjust it to your connection speed.
wait = WebDriverWait(driver, 20)

# search button
element_to_click = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(@id, 'mount_0_0_')]/div/div/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div/div/div/div[2]/div[2]/span/div/a/div")))
element_to_click.click()

# entering text in search button
input_field = wait.until(EC.presence_of_element_located((By.XPATH, "//*[contains(@id, 'mount_0_0_')]/div/div/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div/div[2]/div/div/div[2]/div/div/div[1]/div/div/input")))
# Clear the input field (optional) and send the text
input_field.clear()  # Optional: clear the field if there's existing text
input_field.send_keys('isro.dos') # I am scraping ISRO here; you can change it to your choice

# //*[contains(@id, 'mount_0_0_')]/div/div/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div/div[2]/div/div/div[2]/div/div/div[1]/div/div/input

### Scraping 1st post and then the rest of the posts - limit clicks - 100

In [None]:
# XPath of the element to open; the mount id suffix varies between sessions,
# hence the contains() match on the id prefix.
# Example of a concrete id seen in one session:
#//*[@id="mount_0_0_TN"]/div/div/div/div[2]/div/div/div[1]/div[2]/div/div[1]/section/main/div/div[2]/div/div[1]/div[1]/a/div[1]/div[2]
image_xpath = "//*[contains(@id, 'mount_0_0_')]/div/div/div/div[2]/div/div/div[1]/div[2]/div/div[1]/section/main/div/div[2]/div/div[1]/div[1]/a/div[1]/div[2]"

# selecting first result: wait until it is clickable, then click it
element_to_click = wait.until(EC.element_to_be_clickable((By.XPATH, image_xpath)))
element_to_click.click()

# The same element is located again and clicked a second time.
# NOTE(review): presumably the first click selects the result and the second opens the post -- confirm.
driver.find_element(By.XPATH, image_xpath).click()
# Define the function to generate dynamic XPath with any numeric value in div[]
def dynamic_xpath(toggle):
    xpath_1 = f"/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[1]/div/div/div[2]/button"
    xpath_2 = f"/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[1]/div/div"
    return xpath_1, xpath_2
image_url_list = []
caption = []
like_count = []

element = driver.find_element(By.XPATH, "/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[1]/div/div") # Change this as the first post html changes every time
# find the xpath that say padding etc in the div and grab that xpath and not the other
# the correct xpath will be when you hover in the html and get yellow color shade on the image
# Extract the 'src' attribute which contains the image URL
image_url_1 = element.find_element(By.TAG_NAME, "img").get_attribute("src")
image_url_list.append(image_url_1)
time.sleep(2)
comment_element = driver.find_element(By.XPATH, "/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[2]/div/div/div[2]/div[1]/ul/div[1]/li/div/div/div[2]")   
# Extract the text of the comment
comment_text = comment_element.text 
caption.append(comment_text)
time.sleep(2)
# Find the review text using the provided XPath
review_element = driver.find_element(By.XPATH, '/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[2]/div/div/div[2]/section[2]')
# Extract the text from the element
review_text = review_element.text
like_count.append(review_text)
element = driver.find_element(By.XPATH, "/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[1]/div/div/div/button" )
element.click()
# Iterate and click the dynamically located next buttons
for _ in range(100):  # Adjust the range for the number of clicks
    try:
        # First, try the toggle=False XPath pattern
        xpath_1, xpath_2 = dynamic_xpath(_)
        element = driver.find_element(By.XPATH, xpath_1)
        element.click()
        element_2 = driver.find_element(By.XPATH, xpath_2)
        # Extract the 'src' attribute which contains the image URL
        image_url = element_2.find_element(By.TAG_NAME, "img").get_attribute("src")
        image_url_list.append(image_url)
        time.sleep(2)
        comment_element = driver.find_element(By.XPATH, "/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[2]/div/div/div[2]/div[1]/ul/div[1]/li/div/div/div[2]/div[1]/h1")
        comment_text = comment_element.text        
        caption.append(comment_text)
        time.sleep(2)
        review_element = driver.find_element(By.XPATH, '/html/body/div[*]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[2]/div/div/div[2]/section[2]')
        # Extract the text from the element
        review_text = review_element.text
        like_count.append(review_text)
        print(_)
    except Exception as e:
        continue

In [None]:
# Map each output column name to the list scraped for it
data = {}
data['Image URL'] = image_url_list
data['Caption'] = caption
data['Like Count'] = like_count

# Turn the column mapping into a table
df = pd.DataFrame(data=data)

In [None]:
# Write the scraped posts to disk without the row-index column
df.to_csv(path_or_buf='output_isro1.csv', index=False)

## Using HuggingFace to perform Image Analytics on Instagram scraped images

In [None]:
import requests
import pandas as pd
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# The processor and the model are loaded from the same pretrained checkpoint
blip_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

# Reload the posts that were saved by the scraping step
df = pd.read_csv('output_isro1.csv')

In [None]:
def generate_image_captions(dataframe, image_link_column, limit):
    """
    Generates captions for images up to the specified limit and assigns them to the DataFrame.

    The result for each processed row is written to the 'Image Label' column:
    the generated caption on success, 'Error fetching image' when the download
    fails, or 'Error processing image' for any other failure. Rows are addressed
    by their index label, so the results land on the right rows even when the
    DataFrame does not use the default 0..n-1 index.

    Parameters:
    - dataframe (pd.DataFrame): The DataFrame containing image URLs. Modified in place.
    - image_link_column (str): Name of the column that holds the image URLs.
    - limit (int): The number of images to process.

    Returns:
    - None
    """
    request_timeout = 30  # seconds; without a timeout a stalled server blocks the run forever

    # `position` counts processed rows (for the limit and the messages);
    # `row_label` is the row's index label, which is what `.loc` expects.
    for position, (row_label, img_url) in enumerate(dataframe[image_link_column].items()):
        if position >= limit:
            break  # Exit the loop once the limit is reached

        try:
            # Fetch the image from the URL; the context manager releases the
            # streamed connection once the image has been decoded.
            with requests.get(img_url, stream=True, timeout=request_timeout) as response:
                response.raise_for_status()  # Raise an error for bad status codes

                # Open the image and convert it to RGB
                raw_image = Image.open(response.raw).convert('RGB')

            # Prepare the image for the model
            inputs = processor(raw_image, return_tensors="pt")

            # Generate the caption
            out = model.generate(**inputs)
            generated_caption = processor.decode(out[0], skip_special_tokens=True)

            # Assign the caption to the 'Image Label' column using .loc
            dataframe.loc[row_label, 'Image Label'] = generated_caption

            # Print the generated caption
            print(f"Caption for Image {position + 1}: {generated_caption}")

        except requests.exceptions.RequestException as req_err:
            print(f"Request error for Image {position + 1}: {req_err}")
            dataframe.loc[row_label, 'Image Label'] = 'Error fetching image'

        except Exception as e:
            # Best-effort: record the failure on the row and keep going
            print(f"An error occurred for Image {position + 1}: {e}")
            dataframe.loc[row_label, 'Image Label'] = 'Error processing image'

In [None]:
# Number of images to caption; change this value as needed
limit = 99

# Caption the images listed in the 'Image URL' column of df
generate_image_captions(dataframe=df, image_link_column='Image URL', limit=limit)

# Write the DataFrame, now including the labels, to a new CSV file
df.to_csv(path_or_buf='isro_output_labels1.csv', index=False)