Install Dependencies

In [23]:
!pip install google.cloud

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google.cloud
  Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: google.cloud
Successfully installed google.cloud-0.34.0


In [4]:
!pip install pymongo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymongo
  Downloading pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.1/492.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.3.0 pymongo-4.3.3


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=4a167565800f645c87997aee97b6ce39d940932b157ac8b89b30246d80b07f3a
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


Connect to MongoDB

In [5]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string embeds credentials, so prefer supplying it through the
# MONGODB_URI environment variable rather than saving it in the notebook. The
# placeholder is only used when the variable is not set.
uri = os.environ.get("MONGODB_URI", "use your MongoDB URI")

# Create a new client and connect to the server
mongo_client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection. A failure is reported by
# printing the exception; it is not re-raised, so later cells still run.
try:
    mongo_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


Reddit Scraping and Storing It in MongoDB

In [None]:
import os

import praw
from pymongo import MongoClient
import datetime

# Reddit API credentials. Values from the environment take precedence so real
# secrets never need to be saved in the notebook; the placeholders are used
# only when the variables are not set.
client_id = os.environ.get('REDDIT_CLIENT_ID', 'insert your id')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', 'insert key')
user_agent = 'mybot'

# Target collection (uses the `mongo_client` created by the connection cell)
db = mongo_client['reddit_db']
collection = db['climate_change_posts']

# Initialize the Reddit API wrapper
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent, check_for_async=False)

# Define the search parameters
search_terms = [
    "climate change", "Global warming", "Greenhouse effect", "Carbon dioxide emissions",
    "Renewable energy", "Sea level rise", "Climate adaptation", "Climate mitigation",
    "Extreme weather events", "Deforestation", "Melting glaciers", "Ocean acidification",
    "Climate policy", "Carbon footprint", "Sustainable development", "Fossil fuels",
    "Climate", "Biodiversity loss", "Energy efficiency", "Climate action"
]
search_results_limit = 100  # Number of search results to retrieve per term

# Search timeframe: the last seven days
current_date = datetime.datetime.now()
week_ago_date = current_date - datetime.timedelta(days=7)
week_ago_timestamp = week_ago_date.timestamp()

# The same submission can match several search terms; remember which ids were
# already stored so each post is inserted only once.
seen_post_ids = set()

# Store the search results and their comments in MongoDB
for search_term in search_terms:
    # The query is just the search term. The previous `timestamp:<float>:`
    # suffix is not valid search syntax and was sent as literal query text.
    # The one-week window is requested through `time_filter` and re-checked
    # against `created_utc` below.
    search_query = search_term
    search_results = reddit.subreddit('all').search(search_query, time_filter='week', limit=search_results_limit)

    for post in search_results:
        if post.id in seen_post_ids:
            continue
        if post.created_utc < week_ago_timestamp:
            continue
        seen_post_ids.add(post.id)

        post_data = {
            'title': post.title,
            'score': post.score,
            'url': post.url,
            # `author` is None for deleted accounts
            'author': post.author.name if post.author else None,
            'created_utc': post.created_utc,
            'comments': []
        }

        # Expand at most 10 "load more comments" placeholders, then walk the
        # flattened comment tree.
        post.comments.replace_more(limit=10)
        for comment in post.comments.list():
            comment_data = {
                'body': comment.body,
                'score': comment.score,
                'author': comment.author.name if comment.author else None,
                'created_utc': comment.created_utc
            }
            post_data['comments'].append(comment_data)

        collection.insert_one(post_data)

print("Data successfully scraped and stored in MongoDB.")


Retrieve from MongoDB

In [6]:
import pandas as pd


def _post_record(doc):
    """Return the flat, post-level fields of one stored document."""
    return {
        'title': doc['title'],
        'score': doc['score'],
        'url': doc['url'],
        'author': doc['author'] or None,  # any falsy author becomes None
        'created_utc': doc['created_utc'],
    }


def _comment_record(entry):
    """Return the fields kept for one embedded comment."""
    return {
        'body': entry['body'],
        'score': entry['score'],
        'author': entry['author'] or None,  # any falsy author becomes None
        'created_utc': entry['created_utc'],
    }


# Collection that holds the scraped posts
db = mongo_client['reddit_db']
collection = db['climate_change_posts']

# Cursor over every document in the collection
documents = collection.find()

# One pass over the cursor fills both record lists: one row per post and one
# row per comment embedded in a post.
posts_data = []
comments_data = []
for post in documents:
    posts_data.append(_post_record(post))
    for comment in post['comments']:
        comments_data.append(_comment_record(comment))

# Tabular views of the two record lists
posts_df = pd.DataFrame(posts_data)
comments_df = pd.DataFrame(comments_data)

Sentiment Analysis

In [18]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Create a SparkConf object and set the desired configuration properties
conf = SparkConf()
conf.set("spark.submit.deployMode", "client")

# Set the driver memory to 16 GB
conf.set("spark.driver.memory", "16g")

# Create a SparkSession and SparkContext
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Load the previously saved feature pipeline and classifier.
# NOTE(review): both paths are under /content/drive; presumably Google Drive
# has to be mounted first. No mount step is visible in this notebook.
pipelineFit = PipelineModel.load("/content/drive/MyDrive/pipeline")
lrModel = LogisticRegressionModel.load("/content/drive/MyDrive/model")

# The unfitted tokenizer / stop-word / vectorizer / indexer stages and the
# Pipeline assembled from them were removed. They were never fitted or applied
# (the saved `pipelineFit` does the feature extraction), and the indexer
# referred to a `sentiment_label` column that is not in this data.

# Only the comment text is fed to the model
spark_df = spark.createDataFrame(comments_df)
data_final = spark_df.select("body")

# Turn the text into features with the saved pipeline
dataset = pipelineFit.transform(data_final)

# Make predictions using the logistic regression model
predictions = lrModel.transform(dataset)

# Select the 'body' and 'prediction' columns from the predictions
body_predictions = predictions.select("body", "prediction")

# Convert the Spark DataFrame back to Pandas DataFrame
body_predictions_pandas = body_predictions.toPandas()


In [26]:
# Attach each prediction to its comment. The join is positional (index to
# index), so it is only correct if the prediction frame kept the row order of
# `comments_df`.
merged_df = comments_df.merge(body_predictions_pandas, left_index=True, right_index=True, how="left")

# Guard the positional join: both copies of the text must agree row by row,
# otherwise predictions would silently be attributed to the wrong comments.
if not merged_df['body_x'].equals(merged_df['body_y']):
    raise ValueError(
        "Prediction rows are not aligned with comments_df; "
        "cannot attach predictions by index."
    )

# Keep one `body` column, drop the author, and keep the first row of every
# distinct comment text.
# The earlier `merged_df['body_x'] = merged_df['body_x'].drop_duplicates()` step
# was removed. Assigning the shortened Series back by index replaced repeated
# texts with NaN, so one row with a missing body survived the final
# drop_duplicates() and was uploaded.
merged_df = (
    merged_df
    .drop(columns=['body_y', 'author'])
    .rename(columns={'body_x': 'body'})
    .drop_duplicates(subset='body', keep='first')
)


Store it in BigQuery

In [24]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "include json file with your Google cloud credentials"

from google.cloud import bigquery
from google.cloud.exceptions import NotFound

# Create a BigQuery client
client = bigquery.Client()

project_id = 'famous-athlete-386604'
dataset_name = 'reddit'
table_name = 'comment'

# Create a BigQuery client bound to the target project
bigquery_client = bigquery.Client(project=project_id)

# Create the dataset reference (replaces the deprecated `Client.dataset()`)
dataset_ref = bigquery.DatasetReference(project_id, dataset_name)

# Create the dataset if it doesn't exist. `get_dataset()` raises NotFound for a
# missing dataset rather than returning a falsy value, so the previous
# `if not get_dataset(...)` check could never reach `create_dataset()`.
try:
    bigquery_client.get_dataset(dataset_ref)
except NotFound:
    bigquery_client.create_dataset(dataset_ref)

# Define the table schema
# NOTE(review): this schema is not passed to any call in this cell or to the
# load job configuration in the next cell; confirm whether it should be.
schema = [
    bigquery.SchemaField("body", "STRING"),
    bigquery.SchemaField("score", "INTEGER"),
    bigquery.SchemaField("created_utc", "INTEGER"),
    bigquery.SchemaField("prediction", "FLOAT"),
]

# Create the table reference
table_ref = dataset_ref.table(table_name)

# NOTE(review): no table is created here; only the reference is built.
# Presumably the load job creates the table on first write.
print(f'Table created: {project_id}.{dataset_name}.{table_name}')

Table created: famous-athlete-386604.reddit.comment


In [27]:
# Push `merged_df` into the referenced table with a default load configuration
job_config = bigquery.LoadJobConfig()
job = bigquery_client.load_table_from_dataframe(
    merged_df,
    table_ref,
    job_config=job_config,
)

# Block until the load job has finished
job.result()

print(f'Data uploaded to BigQuery table: {project_id}.{dataset_name}.{table_name}')

Data uploaded to BigQuery table: famous-athlete-386604.reddit.comment


In [33]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Create a SparkConf object and set the desired configuration properties
conf = SparkConf()
conf.set("spark.submit.deployMode", "client")

# Set the driver memory to 16 GB
conf.set("spark.driver.memory", "16g")

# Create a SparkSession and SparkContext (getOrCreate() returns the existing
# session when one is already running)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Load the previously saved feature pipeline and classifier.
# NOTE(review): both paths are under /content/drive; presumably Google Drive
# has to be mounted first. No mount step is visible in this notebook.
pipelineFit = PipelineModel.load("/content/drive/MyDrive/pipeline")
lrModel = LogisticRegressionModel.load("/content/drive/MyDrive/model")

# The unfitted tokenizer / stop-word / vectorizer / indexer stages and the
# Pipeline assembled from them were removed. They were never fitted or applied
# (the saved `pipelineFit` does the feature extraction), and the indexer
# referred to a `sentiment_label` column that is not in this data.

# The saved pipeline reads its text from a column named "body", so the post
# title is exposed under that name. Renaming is a no-op when the cell is re-run.
posts_df = posts_df.rename(columns={'title': 'body'})
spark_df = spark.createDataFrame(posts_df)
data_final = spark_df.select("body")

# Turn the text into features with the saved pipeline
dataset = pipelineFit.transform(data_final)

# Make predictions using the logistic regression model
predictions = lrModel.transform(dataset)

# Select the 'body' and 'prediction' columns from the predictions
body_predictions = predictions.select("body", "prediction")

# Convert the Spark DataFrame back to Pandas DataFrame
body_predictions_pandas = body_predictions.toPandas()


In [39]:
# Attach each prediction to its post. The join is positional (index to index),
# so it is only correct if the prediction frame kept the row order of
# `posts_df`.
merged_df = posts_df.merge(body_predictions_pandas, left_index=True, right_index=True, how="left")

# Guard the positional join: both copies of the text must agree row by row,
# otherwise predictions would silently be attributed to the wrong posts.
if not merged_df['body_x'].equals(merged_df['body_y']):
    raise ValueError(
        "Prediction rows are not aligned with posts_df; "
        "cannot attach predictions by index."
    )

# Keep one `body` column, drop author and url, and keep the first row of every
# distinct text.
# The earlier `merged_df['body_x'] = merged_df['body_x'].drop_duplicates()` step
# was removed. Assigning the shortened Series back by index replaced repeated
# texts with NaN, so one row with a missing body survived the final
# drop_duplicates() and was uploaded.
merged_df = (
    merged_df
    .drop(columns=['body_y', 'author', 'url'])
    .rename(columns={'body_x': 'body'})
    .drop_duplicates(subset='body', keep='first')
)

In [37]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "replace with your credentials"

from google.cloud import bigquery
from google.cloud.exceptions import NotFound

# Create a BigQuery client
client = bigquery.Client()

project_id = 'replace with your project id'
dataset_name = 'reddit'
# NOTE(review): this is the same table name the comment upload uses, so post
# titles end up in the `comment` table; confirm that this is intended.
table_name = 'comment'

# Create a BigQuery client bound to the target project
bigquery_client = bigquery.Client(project=project_id)

# Create the dataset reference (replaces the deprecated `Client.dataset()`)
dataset_ref = bigquery.DatasetReference(project_id, dataset_name)

# Create the dataset if it doesn't exist. `get_dataset()` raises NotFound for a
# missing dataset rather than returning a falsy value, so the previous
# `if not get_dataset(...)` check could never reach `create_dataset()`.
try:
    bigquery_client.get_dataset(dataset_ref)
except NotFound:
    bigquery_client.create_dataset(dataset_ref)

# Define the table schema
# NOTE(review): this schema is not passed to any call in this cell or to the
# load job configuration in the next cell; confirm whether it should be.
schema = [
    bigquery.SchemaField("body", "STRING"),
    bigquery.SchemaField("score", "INTEGER"),
    bigquery.SchemaField("created_utc", "INTEGER"),
    bigquery.SchemaField("prediction", "FLOAT"),
]

# Create the table reference
table_ref = dataset_ref.table(table_name)

# NOTE(review): no table is created here; only the reference is built.
# Presumably the load job creates the table on first write.
print(f'Table created: {project_id}.{dataset_name}.{table_name}')

Table created: famous-athlete-386604.reddit.comment


In [40]:
# Push `merged_df` into the referenced table with a default load configuration
job_config = bigquery.LoadJobConfig()
job = bigquery_client.load_table_from_dataframe(
    merged_df,
    table_ref,
    job_config=job_config,
)

# Block until the load job has finished
job.result()

print(f'Data uploaded to BigQuery table: {project_id}.{dataset_name}.{table_name}')

Data uploaded to BigQuery table: famous-athlete-386604.reddit.comment


Dropping the Collection

In [42]:
# Handle on the collection that held the scraped posts
db = mongo_client['reddit_db']
collection = db['climate_change_posts']

# Remove the whole collection
collection.drop()

# Re-list the database's collections and report whether the name is gone
collections_list = db.list_collection_names()
still_present = 'climate_change_posts' in collections_list
print(
    "Collection was not dropped successfully."
    if still_present
    else "Collection dropped successfully."
)

Collection dropped successfully.
