In [None]:
# Fetch the current "hot" posts from r/python through the Reddit API and load them into a PySpark DataFrame.
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_unixtime
from pyspark.sql.types import StringType
import matplotlib.pyplot as plt
import pandas as pd

import requests
from IPython.core.interactiveshell import InteractiveShell
from datetime import datetime
# Have IPython/Jupyter display the value of every top-level expression in a
# cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
from config import CLIENT_ID, SECRET_KEY

# Read the Reddit account password from a local file so it is not hard-coded in
# the notebook.  Text editors normally leave a trailing newline in the file; it
# is removed here because it would otherwise be sent as part of the password.
# Only line terminators are stripped so that a password containing spaces is
# left intact.
with open('pw.txt', 'r') as f:
    pw = f.read().rstrip('\r\n')

# Exchange the account credentials for an OAuth2 bearer token using the
# "password" grant.  The app's client id/secret go in HTTP basic auth, the
# account's username/password go in the form body.
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)
data = {
    'grant_type': 'password',
    'username': 'Electronic-Land-1475',
    'password': pw,
}

headers = {'User-Agent': 'MyApi/0.0.1'}
# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers, timeout=30)
# Fail loudly on HTTP-level errors (bad client credentials, rate limiting, ...).
res.raise_for_status()

token_payload = res.json()
if 'access_token' not in token_payload:
    # The response parsed as JSON but carries no token; report what the server
    # said instead of letting a bare KeyError hide the cause.
    raise RuntimeError(
        f"Reddit did not return an access token: {token_payload.get('error', token_payload)}"
    )
TOKEN = token_payload['access_token']

# Every subsequent API call must carry the bearer token alongside the User-Agent.
headers = {**headers, 'Authorization': f'bearer {TOKEN}'}

# Fetch the current "hot" listing of r/python using the authenticated headers.
# The timeout prevents an indefinite hang; raise_for_status() stops the script
# on an HTTP error instead of failing later on a missing 'data' key.
res = requests.get('https://oauth.reddit.com/r/python/hot', headers=headers, timeout=30)
res.raise_for_status()

# The listing's children are the individual posts; each post's fields
# (subreddit, title, upvote_ratio, created, ...) are read from child['data'].
posts = res.json()['data']['children']

# Create a single Spark session for the whole job.  Everything that needs the
# session runs inside try/finally so the session is always stopped, even when
# building the DataFrame fails.
spark = SparkSession.builder.appName("RedditPosts").getOrCreate()

try:
    # Gather the raw fields in plain Python.  'created' is assumed to be a Unix
    # timestamp in seconds; it is truncated to whole seconds so Spark can
    # format it as a bigint column.
    raw_rows = [
        (
            post['data']['subreddit'],
            post['data']['title'],
            float(post['data']['upvote_ratio']),
            int(post['data']['created']),
        )
        for post in posts
    ]

    # An explicit schema avoids schema inference, which raises on an empty
    # listing, and pins the column types.
    raw_df = spark.createDataFrame(
        raw_rows,
        schema='subreddit string, title string, upvote_ratio double, created long',
    )

    # Format all timestamps in one pass with the from_unixtime column function
    # rather than launching one spark.sql() job per post with the value
    # interpolated into the SQL text.  The column order matches what
    # createDataFrame produced from the per-post dictionaries (alphabetical).
    df = raw_df.select(
        from_unixtime('created', 'yyyy-MM-dd HH:mm:ss').alias('formatted_created'),
        'subreddit',
        'title',
        'upvote_ratio',
    )

    # List of per-post dictionaries with the formatted creation time, kept
    # under its original name for any later cell that reads it.
    formatted_data_list = [
        {
            'subreddit': row.subreddit,
            'title': row.title,
            'upvote_ratio': row.upvote_ratio,
            'formatted_created': row.formatted_created,
        }
        for row in df.collect()
    ]

    # Show the DataFrame
    # df.show(50, truncate=False)

    # Plotting (disabled).  It must stay inside this try block if re-enabled,
    # because df.toPandas() needs the Spark session to still be running.
    # pandas_df = df.toPandas()

    # upvote_ratio_counts = pandas_df['upvote_ratio'].value_counts().sort_index()

    # plt.figure(figsize=(10, 6))
    # upvote_ratio_counts.plot(kind='bar', color='skyblue')
    # plt.title('Upvote Ratio Distribution')
    # plt.xlabel('Upvote Ratio')
    # plt.ylabel('Count')
    # plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
    # plt.show()
finally:
    # Stop the Spark session
    spark.stop()

