#### Importing required libraries

In [None]:
import os

import pandas as pd
import praw # for fetching data from Reddit

In [5]:
from pprint import pprint 

### Setting up PRAW to fetch data from Reddit

In [None]:
# Reddit API credentials are read from the environment so that they are never
# stored in (or committed with) the notebook. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before running; a missing variable raises KeyError here
# instead of failing later with an authentication error.
user = 'scraper 1.0 usr/fahad'
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=user
)

In [17]:
# Put the client in read-only mode; the rest of the notebook only searches
# posts and reads comments.
reddit.read_only = True

In [18]:
# Fetching comments from a discussion
def get_3_comments(submission, limit=3):
    """Return up to ``limit`` top-level comments of a submission as display strings.

    Parameters
    ----------
    submission : praw.models.Submission (or any object with the same
        ``comment_limit`` / ``comments`` attributes)
        The post whose comments are read. Its ``comment_limit`` attribute is
        overwritten with ``limit``.
    limit : int, default 3
        Maximum number of comments to return.

    Returns
    -------
    list of str
        One ``"▲ <score> | <first 250 characters of the body>"`` string per
        comment, in the order the submission yields them.
    """
    from itertools import islice

    # Ask Reddit for at most `limit` comments. This only takes effect when the
    # comments have not been fetched yet, hence the explicit cap below.
    submission.comment_limit = limit
    # limit=0 removes every "load more comments" placeholder without fetching
    # them, so iteration only yields real comments.
    submission.comments.replace_more(limit=0)
    return [
        f"▲ {comment.score} | {comment.body[:250]}"
        for comment in islice(submission.comments, limit)
    ]

In [19]:
def get_relevant_news(topic):
    """Search r/all for ``topic`` and format the 3 most relevant posts of the past week.

    Parameters
    ----------
    topic : str
        Search query sent to Reddit.

    Returns
    -------
    list of str
        One formatted block per post: running number, score, title (first 300
        characters) and body text (first 500 characters). Posts with an empty
        body get their comments appended instead. Every block ends with an
        ``END`` marker.
    """
    search_results = reddit.subreddit("all").search(
        query=topic,
        sort='relevance',
        time_filter='week',
        limit=3
    )

    formatted_posts = []
    for position, submission in enumerate(search_results, start=1):
        pieces = [
            f"News number : {position}\n Score : {submission.score} \n ",
            f"title : {submission.title[:300]}\n content : {submission.selftext[:500]}\n ",
        ]
        # A post without body text is represented by its comments instead.
        if submission.selftext == '':
            pieces.extend(
                f"comment : {line}\n " for line in get_3_comments(submission)
            )
        pieces.append('END \n ')
        # Each post becomes a single string in the returned list.
        formatted_posts.append(''.join(pieces))
    return formatted_posts

In [12]:
# creating dataframe for storing the data: one row per topic, with the Reddit
# text fetched for it ('input') and the generated summary ('output')
df = pd.DataFrame(columns=['topic','input','output'])

In [None]:
# 100 different topics to fetch related data (for training the model),
# listed as 10 thematic groups of 10 topics each
news_topics = [
    # Politics and governance
    "Elections and voting behavior",
    "Political corruption scandals",
    "Government policy reforms",
    "International diplomacy",
    "Refugee and immigration policies",
    "Freedom of speech and censorship",
    "Human rights issues",
    "Political protests and movements",
    "Military coups and power transitions",
    "Global sanctions and trade restrictions",
    # Business and economy
    "Stock market trends",
    "Cryptocurrency regulations",
    "Startups and venture capital",
    "Inflation and interest rates",
    "Job market and unemployment",
    "Mergers and acquisitions",
    "Corporate fraud cases",
    "Energy prices and oil markets",
    "Housing market updates",
    "Global trade and tariffs",
    # International relations
    "UN meetings and resolutions",
    "Geopolitical conflicts",
    "Peace negotiations",
    "Border disputes",
    "Global migration crises",
    "Foreign aid and development",
    "International law cases",
    "Sanctions and diplomatic isolation",
    "Treaties and alliances",
    "Nuclear proliferation",
    # Science and technology
    "Artificial intelligence breakthroughs",
    "Quantum computing research",
    "Space exploration and NASA missions",
    "Climate change innovations",
    "Biotechnology and genetics",
    "Cybersecurity and hacking incidents",
    "Renewable energy technologies",
    "Semiconductor industry",
    "Robotics and automation",
    "Scientific fraud or retractions",
    # Health
    "Vaccine development and rollout",
    "Mental health awareness",
    "Public health policies",
    "Medical research discoveries",
    "Global pandemics and epidemics",
    "Healthcare accessibility",
    "Nutrition and fitness trends",
    "Pharmaceutical controversies",
    "Health insurance reforms",
    "AI in medical diagnosis",
    # Environment and climate
    "Global warming updates",
    "Deforestation and wildlife conservation",
    "Ocean pollution",
    "Renewable energy adoption",
    "Natural disasters and recovery",
    "Carbon emissions policies",
    "Water scarcity",
    "Sustainable agriculture",
    "Climate activism (e.g., Fridays for Future)",
    "Plastic waste reduction initiatives",
    # Media and information
    "Fake news and misinformation",
    "Freedom of the press",
    "Social media regulation",
    "Data privacy and surveillance",
    "Digital censorship laws",
    "Influence of AI-generated media",
    "Journalism ethics",
    "Media bias and polarization",
    "Podcast and independent media growth",
    "Online advertising trends",
    # Sports
    "FIFA World Cup updates",
    "Olympic Games preparation",
    "Doping and athlete bans",
    "Player transfers and contracts",
    "Sports betting and regulation",
    "Women’s sports growth",
    "Stadium construction and management",
    "E-sports tournaments",
    "Athlete activism",
    "Sports injuries and recovery research",
    # Entertainment, arts and culture
    "Film festivals and awards (Oscars, Cannes)",
    "Celebrity controversies",
    "Music industry trends",
    "Streaming platforms competition",
    "Video game releases",
    "Art exhibitions and auctions",
    "Book publications and bestsellers",
    "Cultural preservation and heritage",
    "Fashion trends and sustainability",
    "Censorship in art and media",
    # Society
    "Education reform and access",
    "Urban development and housing",
    "Crime rates and justice system",
    "Gender equality movements",
    "LGBTQ+ rights and representation",
    "Poverty and inequality",
    "Technological unemployment",
    "Remote work and hybrid culture",
    "Digital addiction and wellbeing",
    "Youth culture and internet trends"
]


In [13]:
# Assigning the list to the empty frame creates one row per topic.
df['topic'] = news_topics

In [20]:
# fetching and storing related data to the corresponding topic names
# (get_relevant_news runs one Reddit search per topic, so this cell needs
# network access and a working `reddit` client)
df['input'] = df['topic'].apply(get_relevant_news)

In [None]:
# saving the data (index=False keeps the row index out of the file, so that
# re-reading the CSV does not add an 'Unnamed: 0' column)
df.to_csv('data/relevant_news.csv', index=False)

#### After n8n automation (generating the desired summary for each topic)

In [4]:
# Load the summaries generated by the n8n automation step; data/summary.csv
# must already exist.
out_df = pd.read_csv('data/summary.csv')

In [7]:
# Preview the automation output; the summary text is in 'message.content'.
out_df.head()

Unnamed: 0,index,message.role,message.content,message.refusal,logprobs,finish_reason
0,0,assistant,The recent No Kings protests against Trump's l...,,,stop
1,0,assistant,In recent discussions surrounding political in...,,,stop
2,0,assistant,Recent public discourse reveals a growing mome...,,,stop
3,0,assistant,The recent discussions surrounding internation...,,,stop
4,0,assistant,Recent discussions highlight critical issues s...,,,stop


In [None]:
# The summaries are stored in the 'message.content' column of the automation
# output (out_df has no 'output' column). The assignment aligns on the row
# index, so row i of out_df must be the summary for topic i.
df['output'] = out_df['message.content']

In [7]:
# Preview the combined topic / input / output table.
# NOTE(review): the saved output shows 'Unnamed: 0' columns, which suggests df
# was re-read from a CSV that had been written with its index — confirm.
df.head()

Unnamed: 0.1,Unnamed: 0,topic,input,output
0,0,Elections and voting behavior,"[""News number : 1\n Score : 1696 \n title : “T...",The recent No Kings protests against Trump's l...
1,1,Political corruption scandals,"[""News number : 1\n Score : 24 \n title : How ...",In recent discussions surrounding political in...
2,2,Government policy reforms,['News number : 1\n Score : 170 \n title : Opi...,Recent public discourse reveals a growing mome...
3,3,International diplomacy,['News number : 1\n Score : 581 \n title : Int...,The recent discussions surrounding internation...
4,4,Refugee and immigration policies,['News number : 1\n Score : 335 \n title : Una...,Recent discussions highlight critical issues s...


In [164]:
def get_len_input_text(txt):
    """Return the character length of an input text.

    Parameters
    ----------
    txt : list of str, str, None or NaN
        A list of news strings is measured as the strings joined with single
        spaces. A plain string is measured as-is. ``None`` and NaN count as
        empty.

    Returns
    -------
    int
        Number of characters.
    """
    # Missing cells (None, or NaN after a CSV round trip) carry no text.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return 0
    # A str is itself iterable: ' '.join(txt) would insert a space between
    # every character and nearly double the reported length.
    if isinstance(txt, str):
        return len(txt)
    return len(' '.join(txt))

In [165]:
# Character count of each topic's input text, inspected in the cells below.
df['len_input'] = df['input'].apply(get_len_input_text)

In [168]:
# Integer position (not index label) of the row with the longest input.
df.len_input.argmax()

np.int64(5)

In [169]:
# Distribution of input lengths. The saved output shows a minimum of 0, i.e.
# at least one topic ended up with no input text.
df.len_input.describe()

count     100.000000
mean     1706.090000
std       375.054234
min         0.000000
25%      1604.750000
50%      1843.500000
75%      1915.750000
max      2303.000000
Name: len_input, dtype: float64

In [None]:
# saving the inputs and outputs to train the model (index=False keeps the row
# index out of the file, so that re-reading it does not add 'Unnamed: 0' columns)
df.to_csv('data/news_input_output.csv', index=False)