# Exploring Aylien API for dataset

In [40]:
import requests
import json
import pandas as pd
import os
import datetime

# API keys:
from dotenv import load_dotenv
load_dotenv()

# request option 2:
from aylien_news_api.rest import ApiException
from pprint import pprint
import pandas as pd
import datetime
import time
import aylien_news_api



# set the option to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Aylien News API Python SDK - Testing parameters

* working, but need to create loop to combine requests into a df.
* API guide: https://aylien.com/blog/which-taxonomy-should-you-use-to-classify-news-content-iab-qag-or-iptc-subject-codes
* Taxonomy: https://docs.aylien.com/newsapi/search-taxonomies/#search-labels-for-iab-qag

In [36]:
# Calculate the date 6 months ago
six_months_ago = datetime.datetime.now() - datetime.timedelta(days=180)

# Format the date in the required format for the Aylien API request
start_date = six_months_ago.strftime('%Y-%m-%dT%H:%M:%SZ')

# Configure API key authorization
configuration = aylien_news_api.Configuration()
configuration.api_key['X-AYLIEN-NewsAPI-Application-ID'] = os.getenv('AYLIEN_ID')
configuration.api_key['X-AYLIEN-NewsAPI-Application-Key'] = os.getenv('AYLIEN_KEY')

# Create an instance of the API class
api_instance = aylien_news_api.DefaultApi(aylien_news_api.ApiClient(configuration))

# Define the search parameters
search_params = {
  'language': ['en']
  ,'sort_by': 'published_at'
  ,'published_at_start': start_date #time restriction.
    
    ## declare what taxonomy you want to use
  ,'categories_taxonomy': 'iptc-subjectcode'
    
    ## declare category, ids corresponding to above taxonomy.
    # can have multiple categories.
    
  ,'categories_id': [
                      '04018000' # business (general)
                      ,'04016000' # company information
                      ,'04008034', # business enterprise.
                    ]
  ,'sentiment_title_polarity': 'negative' # negative sentiment articles.
  ,'per_page': 4 # n articles pulled per page.
  ,'source_rankings_alexa_rank_min': 10000 # Alexa website (ranking minimum requirement)
    
    # return structure:
  ,'_return': ['id', 'title', 'published_at', 'source', 'body']
}

# Retrieve the news articles
try:
    response = api_instance.list_stories(**search_params)
    articles = response.stories
except ApiException as e:
    print("Exception when calling DefaultApi->list_stories: %s\n" % e)
    articles = []

# Store the articles in a pandas DataFrame
df = pd.DataFrame([(a.id, a.title, a.published_at, a.source.name, a.body) for a in articles], 
                  columns=['id', 'title', 'published_at', 'source', 'body'])


## explore sample pulls

In [37]:
df.head()

Unnamed: 0,id,title,published_at,source,body
0,5709185215,"C21 Investments FY23 Revenue Declines 12.4%, R...",2023-06-02 12:21:30+00:00,Benzinga,C21 Investments Inc. CXXIF CXXI released unaud...
1,5709191998,Protecting Online Interactions: FLAIR's Soluti...,2023-06-02 12:19:46+00:00,Digital Information World,A groundbreaking study conducted by researcher...
2,5709182798,"Kia, Hyundai struggling with antitheft softwar...",2023-06-02 12:17:09+00:00,Autonews,The software fixes that Kia and Hyundai are di...
3,5709167065,"Home sales up 25% from last year, but supply r...",2023-06-02 12:14:58+00:00,CBC/Radio-Canada,The slowdown in Toronto's housing market conti...


In [31]:
# Pretty print the first 3 article descriptions
for i in range(10):
    print(f"Article {i+1} body:")
    print(df.iloc[i]['body'])
    print("-----------------------------------------------------------------")
    print()

Article 1 body:
Search  Search   Search  Search  Sana Meer  June 2, 2023
  Amkor Technology Inc. (NASDAQ: AMKR) started the day on June 01, 2023, with a price increase of 1.33% at $25.11. During the day, the stock rose to $25.33 and sunk to $24.565 before settling in for the price of $24.78 at the close. Taking a more long-term approach, AMKR posted a 52-week range of $14.89-$31.38.
  Unlock the Hottest Top 10 Penny Stocks Today! Discover Now
  Dive into the world of lucrative penny stocks with MarketClub's groundbreaking "Smart Scan" technology! Get an instant snapshot of the top 50 high volume stocks with a clear direction and outstanding liquidity - in other words, the strongest trending. To unlock this exclusive list, simply provide your first name, last name, and email for instant access.
  Gain Access to Top 10 Penny Stocks Now!
  Sponsored
  The Technology Sector giants' yearly sales growth during the last 5-year period was 11.00%. Meanwhile, its Annual Earning per share during 

IndexError: single positional indexer is out-of-bounds

## Loop the above to pull bulk articles

In [47]:
import logging
from dateutil.relativedelta import relativedelta

logging.basicConfig(level=logging.INFO)

def get_api_instance():
    configuration = aylien_news_api.Configuration()
    configuration.api_key['X-AYLIEN-NewsAPI-Application-ID'] = os.getenv('AYLIEN_ID')
    configuration.api_key['X-AYLIEN-NewsAPI-Application-Key'] = os.getenv('AYLIEN_KEY')
    return aylien_news_api.DefaultApi(aylien_news_api.ApiClient(configuration))

def fetch_news(api_instance, start_date, search_params, max_iterations=100):
    next_page_cursor = '*'
    iteration = 1
    articles_list = []
    while next_page_cursor and iteration <= max_iterations:
        search_params['cursor'] = next_page_cursor
        try:
            response = api_instance.list_stories(**search_params)
            articles = response.stories
            for a in articles:
                article_dict = {
                    'id': a.id,
                    'title': a.title,
                    'published_at': a.published_at,
                    'source': a.source.name,
                    'body': a.body
                }
                articles_list.append(article_dict)
            next_page_cursor = response.next_page_cursor
            logging.info(f"Iteration {iteration} completed. Fetched {len(articles_list)} articles.")
            iteration += 1
        except ApiException as e:
            logging.error(f"Exception when calling DefaultApi->list_stories for iteration {iteration}: {e}")
            break  # or implement retry logic here
    return articles_list

def main():
    six_months_ago = datetime.datetime.now() - relativedelta(months=6)
    start_date = six_months_ago.strftime('%Y-%m-%dT%H:%M:%SZ')
    # Define search parameters here
    search_params = {
      'language': ['en']
      ,'sort_by': 'published_at'
      ,'published_at_start': start_date #time restriction.

        ## declare what taxonomy you want to use
      ,'categories_taxonomy': 'iptc-subjectcode'

        ## declare category, ids corresponding to above taxonomy.
        # can have multiple categories.

      ,'categories_id': [
                          '04018000' # business (general)
                          ,'04016000' # company information
                          ,'04008034', # business enterprise.
                        ]
      ,'sentiment_title_polarity': 'negative' # negative sentiment articles.
      ,'per_page': 100 # n articles pulled per page.
      ,'source_rankings_alexa_rank_min': 10000 # Alexa website (ranking minimum requirement)

        # return structure:
      ,'_return': ['id', 'title', 'published_at', 'source', 'body']
    }
    api_instance = get_api_instance()
    articles = fetch_news(api_instance, start_date, search_params)
    return articles

if __name__ == "__main__":
    Aylien_business_articles = pd.DataFrame(main())

INFO:root:Iteration 1 completed. Fetched 100 articles.
INFO:root:Iteration 2 completed. Fetched 200 articles.
INFO:root:Iteration 3 completed. Fetched 300 articles.
INFO:root:Iteration 4 completed. Fetched 400 articles.
INFO:root:Iteration 5 completed. Fetched 500 articles.
INFO:root:Iteration 6 completed. Fetched 600 articles.
INFO:root:Iteration 7 completed. Fetched 700 articles.
INFO:root:Iteration 8 completed. Fetched 800 articles.
INFO:root:Iteration 9 completed. Fetched 900 articles.
INFO:root:Iteration 10 completed. Fetched 1000 articles.
INFO:root:Iteration 11 completed. Fetched 1100 articles.
INFO:root:Iteration 12 completed. Fetched 1200 articles.
INFO:root:Iteration 13 completed. Fetched 1300 articles.
INFO:root:Iteration 14 completed. Fetched 1400 articles.
INFO:root:Iteration 15 completed. Fetched 1500 articles.
INFO:root:Iteration 16 completed. Fetched 1600 articles.
INFO:root:Iteration 17 completed. Fetched 1700 articles.
INFO:root:Iteration 18 completed. Fetched 1800 ar

# Explore df

In [48]:
print(Aylien_business_articles.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype                  
---  ------        --------------  -----                  
 0   id            10000 non-null  int64                  
 1   title         10000 non-null  object                 
 2   published_at  10000 non-null  datetime64[ns, tzutc()]
 3   source        10000 non-null  object                 
 4   body          10000 non-null  object                 
dtypes: datetime64[ns, tzutc()](1), int64(1), object(3)
memory usage: 390.8+ KB
None


In [49]:
print(len(Aylien_business_articles))

10000


In [50]:
Aylien_business_articles = Aylien_business_articles.drop_duplicates()

In [51]:
print(len(Aylien_business_articles))

10000


In [55]:
# Pretty print the first 3 article descriptions
for index, row in Aylien_business_articles[100:105].iterrows():
    print(f"Article {index+1} body:")
    print(row['body'])
    print("----------------------------------------")
    print()

Article 101 body:
On this week's episode, we discuss the dangers of blind trust in enhancement technologies, and the risks that can come with using them.
We also take some time to answer questions submitted by our community, including topics like whether product management is cannibalizing UXR, whether it's worth taking a job on a dysfunctional project, and how pro bono work can impact job searches. Tune in for an insightful discussion!
  ​
  #BlindTrust #EnhancementTechnologies #ProductManagement #UXResearch #DysfunctionalProjects #ContractorWork #ProBono #JobSearching #CommunityQuestions
  Recorded live on June 1st, 2023, hosted by Nick Roome with Barry Kirby and others.
  Check out the latest from our sister podcast - 1202 The Human Factors Podcast -on Human Factors Integration - An interview with Trevor Dobbins:
  https://www.1202podcast.com/trevor-dobbins
  News:
  Blind trust in enhancement technologies encourages risk-taking even if the tech is a sham
  https://www.aalto.fi/en/n

# Export

In [61]:
Aylien_business_articles.to_pickle('datasets/Aylien/Aylien_business_articles_10k.pickle')
Aylien_business_articles.to_csv("datasets/Aylien/Aylien_business_articles_10k.csv", sep='|', index=False)

## Explore re-import to test stability

In [57]:
df_test = pd.read_pickle('datasets/Aylien_business_articles.pickle')
print(len(df_test))
df_test.info() #should match above. errors if not.

10000
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype                  
---  ------        --------------  -----                  
 0   id            10000 non-null  int64                  
 1   title         10000 non-null  object                 
 2   published_at  10000 non-null  datetime64[ns, tzutc()]
 3   source        10000 non-null  object                 
 4   body          10000 non-null  object                 
dtypes: datetime64[ns, tzutc()](1), int64(1), object(3)
memory usage: 468.8+ KB


In [58]:
df_test.head()

Unnamed: 0,id,title,published_at,source,body
0,5709294677,: Nike stock jumps to lead the Dow's premarket...,2023-06-02 12:52:39+00:00,Finanzen.ch,Nike 96.41 CHF -1.99% Charts News Analysen...
1,5709292390,Renault Customers Planning Criminal Complaint ...,2023-06-02 12:52:02+00:00,MarketScreener,Financials EUR USD GBP Sales 2023 50 468 M 54 ...
2,5709276868,Mersana Therapeutics Inc. (MRSN) last month vo...,2023-06-02 12:46:55+00:00,Newsdaemon.com,Search Search Search Search Sana Meer Ju...
3,5709269010,Mike Novogratz Says Crypto is Lackadaisical Ri...,2023-06-02 12:46:28+00:00,Crypto Potato,"Michael Novogratz, the founder and CEO of Gala..."
4,5709271283,Another Soft Market Is Inevitable,2023-06-02 12:46:23+00:00,Carrier Management,New You can now listen to Carrier Management a...


In [59]:
# Pretty print the first 3 article descriptions
for i in range(10):
    print(f"Article {i+1} body:")
    print(df_test.iloc[i]['body'])
    print()

Article 1 body:
Nike  96.41 CHF -1.99%  Charts  News  Analysen  Kaufen  Verkaufen Shares of Nike Inc.
NKE jumped 3.0% to pace all of the Dow Jones Industrial Average's DJIA premarket gainers, putting them on track to snap a four-day losing streak. The athletic apparel giant's stock had actually dropped in nine of the past 10 sessions, to close Thursday at the lowest price since Dec. 20, 2022. It has lost 13.1% during the 10-day stretch, while the Dow has slipped 1.4% over the same time. During that losing stretch, Nike received a rare “sell” recommendation from Williams Trading's Sam Poser, who said the company's footwear has run “stale.” The stock's bounce Friday comes a day after Macy's Inc. M said it will bring Nike products back to certain stores and e-commerce operations in the fall, with plans to scale to additional stores in 2024.Market Pulse Stories are Rapid-fire, short news bursts on stocks and markets as they move. Visit MarketWatch.com for more information on this news.
  W