In [5]:
import os
import FinanceDataReader as fdr
import pandas as pd
import yfinance as yf

# Load the NASDAQ stock listing
nasdaq = fdr.StockListing('NASDAQ')

# Keep a single row per ticker (duplicates are dropped on Symbol)
df = nasdaq.drop_duplicates('Symbol')

# One record per ticker with the listing columns carried into the result
symbols = df[['Symbol', 'Name', 'IndustryCode', 'Industry']].to_dict('records')

# Look up the market capitalisation of every ticker and keep the positive ones
data = []
for stock in symbols:
    try:
        ticker = yf.Ticker(stock['Symbol'])
        info = ticker.info
        # The key can be absent or present with a None value; treat both as 0
        # so that `None > 0` does not surface as a misleading fetch error.
        market_cap = info.get('marketCap') or 0
        if market_cap > 0:  # only keep tickers that report a market cap
            data.append({**stock, 'Market Cap': market_cap})
    except Exception as e:
        # Best effort: report the failing ticker and move on to the next one
        print(f"Error fetching data for {stock['Symbol']}: {e}")

# Convert the collected records into a DataFrame
df_market_cap = pd.DataFrame(data)

100%|███████████████████████████████████████████| 3685/3685 [00:02<00:00, 1550.50it/s]
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/FUSN?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=FUSN&crumb=ZG1B6sr.%2Ffn
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/VTRU?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=VTRU&crumb=ZG1B6sr.%2Ffn
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/QDROU?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=QDROU&crumb=ZG1B6sr.%2Ffn
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/SCTL?modules=financia

In [8]:
# Build the DataFrame from the collected records
df_market_cap = pd.DataFrame(data)

# Output location: the CSV lives inside the ./Data folder
folder_path = './Data'
csv_filename = os.path.join(folder_path, 'df_market_cap.csv')

# Inspect the data
print(df_market_cap)

# Create the folder when it is missing, then write the CSV
os.makedirs(folder_path, exist_ok=True)
df_market_cap.to_csv(csv_filename, index=False, encoding="utf-8")
print(f"Data saved to {csv_filename}")

     Symbol                                               Name IndustryCode  \
0      NVDA                                        NVIDIA Corp     57101010   
1      AAPL                                          Apple Inc     57106020   
2      MSFT                                     Microsoft Corp     57201020   
3      AMZN                                     Amazon.com Inc     53402010   
4      META                                 Meta Platforms Inc     57201030   
...     ...                                                ...          ...   
3370   TCBP                     TC Biopharm (Holdings) PLC ADR     56202010   
3371  BHFAL  Brighthouse Financial Junior Subordinated Debe...     55301010   
3372    JSM       Navient 6 Prcnt Senior Notes Exp 15 Dec 2043     55101030   
3373   PMNT                                 Perfect Moment Ltd     53205020   
3374  BKHAR                      Black Hawk Acquisition Rights     55601010   

           Industry     Market Cap  
0             

In [9]:
# Pick the 30 tickers with the largest market capitalisation
df_top = df_market_cap.sort_values(by='Market Cap', ascending=False).head(30)

# Create the 'Data' folder when it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs('Data', exist_ok=True)

# Save the descriptive columns of the top 30 tickers to the 'Data' folder
df_top[['Symbol', 'Name', 'IndustryCode', 'Industry']].to_csv('Data/top_symbols.csv', index=False)

print(df_top)

   Symbol                         Name IndustryCode       Industry  \
0    NVDA                  NVIDIA Corp     57101010            반도체   
1    AAPL                    Apple Inc     57106020     전화 및 소형 장치   
2    MSFT               Microsoft Corp     57201020          소프트웨어   
3    AMZN               Amazon.com Inc     53402010            백화점   
7    GOOG         Alphabet Inc Class C     57201030        온라인 서비스   
5   GOOGL         Alphabet Inc Class A     57201030        온라인 서비스   
4    META           Meta Platforms Inc     57201030        온라인 서비스   
6    TSLA                    Tesla Inc     53101010    자동차 및 트럭 제조   
8    AVGO                 Broadcom Inc     57101010            반도체   
9    COST        Costco Wholesale Corp     53402020            할인점   
10   NFLX                  Netflix Inc     57201030        온라인 서비스   
12   ASML          ASML Holding NV ADR     57101020   반도체 장비 및 테스트   
11   TMUS              T-Mobile US Inc     57401020      무선 통신 서비스   
13   CSCO           

In [10]:
# Distinct industry labels present among the selected tickers
df_top.loc[:, 'Industry'].unique()

array(['반도체', '전화 및 소형 장치', '소프트웨어', '백화점', '온라인 서비스', '자동차 및 트럭 제조',
       '할인점', '반도체 장비 및 테스트', '무선 통신 서비스', '통신 및 네트워킹', '무알콜 음료', '상품 화학',
       '제약', '핀테크', '첨단 의료 장비 및 기술', '여가 및 오락시설', '소비재 대기업'], dtype=object)

In [11]:
# Lookup table from the Korean industry labels in the listing to English labels
industry_translation = {
    '전화 및 소형 장치': 'Telecommunication and Small Devices',
    '반도체': 'Semiconductors',
    '소프트웨어': 'Software',
    '온라인 서비스': 'Online Services',
    '백화점': 'Retail',
    '자동차 및 트럭 제조': 'Automobile and Truck Manufacturing',
    '제약': 'Pharmaceuticals',
    '식품 소매 및 유통': 'Food Retail and Distribution',
    '은행': 'Banking',
    '통합 오일 및 가스': 'Integrated Oil and Gas',
    '의료 관리': 'Healthcare Management',
    '개인 생활 필수 용품': 'Personal Essentials',
    '할인점': 'Discount Stores',
    '주택 개조 제품 및 서비스 소매': 'Home Improvement Retail',
    'IT 서비스 및 컨설팅': 'IT Services and Consulting',
    '무알콜 음료': 'Non-Alcoholic Beverages',
    '반도체 장비 및 테스트': 'Semiconductor Equipment and Testing',
    '무선 통신 서비스': 'Wireless Communication Services',
    '상품 화학': 'Commodity Chemicals',
    '통신 및 네트워킹': 'Telecommunication and Networking',
    '레스토랑 및 바': 'Restaurants and Bars',
    '첨단 의료 장비 및 기술': 'Advanced Medical Equipment and Technology',
    '투자 관리 및 펀드 운영': 'Investment Management and Fund Operation',
    '담배': 'Tobacco',
    '의료 장비, 물품 및 유통': 'Medical Equipment, Supplies, and Distribution',
    '투자 은행 및 중개 서비스': 'Investment Banking and Brokerage',
    '소비자 대출': 'Consumer Lending',
    '항공우주 및 방위': 'Aerospace and Defense',
    '중장비 및 차량': 'Heavy Machinery and Vehicles',
    '핀테크': 'Fintech',
    '방송': 'Broadcasting',
    '통합 통신 서비스': 'Integrated Communication Services',
    '전력 유틸리티': 'Electric Utilities',
    '전문 정보 서비스': 'Professional Information Services',
    '여가 및 오락시설': 'Leisure and Entertainment Facilities',
    '다각적 채굴': 'Diversified Mining',
    '손해보험': 'Property and Casualty Insurance',
    '지상 화물 및 물류': 'Ground Freight and Logistics',
    '전기 부품 및 장비': 'Electrical Components and Equipment',
    '소비재 대기업': 'Consumer Goods Conglomerates',
    '양조업': 'Brewing',
    '생명 공학 및 의학 연구': 'Biotechnology and Medical Research'
}

# Translate the Industry column. Series.map yields NaN for any label that is
# not a key of the table, so fall back to the existing label in that case.
# This keeps unknown industries intact and makes the cell safe to re-run
# (already-translated English labels are left as they are).
translated_industry = df_top['Industry'].map(industry_translation)
df_top = df_top.assign(Industry=translated_industry.fillna(df_top['Industry']))

# Display the translated DataFrame
print(df_top)

# Overwrites Data/top_symbols.csv, this time with every column of df_top
df_top.to_csv('Data/top_symbols.csv', index=False)

   Symbol                         Name IndustryCode  \
0    NVDA                  NVIDIA Corp     57101010   
1    AAPL                    Apple Inc     57106020   
2    MSFT               Microsoft Corp     57201020   
3    AMZN               Amazon.com Inc     53402010   
7    GOOG         Alphabet Inc Class C     57201030   
5   GOOGL         Alphabet Inc Class A     57201030   
4    META           Meta Platforms Inc     57201030   
6    TSLA                    Tesla Inc     53101010   
8    AVGO                 Broadcom Inc     57101010   
9    COST        Costco Wholesale Corp     53402020   
10   NFLX                  Netflix Inc     57201030   
12   ASML          ASML Holding NV ADR     57101020   
11   TMUS              T-Mobile US Inc     57401020   
13   CSCO            Cisco Systems Inc     57102010   
14   ADBE                    Adobe Inc     57201020   
15    AMD   Advanced Micro Devices Inc     57101010   
16    PEP                  PepsiCo Inc     54101030   
17    LIN 

In [13]:
import csv
from gnews import GNews
from datetime import datetime, timedelta
import pandas as pd

def fetch_article_for_symbol(symbol, target_date, max_attempts=2):
    """Return the first news article for `symbol` published on `target_date`.

    A GNews client (English, US, at most 10 results) is queried up to
    `max_attempts` times. Attempt `k` searches the window from
    `target_date + k - 1` days to `target_date + k + 1` days, and only an
    article whose parsed publication date equals the target day is accepted.

    Args:
        symbol: Search query passed to `get_news` (the ticker symbol).
        target_date: The day to match; a `datetime` (or a plain `date`).
        max_attempts: Number of search windows to try before giving up.

    Returns:
        A dict with the keys 'Date', 'Symbol', 'Title', 'Published Date',
        'Source' and 'URL'. When nothing matches, 'Title' is
        'No articles found' and the last three fields are empty strings.
    """
    google_news = GNews(language='en', country='US', max_results=10)

    # Loop-invariant values: the calendar day to match and its text form
    wanted_day = target_date.date() if isinstance(target_date, datetime) else target_date
    date_label = target_date.strftime('%Y-%m-%d')

    for attempt in range(max_attempts):
        # Each further attempt shifts the search window one day later
        search_date = target_date + timedelta(days=attempt)
        google_news.start_date = search_date - timedelta(days=1)
        google_news.end_date = search_date + timedelta(days=1)

        # Guard against a None result so a single bad query cannot abort
        # the caller's loop with a TypeError
        articles = google_news.get_news(symbol) or []

        for article in articles:
            try:
                # Parse the published date
                published_date = datetime.strptime(
                    article['published date'], '%a, %d %b %Y %H:%M:%S %Z'
                ).date()

                # Accept only an article published exactly on the target day
                if published_date == wanted_day:
                    return {
                        'Date': date_label,
                        'Symbol': symbol,
                        'Title': article['title'],
                        'Published Date': article['published date'],
                        'Source': article['publisher']['title'],
                        'URL': article['url']
                    }
            except Exception as e:
                # Best effort: a malformed article is reported and skipped
                print(f"Error processing date for {symbol}: {e}")

    # Return default values if no article is found
    return {
        'Date': date_label,
        'Symbol': symbol,
        'Title': 'No articles found',
        'Published Date': '',
        'Source': '',
        'URL': ''
    }

# Read the ticker list from Data/top_symbols.csv
top_symbols_path = 'Data/top_symbols.csv'
top_symbols = pd.read_csv(top_symbols_path)['Symbol'].tolist()

# Inclusive date range to scan, one day at a time
start_date = datetime(2021, 1, 1)
end_date = datetime(2023, 12, 31)

# Destination CSV and its column order
csv_filename = 'Data/news_data.csv'
fieldnames = ['Date', 'Symbol', 'Title', 'Published Date', 'Source', 'URL']

# Start from a fresh file that contains only the header row
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Walk through the range day by day
one_day = timedelta(days=1)
current_date = start_date
while current_date <= end_date:
    # weekday() counts Monday as 0, so 4 and 5 are Friday and Saturday;
    # no symbols are processed on those two days
    symbols_for_day = [] if current_date.weekday() in (4, 5) else top_symbols

    for symbol in symbols_for_day:
        # One record per (symbol, day), real article or placeholder
        article_data = fetch_article_for_symbol(symbol, current_date)

        # Re-open in append mode for every row so each record is written
        # to the file as soon as it is available
        with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(article_data)

    # Advance to the following day
    current_date = current_date + one_day

print("News data has been saved to Data/news_data.csv.")

# Show the first few rows of the written CSV file
news_df = pd.read_csv(csv_filename)
print(news_df.head())


News data has been saved to Data/news_data.csv.
         Date Symbol              Title Published Date Source  URL
0  2021-01-03   NVDA  No articles found            NaN    NaN  NaN
1  2021-01-03   AAPL  No articles found            NaN    NaN  NaN
2  2021-01-03   MSFT  No articles found            NaN    NaN  NaN
3  2021-01-03   AMZN  No articles found            NaN    NaN  NaN
4  2021-01-03   GOOG  No articles found            NaN    NaN  NaN


In [14]:
# Reload the scraped news from disk
news_df = pd.read_csv(csv_filename)

# Keep only the rows whose 'Published Date' is present
has_published_date = news_df['Published Date'].notna()
news_df_cleaned = news_df[has_published_date]

# Write the filtered frame back over the same CSV file
news_df_cleaned.to_csv(csv_filename, index=False)

print("Rows with empty 'Published Date' have been removed and the file has been saved.")

Rows with empty 'Published Date' have been removed and the file has been saved.
