# Stock Data and News Collection System
## Time Series Analysis with Twelve Data and NewsAPI.ai

This notebook collects stock market data from Twelve Data API and related news from NewsAPI.ai, formatting the data for time series analysis.

### 1. Install Required Libraries

In [None]:
!pip install requests pandas numpy matplotlib seaborn plotly -q

### 2. Import Libraries

In [None]:
import json
import os
import time
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Set plotting style: a matplotlib style sheet first, then the seaborn palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
# NOTE(review): kept after plt.style.use on the assumption that applying the
# style sheet would otherwise reset the color cycle — verify before reordering.
sns.set_palette("husl")

# Marker printed once the import/setup cell has run to the end.
print("✓ Libraries imported successfully")

### 3. Configure API Keys

**Get your API keys:**
- Twelve Data: https://twelvedata.com/
- NewsAPI.ai: https://newsapi.ai/

In [None]:
# API credentials are read from environment variables so that no secret is ever
# stored in (or committed with) the notebook. Set TWELVE_DATA_API_KEY and
# NEWSAPI_KEY before starting the kernel.
# SECURITY: keys that were previously pasted into this cell should be treated as
# leaked and revoked in the provider dashboards.
TWELVE_DATA_API_KEY = os.environ.get("TWELVE_DATA_API_KEY", "")
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY", "")

# Warn early instead of failing later with an opaque API error.
_missing_keys = [
    name
    for name, value in (
        ("TWELVE_DATA_API_KEY", TWELVE_DATA_API_KEY),
        ("NEWSAPI_KEY", NEWSAPI_KEY),
    )
    if not value
]
if _missing_keys:
    print(f"⚠ Missing API key(s): {', '.join(_missing_keys)} - set them as environment variables")

# Configuration - Major stocks across different sectors
STOCK_SYMBOLS = [
    "AAPL",   # Apple - Technology
    "MSFT",   # Microsoft - Technology
    "GOOGL",  # Google - Technology
    "AMZN",   # Amazon - E-commerce/Cloud
    "NVDA",   # NVIDIA - Semiconductors
    "META",   # Meta - Social Media
    "TSLA",   # Tesla - Electric Vehicles
    "JPM",    # JPMorgan Chase - Banking
    "V",      # Visa - Financial Services
    "WMT"     # Walmart - Retail
]

# Ticker -> company name used as the news search keyword.
COMPANY_NAMES = {
    "AAPL": "Apple Inc",
    "MSFT": "Microsoft",
    "GOOGL": "Google",
    "AMZN": "Amazon",
    "NVDA": "NVIDIA",
    "META": "Meta",
    "TSLA": "Tesla",
    "JPM": "JPMorgan Chase",
    "V": "Visa",
    "WMT": "Walmart"
}

print(f"✓ Configuration complete - Analyzing {len(STOCK_SYMBOLS)} major stocks")

### 4. Stock Data Collector Class

In [None]:
class StockDataCollector:
    """Collects stock market data from the Twelve Data API.

    Failures (network errors, non-200 responses, undecodable bodies, payloads
    without data) are reported with ``print`` and turned into empty results, so
    a loop over many symbols keeps running when a single request fails.
    """

    # Columns coerced to numeric dtype when present in the payload.
    NUMERIC_COLUMNS = ("open", "high", "low", "close", "volume")

    def __init__(self, api_key, timeout=30):
        """Store the credentials and request settings.

        Args:
            api_key: Twelve Data API key sent with every request.
            timeout: Seconds to wait for a response before giving up; without
                a timeout a stalled connection would block the notebook forever.
        """
        self.api_key = api_key
        self.base_url = "https://api.twelvedata.com"
        self.timeout = timeout

    def _get_json(self, path, params):
        """GET ``base_url/path`` and return the decoded JSON body, or None on any failure."""
        try:
            response = requests.get(
                f"{self.base_url}/{path}", params=params, timeout=self.timeout
            )
        except requests.RequestException as exc:
            print(f"Request failed: {exc}")
            return None

        if response.status_code != 200:
            print(f"HTTP Error: {response.status_code}")
            return None

        try:
            return response.json()
        except ValueError as exc:  # body was not valid JSON
            print(f"Invalid JSON response: {exc}")
            return None

    def get_time_series(self, symbol, interval="1day", outputsize=30, timezone="America/New_York"):
        """Fetch time series stock data.

        Args:
            symbol: Ticker symbol to query.
            interval: Bar interval passed through to the API.
            outputsize: Number of data points requested from the API.
            timezone: Timezone name passed through to the API.

        Returns:
            DataFrame sorted ascending by ``datetime`` with the OHLCV columns
            converted to numbers (unparseable values become NaN), or an empty
            DataFrame when the request fails or carries no values.
        """
        data = self._get_json(
            "time_series",
            {
                "symbol": symbol,
                "interval": interval,
                "outputsize": outputsize,
                "timezone": timezone,
                "apikey": self.api_key,
            },
        )
        if data is None:
            return pd.DataFrame()

        payload = data if isinstance(data, dict) else {}
        values = payload.get("values")
        if not values:
            # Covers both an error payload (no "values" key) and an empty list,
            # which would otherwise fail on the missing "datetime" column.
            print(f"Error: {payload.get('message', 'Unknown error')}")
            return pd.DataFrame()

        df = pd.DataFrame(values)
        df["datetime"] = pd.to_datetime(df["datetime"])
        df = df.sort_values("datetime")

        # Convert numeric columns
        for col in self.NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        return df

    def get_quote(self, symbol):
        """Get real-time quote for a stock.

        Returns:
            The decoded JSON body of the quote endpoint, or ``{}`` when the
            request fails.
        """
        data = self._get_json("quote", {"symbol": symbol, "apikey": self.api_key})
        return data if data is not None else {}

print("✓ StockDataCollector class defined")

### 5. News Collector Class

In [None]:
class NewsCollector:
    """Collects news articles from NewsAPI.ai.

    Failures are reported with ``print`` and turned into an empty DataFrame so
    that a loop over many keywords keeps running.
    """

    def __init__(self, api_key, timeout=30):
        """Store the credentials and request settings.

        Args:
            api_key: NewsAPI.ai key sent in the request body.
            timeout: Seconds to wait for a response before giving up.
        """
        self.api_key = api_key
        self.base_url = "https://newsapi.ai/api/v1"
        self.timeout = timeout

    @staticmethod
    def _article_to_record(article):
        """Flatten one article payload into the row layout of the result frame."""
        # "source" can be present but null, so fall back to an empty mapping.
        source = article.get("source") or {}
        return {
            "date": article.get("date"),
            "datetime": article.get("dateTime"),
            "title": article.get("title"),
            "body": article.get("body"),
            "url": article.get("url"),
            "source": source.get("title") if isinstance(source, dict) else None,
            "sentiment": article.get("sentiment"),
            "relevance": article.get("relevance"),
        }

    def search_articles(self, keyword, max_items=50, lang="eng",
                       date_start=None, date_end=None):
        """Search for news articles related to a keyword.

        Args:
            keyword: Search term placed in the query.
            max_items: Number of articles requested.
            lang: Language code placed in the query.
            date_start: Optional lower date bound, sent as ``dateStart``.
            date_end: Optional upper date bound, sent as ``dateEnd``.

        Returns:
            DataFrame with columns date, datetime, title, body, url, source,
            sentiment and relevance, sorted ascending by ``datetime``; an empty
            DataFrame when the request fails or returns no article list.
        """
        endpoint = f"{self.base_url}/article/getArticles"

        # NOTE(review): payload layout kept exactly as before — confirm the
        # placement of "$query" and the date bounds against the API reference.
        query = {
            "$query": {
                "$and": [
                    {"keyword": keyword, "lang": lang}
                ]
            },
            "resultType": "articles",
            "articlesSortBy": "date",
            "articlesCount": max_items,
            "apiKey": self.api_key
        }

        if date_start:
            query["dateStart"] = date_start
        if date_end:
            query["dateEnd"] = date_end

        try:
            response = requests.post(endpoint, json=query, timeout=self.timeout)
        except requests.RequestException as exc:
            print(f"Request failed: {exc}")
            return pd.DataFrame()

        if response.status_code != 200:
            print(f"HTTP Error: {response.status_code}")
            return pd.DataFrame()

        try:
            data = response.json()
        except ValueError as exc:  # body was not valid JSON
            print(f"Invalid JSON response: {exc}")
            return pd.DataFrame()

        section = data.get("articles") if isinstance(data, dict) else None
        if not isinstance(section, dict) or "results" not in section:
            print(f"No articles found")
            return pd.DataFrame()

        records = [
            self._article_to_record(article)
            for article in (section["results"] or [])
        ]

        df = pd.DataFrame(records)
        if not df.empty and "datetime" in df.columns:
            df["datetime"] = pd.to_datetime(df["datetime"])
            df = df.sort_values("datetime")

        return df

print("✓ NewsCollector class defined")

### 6. Time Series Formatter Class

In [None]:
class TimeSeriesFormatter:
    """Format collected data as sequences for time series analysis"""

    @staticmethod
    def create_sequences(data, sequence_length=10, target_column="close"):
        """Create sliding-window sequences for time series analysis.

        Args:
            data: DataFrame holding the series in ``target_column``.
            sequence_length: Number of consecutive values in each input window.
            target_column: Column to read values from.

        Returns:
            ``(X, y)`` where ``X[i]`` is ``values[i:i + sequence_length]`` and
            ``y[i]`` is the value right after that window. Both are empty
            arrays when the frame is empty, the column is missing, or there are
            not more rows than ``sequence_length``.

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")

        if data.empty or target_column not in data.columns:
            return np.array([]), np.array([])

        values = data[target_column].values
        window_count = len(values) - sequence_length  # <= 0 yields no windows

        X = [values[i:i + sequence_length] for i in range(window_count)]
        y = [values[i + sequence_length] for i in range(window_count)]

        return np.array(X), np.array(y)

    @staticmethod
    def normalize_data(data, columns):
        """Min-max normalize the specified columns to the 0-1 range.

        Columns missing from ``data`` are skipped, and a column whose minimum
        equals its maximum is left unchanged (the scaling would divide by zero).
        The input frame is not modified; a copy is returned.
        """
        normalized_data = data.copy()

        for col in columns:
            if col not in data.columns:
                continue
            min_val = data[col].min()
            span = data[col].max() - min_val
            if span != 0:
                normalized_data[col] = (data[col] - min_val) / span

        return normalized_data

    @staticmethod
    def _as_naive_datetimes(series):
        """Return ``series`` as timezone-naive datetimes (aware values are converted to UTC first)."""
        return pd.to_datetime(series, utc=True).dt.tz_localize(None)

    @staticmethod
    def merge_stock_news(stock_df, news_df, time_window="1D"):
        """Merge stock data with news data based on time proximity.

        For every stock row, counts the news rows whose ``datetime`` lies within
        ``time_window`` before or after the stock timestamp (inclusive) and
        joins up to three of their non-null titles with ``" | "``.

        Timestamps of both frames are compared as timezone-naive values;
        timezone-aware ones are converted to UTC first, so aware news times can
        be matched against naive stock times instead of raising ``TypeError``.
        NOTE(review): naive stock times are taken as-is, so any offset between
        the exchange timezone and UTC is ignored — confirm this is acceptable
        for the chosen window.

        Returns:
            A copy of ``stock_df`` with ``news_count`` and ``news_titles``
            columns added. ``stock_df`` itself is returned unchanged when either
            frame is empty or lacks a ``datetime`` column.
        """
        if stock_df.empty or news_df.empty:
            return stock_df

        if "datetime" not in stock_df.columns or "datetime" not in news_df.columns:
            return stock_df

        window = pd.Timedelta(time_window)  # parsed once instead of per row
        stock_times = TimeSeriesFormatter._as_naive_datetimes(stock_df["datetime"])
        news_times = TimeSeriesFormatter._as_naive_datetimes(news_df["datetime"])
        has_titles = "title" in news_df.columns

        counts, titles = [], []
        for stock_time in stock_times:
            in_window = (news_times >= stock_time - window) & \
                        (news_times <= stock_time + window)
            counts.append(int(in_window.sum()))

            if has_titles:
                # Null titles are dropped: str.join would raise on them.
                nearby = news_df.loc[in_window, "title"].dropna().astype(str).head(3)
                titles.append(" | ".join(nearby.tolist()))
            else:
                titles.append("")

        merged_df = stock_df.copy()
        merged_df["news_count"] = counts
        merged_df["news_titles"] = titles

        return merged_df

print("✓ TimeSeriesFormatter class defined")

### 7. Collect Stock Data

In [None]:
# A single collector instance serves every request in this cell
stock_collector = StockDataCollector(TWELVE_DATA_API_KEY)

# symbol -> price DataFrame; symbols whose fetch came back empty are left out
stock_data_dict = {}

for symbol in STOCK_SYMBOLS:
    print(f"\nFetching stock data for {symbol}...")
    # 90 rows at the daily interval are requested per symbol
    df = stock_collector.get_time_series(symbol=symbol, interval="1day", outputsize=90)

    if df.empty:
        print(f"✗ No data retrieved for {symbol}")
    else:
        stock_data_dict[symbol] = df
        print(f"✓ Retrieved {len(df)} data points for {symbol}")
        print(df.head())

    # Pause between consecutive requests (rate limiting)
    time.sleep(1)

print(f"\n{'='*50}")
print(f"Total stocks collected: {len(stock_data_dict)}")
print(f"{'='*50}")

### 8. Visualize Stock Data

In [None]:
# One figure, one line per collected symbol, drawn through the explicit Axes API
fig, ax = plt.subplots(figsize=(14, 6))

for symbol, df in stock_data_dict.items():
    ax.plot(df["datetime"], df["close"], label=symbol, linewidth=2)

ax.set_title("Stock Closing Prices Over Time", fontsize=16, fontweight='bold')
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Closing Price ($)", fontsize=12)
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)  # slanted date labels
plt.tight_layout()
plt.show()

### 9. Collect News Data

In [None]:
# A single collector instance serves every news request in this cell
news_collector = NewsCollector(NEWSAPI_KEY)

# Search window: the 30 days ending today, as YYYY-MM-DD strings
date_end = datetime.now().strftime("%Y-%m-%d")
date_start = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")

# symbol -> articles DataFrame; symbols without any article are left out
news_data_dict = {}

for symbol in STOCK_SYMBOLS:
    # The company name is the search keyword; fall back to the ticker itself
    company_name = COMPANY_NAMES.get(symbol, symbol)
    print(f"\nFetching news for {company_name} ({symbol})...")

    df = news_collector.search_articles(
        keyword=company_name, max_items=50, date_start=date_start, date_end=date_end
    )

    if df.empty:
        print(f"✗ No news retrieved for {symbol}")
    else:
        news_data_dict[symbol] = df
        print(f"✓ Retrieved {len(df)} articles for {symbol}")
        print(df[["datetime", "title", "source"]].head())

    # Pause between consecutive requests (rate limiting)
    time.sleep(2)

print(f"\n{'='*50}")
print(f"Total stocks with news: {len(news_data_dict)}")
print(f"{'='*50}")

### 10. Merge Stock and News Data

In [None]:
# The formatter is reused by the later sequence and normalization cells
formatter = TimeSeriesFormatter()

# symbol -> stock frame annotated with news columns
merged_data_dict = {}

for symbol in STOCK_SYMBOLS:
    # Only symbols that have both prices and news can be merged
    if symbol not in stock_data_dict or symbol not in news_data_dict:
        continue

    print(f"\nMerging data for {symbol}...")

    merged_df = formatter.merge_stock_news(
        stock_data_dict[symbol], news_data_dict[symbol], time_window="1D"
    )
    merged_data_dict[symbol] = merged_df

    print(f"✓ Merged data has {len(merged_df)} rows")
    print(merged_df[["datetime", "close", "volume", "news_count"]].head())

print(f"\n{'='*50}")
print(f"Total merged datasets: {len(merged_data_dict)}")
print(f"{'='*50}")

### 11. Visualize News Impact

In [None]:
# One figure per merged symbol: price line on the left axis, news bars on the right
for symbol, df in merged_data_dict.items():
    fig, ax1 = plt.subplots(figsize=(14, 6))

    # Left axis: closing price
    ax1.plot(df["datetime"], df["close"], color='blue', linewidth=2, label='Close Price')
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_ylabel('Close Price ($)', color='blue', fontsize=12)
    ax1.tick_params(axis='y', labelcolor='blue')

    # Right axis (shares the x axis): number of nearby news articles
    ax2 = ax1.twinx()
    ax2.bar(df["datetime"], df["news_count"], alpha=0.3, color='red', label='News Count')
    ax2.set_ylabel('News Article Count', color='red', fontsize=12)
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title(f'{symbol} - Stock Price vs News Volume', fontsize=16, fontweight='bold')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

### 12. Create Time Series Sequences

In [None]:
# Window size: 10 consecutive closes are the input, the following close is the target
sequence_length = 10

# symbol -> {"X": input windows, "y": targets}
sequences_dict = {}

for symbol, df in stock_data_dict.items():
    print(f"\nCreating sequences for {symbol}...")
    X, y = formatter.create_sequences(df, sequence_length=sequence_length, target_column="close")

    # Too few rows for a single window: report and move on
    if len(X) == 0:
        print(f"✗ No sequences created for {symbol}")
        continue

    sequences_dict[symbol] = {"X": X, "y": y}
    print(f"✓ Created {len(X)} sequences")
    print(f"  Input shape: {X.shape}")
    print(f"  Target shape: {y.shape}")
    print(f"  Sample sequence: {X[0]}")
    print(f"  Sample target: {y[0]}")

### 13. Normalize Data

In [None]:
# symbol -> copy of the stock frame with the price/volume columns scaled to 0-1
normalized_data_dict = {}

for symbol, df in stock_data_dict.items():
    print(f"\nNormalizing data for {symbol}...")

    normalized_df = formatter.normalize_data(df, columns=["open", "high", "low", "close", "volume"])
    normalized_data_dict[symbol] = normalized_df

    print(f"✓ Normalized {len(normalized_df)} rows")
    print(normalized_df[["datetime", "open", "high", "low", "close"]].head())

### 14. Save Data to CSV

In [None]:
# Write every dataset that exists for a symbol to its own CSV file
print("\nSaving data to CSV files...\n")

for symbol in STOCK_SYMBOLS:
    # Same order as the pipeline: stock, news, merged, normalized
    for kind, dataset in (
        ("stock", stock_data_dict),
        ("news", news_data_dict),
        ("merged", merged_data_dict),
        ("normalized", normalized_data_dict),
    ):
        if symbol not in dataset:
            continue
        filename = f"{symbol}_{kind}_data.csv"
        dataset[symbol].to_csv(filename, index=False)
        print(f"✓ Saved {filename}")

print("\n" + "="*50)
print("Data collection and processing complete!")
print("="*50)

### 15. Summary Statistics

In [None]:
# Per-symbol recap: price statistics, news coverage and sequence counts
for symbol, df in stock_data_dict.items():
    print(f"\n{'='*50}")
    print(f"Summary for {symbol}")
    print(f"{'='*50}")

    print(f"\nStock Data Statistics:")
    print(df[["open", "high", "low", "close", "volume"]].describe())

    # News figures are shown only for symbols that have articles
    news_df = news_data_dict[symbol] if symbol in news_data_dict else None
    if news_df is not None:
        print(f"\nNews Articles: {len(news_df)}")
        print(f"Date Range: {news_df['datetime'].min()} to {news_df['datetime'].max()}")

    # Sequence figures are shown only for symbols that produced windows
    if symbol in sequences_dict:
        print(f"\nTime Series Sequences: {len(sequences_dict[symbol]['X'])}")
        print(f"Sequence Length: {sequence_length} days")