In [1]:
import pandas as pd
from datetime import datetime

def check_skipped_rows(filepath, output_path="skipped_rows.xlsx"):
    """Simulate loading an Excel price sheet and report which rows would be skipped.

    The workbook is read with ``pd.read_excel`` and walked row by row. A row
    counts as inserted unless one of these applies, in which case it is
    recorded together with a reason:

    * ``Stock_Code_Yahoo`` is empty            -> 'Missing stock code'
    * ``Date`` is empty                        -> 'Missing date'
    * Open/High/Low/Close/Volume are all zero  -> 'Market Closed (all zero values)'
    * the (stock code, date) pair was seen     -> 'Duplicate entry'
    * processing the row raised                -> 'Error: <message>'

    Args:
        filepath: Workbook location, passed straight to ``pd.read_excel``.
        output_path: Where the skipped-row report is written. Defaults to the
            file name this function has always used.

    Returns:
        pandas.DataFrame with columns ``Index``, ``Stock_Code``, ``Date`` and
        ``Reason``, one row per skipped input row (also written to
        ``output_path``).

    Side effects:
        Prints the column list and the inserted/skipped counts, and writes
        ``output_path`` even when nothing was skipped.
    """
    # Numeric date cells are day counts. Values this large are read as Julian
    # Day Numbers (the historical behaviour of this function); anything smaller
    # is an Excel serial date. For dates that pandas' default timestamps can
    # hold, the two ranges do not overlap, so the split is unambiguous.
    julian_floor = 1_000_000
    excel_epoch = pd.Timestamp("1899-12-30")

    def parse_date(value):
        """Return a ``datetime.date`` for one Date cell, or None if the cell is empty."""
        if pd.isna(value):
            return None
        if pd.api.types.is_number(value):
            origin = "julian" if value >= julian_floor else excel_epoch
            return pd.to_datetime(value, unit="D", origin=origin).date()
        return pd.to_datetime(value).date()

    df = pd.read_excel(filepath)
    df.columns = df.columns.str.strip()  # headers sometimes carry stray spaces

    print("Columns in Excel file:", df.columns.tolist())

    # Stand-in for the database's unique (stock code, date) constraint.
    existing_entries = set()
    inserted_count = 0
    skipped_rows = []

    def record_skip(idx, stock_code, raw_date, reason):
        skipped_rows.append({
            'Index': idx,
            'Stock_Code': stock_code,
            'Date': raw_date,
            'Reason': reason,
        })

    for idx, row in df.iterrows():
        try:
            raw_code = row['Stock_Code_Yahoo']
            # Blank cells arrive as NaN/None; they must not become a key.
            if pd.isna(raw_code) or not str(raw_code).strip():
                record_skip(idx, raw_code, row.get('Date', ''), 'Missing stock code')
                continue
            # str() first: purely numeric codes are read from Excel as numbers.
            stock_code = str(raw_code).strip()

            date = parse_date(row['Date'])
            if date is None:
                record_skip(idx, stock_code, row['Date'], 'Missing date')
                continue

            key = (stock_code, date)

            # Market closed: every price and the volume are zero.
            if (
                float(row['Open']) == 0 and
                float(row['High']) == 0 and
                float(row['Low']) == 0 and
                float(row['Close']) == 0 and
                int(row['Volume']) == 0
            ):
                record_skip(idx, stock_code, row['Date'], 'Market Closed (all zero values)')
                continue

            if key in existing_entries:
                record_skip(idx, stock_code, row['Date'], 'Duplicate entry')
                continue

            # Simulated insertion: remember the key, count the row.
            existing_entries.add(key)
            inserted_count += 1

        except Exception as e:
            # Best effort by design: one bad row must not abort the whole scan.
            record_skip(
                idx,
                row.get('Stock_Code_Yahoo', ''),
                row.get('Date', ''),
                f"Error: {e}",
            )

    # Summary
    print(f"Inserted: {inserted_count} rows")
    print(f"Skipped: {len(skipped_rows)} rows")

    # Fixed column order so an empty report still has headers.
    skipped_df = pd.DataFrame(skipped_rows, columns=['Index', 'Stock_Code', 'Date', 'Reason'])
    skipped_df.to_excel(output_path, index=False)
    print(f"Skipped rows saved to '{output_path}'.")
    return skipped_df

# Entry point: scan the workbook expected in the current working directory.
if __name__ == "__main__":
    check_skipped_rows(filepath="Historical_Data_v2.xlsx")


Columns in Excel file: ['Stock_Code_Yahoo', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Expiry Date']
Inserted: 18946 rows
Skipped: 21184 rows
Skipped rows saved to 'skipped_rows.xlsx'.


In [5]:
import pandas as pd
from datetime import datetime

def check_skipped_rows(filepath, output_path="skipped_rows.xlsx"):
    """Simulate loading an Excel price sheet, skipping zero-volume and duplicate rows.

    The workbook is read with ``pd.read_excel`` and walked row by row. A row
    counts as inserted unless one of these applies, in which case it is
    recorded together with a reason:

    * ``Stock_Code_Yahoo`` is empty             -> 'Missing stock code'
    * ``Date`` is empty                         -> 'Missing date'
    * ``Volume`` is blank or not a number       -> 'Invalid Volume: <value>'
    * ``Volume`` equals zero                    -> 'Volume is 0'
    * the (stock code, date) pair was seen      -> 'Duplicate entry'
    * processing the row raised                 -> 'Error: <message>'

    Args:
        filepath: Workbook location, passed straight to ``pd.read_excel``.
        output_path: Where the skipped-row report is written. Defaults to the
            file name this function has always used.

    Returns:
        pandas.DataFrame with columns ``Index``, ``Stock_Code``, ``Date`` and
        ``Reason``, one row per skipped input row.

    Side effects:
        Prints the column list and the inserted/skipped counts. The report
        file is written only when at least one row was skipped, so a file left
        over from an earlier run is not touched otherwise.
    """
    # Numeric date cells are day counts. Values this large are read as Julian
    # Day Numbers (the historical behaviour of this function); anything smaller
    # is an Excel serial date. For dates that pandas' default timestamps can
    # hold, the two ranges do not overlap, so the split is unambiguous.
    julian_floor = 1_000_000
    excel_epoch = pd.Timestamp("1899-12-30")

    def parse_date(value):
        """Return a ``datetime.date`` for one Date cell, or None if the cell is empty."""
        if pd.isna(value):
            return None
        if pd.api.types.is_number(value):
            origin = "julian" if value >= julian_floor else excel_epoch
            return pd.to_datetime(value, unit="D", origin=origin).date()
        return pd.to_datetime(value).date()

    df = pd.read_excel(filepath)
    df.columns = df.columns.str.strip()  # headers sometimes carry stray spaces

    print("Columns in Excel file:", df.columns.tolist())

    # Stand-in for the database's unique (stock code, date) constraint.
    existing_entries = set()
    inserted_count = 0
    skipped_rows = []

    def record_skip(idx, stock_code, raw_date, reason):
        skipped_rows.append({
            'Index': idx,
            'Stock_Code': stock_code,
            'Date': raw_date,
            'Reason': reason,
        })

    for idx, row in df.iterrows():
        try:
            raw_code = row['Stock_Code_Yahoo']
            # str(NaN) is the literal 'nan', which would otherwise pass as a code.
            if pd.isna(raw_code) or not str(raw_code).strip():
                record_skip(idx, raw_code, row.get('Date', ''), 'Missing stock code')
                continue
            stock_code = str(raw_code).strip()

            date = parse_date(row['Date'])
            if date is None:
                record_skip(idx, stock_code, row['Date'], 'Missing date')
                continue

            key = (stock_code, date)

            try:
                volume_val = float(row['Volume'])
            except (ValueError, TypeError):
                volume_val = float('nan')
            # float() accepts NaN, so blank cells need their own check.
            if pd.isna(volume_val):
                record_skip(idx, stock_code, row['Date'], f'Invalid Volume: {row["Volume"]}')
                continue
            if volume_val == 0:
                record_skip(idx, stock_code, row['Date'], 'Volume is 0')
                continue

            if key in existing_entries:
                record_skip(idx, stock_code, row['Date'], 'Duplicate entry')
                continue

            # Simulated insertion: remember the key, count the row.
            existing_entries.add(key)
            inserted_count += 1

        except Exception as e:
            # Best effort by design: one bad row must not abort the whole scan.
            record_skip(
                idx,
                row.get('Stock_Code_Yahoo', ''),
                row.get('Date', ''),
                f"Error: {e}",
            )

    # Summary
    print(f"Inserted: {inserted_count} rows")
    print(f"Skipped: {len(skipped_rows)} rows")

    skipped_df = pd.DataFrame(skipped_rows, columns=['Index', 'Stock_Code', 'Date', 'Reason'])
    if skipped_rows:
        skipped_df.to_excel(output_path, index=False)
        print(f"Skipped rows saved to '{output_path}'.")
    return skipped_df

# Entry point: scan the workbook expected in the current working directory.
if __name__ == "__main__":
    check_skipped_rows(filepath="Historical_Data_v2.xlsx")


Columns in Excel file: ['Stock_Code_Yahoo', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Expiry Date']
Inserted: 18925 rows
Skipped: 21205 rows
Skipped rows saved to 'skipped_rows.xlsx'.


In [6]:
import pandas as pd
from datetime import datetime

def get_skipped_rows(filepath, sheet_name='Historical_Data_Values',
                     output_path="skipped_rows_only.xlsx"):
    """Collect the rows of a price sheet that cannot be loaded and save them.

    A row is collected when any of the following holds:

    * ``Stock_Code_Yahoo`` or ``Date`` is empty,
    * ``Date`` cannot be parsed,
    * Open/High/Low/Close/Volume contains a blank or non-numeric value,
    * Open/High/Low/Close/Volume are all zero (market closed),
    * reading the row raises for any other reason (e.g. a missing column).

    Collected rows are written unchanged (original columns, original index
    labels) to ``output_path``.

    Args:
        filepath: Workbook location, passed to ``pd.read_excel``.
        sheet_name: Sheet to read; defaults to the sheet this function has
            always used.
        output_path: Destination of the report; defaults to the file name this
            function has always used.

    Returns:
        pandas.DataFrame of the collected rows, or None when the sheet could
        not be read (the error is printed instead of raised).
    """
    try:
        df = pd.read_excel(filepath, sheet_name=sheet_name)
    except Exception as e:
        print(f"Error reading sheet '{sheet_name}': {e}")
        return None

    # Drop a leading unnamed or all-empty column. str() because a header cell
    # holding a number or date yields a non-string label; the shape guard
    # covers a sheet with no columns at all.
    if df.shape[1] > 0:
        first_label = str(df.columns[0])
        if first_label.startswith('Unnamed') or df.iloc[:, 0].isnull().all():
            df = df.iloc[:, 1:]

    df.columns = df.columns.astype(str).str.strip()
    print("Columns:", df.columns.tolist())

    # Numeric date cells are day counts. Values this large are read as Julian
    # Day Numbers (the historical behaviour of this function); anything smaller
    # is an Excel serial date. For dates that pandas' default timestamps can
    # hold, the two ranges do not overlap, so the split is unambiguous.
    julian_floor = 1_000_000
    excel_epoch = pd.Timestamp("1899-12-30")
    numeric_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    def parse_date(value):
        """Return a ``datetime.date`` for one Date cell, or None if the cell is empty."""
        if pd.isna(value):
            return None
        if pd.api.types.is_number(value):
            origin = "julian" if value >= julian_floor else excel_epoch
            return pd.to_datetime(value, unit="D", origin=origin).date()
        return pd.to_datetime(value).date()

    def should_skip(row):
        """Return True when the row is unusable or represents a closed market."""
        try:
            code = row['Stock_Code_Yahoo']
            if pd.isna(code) or not str(code).strip():
                return True
            if parse_date(row['Date']) is None:
                return True
            values = [float(row[name]) for name in numeric_columns]
        except Exception:
            # Deliberately broad: any failure to read the row means "skip it".
            return True
        # float() accepts NaN, so blank cells need their own check.
        if any(pd.isna(value) for value in values):
            return True
        # Market closed: every price and the volume are zero.
        return all(value == 0 for value in values)

    skipped_rows = [row for _, row in df.iterrows() if should_skip(row)]

    # Output skipped rows
    skipped_df = pd.DataFrame(skipped_rows)
    skipped_df.to_excel(output_path, index=False)
    print(f"{len(skipped_df)} rows skipped and saved to '{output_path}'")
    return skipped_df

# Entry point: scan the workbook expected in the current working directory.
if __name__ == "__main__":
    get_skipped_rows(filepath="Historical_Data_v2.xlsx")


Columns: ['Stock_Code_Yahoo', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Expiry Date']
6505 rows skipped and saved to 'skipped_rows_only.xlsx'
