In [44]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
from pandas.errors import EmptyDataError

In [45]:
# Define the directory containing the .txt files
data_directory = os.path.expanduser('~/Small-Cap-Scout/raw_data/Data/Stocks')

# Define the date range
start_date = '2010-01-01'
end_date = datetime.today()

# List to hold all dataframes
all_data = []

# Use glob to find all .txt files in the specified directory
file_pattern = f"{data_directory}/*.txt"
txt_files = glob.glob(file_pattern)

# Debugging: Print the list of files found
print(f"Files found: {txt_files}")

Files found: ['/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/iba.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/wpz.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/opnt.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/indf.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/flic.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/nbhc.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/ncs.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/grvy.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/lgcyo.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/eqco.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/gpk.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/wti.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/dyn_a.us.txt', '/Users/eoingaynard/Small-Cap-Scout/raw_data/Data/Stocks/cstr.us.t

In [46]:
# Loop through each .txt file found
for file_path in txt_files:
    try:
        # Get the company ticker from the filename
        filename = os.path.basename(file_path)  # Get the filename from the path
        company_ticker = filename.split('.')[0]  # Extract the ticker from the filename

        # Load the data from the .txt file
        df = pd.read_csv(file_path, parse_dates=['Date'])

        # Drop all columns except 'Date', 'Volume', and 'Close'
        columns_to_keep = ['Date', 'Volume', 'Close']
        df = df[[col for col in columns_to_keep if col in df.columns]]

        # Check if required columns are present
        if 'Date' not in df.columns or 'Close' not in df.columns:
            print(f"File {filename} is missing required columns. Skipping.")
            continue

        # Debugging: Check if DataFrame is loaded properly
        print(f"Processing {company_ticker}, initial data size: {df.shape}")

        # Filter the data by date
        df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

        # Debugging: Check if the data filtering is correct
        print(f"Filtered data size for {company_ticker}: {df.shape}")

        # Proceed only if the DataFrame is not empty
        if not df.empty:
            # Calculate the monthly average prices
            df['Month'] = df['Date'].dt.to_period('M')
            monthly_avg = df.groupby('Month')['Close'].mean().reset_index()
            monthly_avg.rename(columns={'Close': 'Monthly_Avg_Close'}, inplace=True)

            # Calculate the monthly total volume
            monthly_volume_total = df.groupby('Month')['Volume'].sum().reset_index()
            monthly_volume_total.rename(columns={'Volume': 'Monthly_Volume_Total'}, inplace=True)

            # Calculate the monthly average volume
            monthly_volume_avg = df.groupby('Month')['Volume'].mean().reset_index()
            monthly_volume_avg.rename(columns={'Volume': 'Monthly_Volume_Avg'}, inplace=True)

            # Calculate the monthly volatility
            df['Log_Return'] = df['Close'].pct_change().apply(lambda x: np.log(1+x))
            monthly_volatility = df.groupby('Month')['Log_Return'].std().reset_index()

            # Dynamically calculate the number of months for volatility scaling
            num_months = len(monthly_volatility)
            if num_months > 0:
                monthly_volatility['Monthly_Volatility'] = monthly_volatility['Log_Return'] * np.sqrt(num_months)
            else:
                monthly_volatility['Monthly_Volatility'] = np.nan  # In case of no data

            monthly_volatility.drop(columns=['Log_Return'], inplace=True)

            # Merge the monthly average, total volume, average volume, and volatility data
            result = pd.merge(monthly_avg, monthly_volume_total, on='Month', how='inner')
            result = pd.merge(result, monthly_volume_avg, on='Month', how='inner')
            result = pd.merge(result, monthly_volatility, on='Month', how='inner')
            result['Ticker'] = company_ticker

            # Rearrange columns to put 'Ticker' first
            column_order = ['Ticker'] + [col for col in result.columns if col != 'Ticker']
            result = result[column_order]

            # Append to the list
            all_data.append(result)
        else:
            print(f"No data available for {company_ticker} in the specified date range.")

    except EmptyDataError:
        print(f"File {file_path} is empty. Skipping.")
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

# Check if there's any data to concatenate
if all_data:
    # Concatenate all data into a single dataframe
    final_df = pd.concat(all_data, ignore_index=True)

    # Save the final dataframe to a CSV file
    output_file = os.path.expanduser('~/Small-Cap-Scout/raw_data/processed_data.csv')
    final_df.to_csv(output_file, index=False)
    print(f"Data processing complete. Results saved to {output_file}.")

    # Optionally, display the first few rows of the final DataFrame
    print("First 5 rows of the processed data:")
    print(final_df.head())
else:
    print("No data to concatenate. Please check your files and filters.")

Processing iba, initial data size: (3199, 3)
Filtered data size for iba: (1979, 3)
Processing wpz, initial data size: (3081, 3)
Filtered data size for wpz: (1979, 3)
Processing opnt, initial data size: (53, 3)
Filtered data size for opnt: (53, 3)
Processing indf, initial data size: (356, 3)
Filtered data size for indf: (356, 3)
Processing flic, initial data size: (3006, 3)
Filtered data size for flic: (1979, 3)
Processing nbhc, initial data size: (1295, 3)
Filtered data size for nbhc: (1295, 3)
Processing ncs, initial data size: (3201, 3)
Filtered data size for ncs: (1979, 3)
Processing grvy, initial data size: (3076, 3)
Filtered data size for grvy: (1896, 3)
Processing lgcyo, initial data size: (853, 3)
Filtered data size for lgcyo: (853, 3)
Processing eqco, initial data size: (1331, 3)
Filtered data size for eqco: (1331, 3)
Processing gpk, initial data size: (3201, 3)
Filtered data size for gpk: (1979, 3)
Processing wti, initial data size: (3201, 3)
Filtered data size for wti: (1979,

In [48]:
# Let's load the CSV file into a DataFrame
processed_df = pd.read_csv(output_file)

# Display the first 5 rows of the DataFrame
print(processed_df.tail())

       Ticker    Month  Monthly_Avg_Close  Monthly_Volume_Total  \
467050    apf  2017-07          16.892010                220196   
467051    apf  2017-08          17.092074                559754   
467052    apf  2017-09          17.304750                431482   
467053    apf  2017-10          17.839318                571405   
467054    apf  2017-11          18.087013                 83924   

        Monthly_Volume_Avg  Monthly_Volatility  
467050        11009.800000            0.062602  
467051        24337.130435            0.067810  
467052        21574.100000            0.049952  
467053        25972.954545            0.072105  
467054        10490.500000            0.066259  


In [50]:
# Define the path to the processed data CSV file
processed_data_file = os.path.expanduser('~/Small-Cap-Scout/raw_data/processed_data.csv')

# Define the path to the new CSV file with tickers
tickers_file = os.path.expanduser('~/Small-Cap-Scout/raw_data/tickers.csv')

# Read the processed data CSV file
try:
    df = pd.read_csv(processed_data_file)

    # Ensure the 'Ticker' column exists
    if 'Ticker' in df.columns:
        # Get unique tickers
        unique_tickers = df['Ticker'].unique()

        # Create a DataFrame for tickers
        tickers_df = pd.DataFrame(unique_tickers, columns=['Ticker'])

        # Save the tickers to a new CSV file
        tickers_df.to_csv(tickers_file, index=False)
        print(f"Tickers have been successfully saved to {tickers_file}.")
    else:
        print("The 'Ticker' column is missing in the processed data file.")
except FileNotFoundError:
    print(f"The file {processed_data_file} does not exist. Please check the file path.")
except Exception as e:
    print(f"An error occurred: {e}")

Tickers have been successfully saved to /Users/eoingaynard/Small-Cap-Scout/raw_data/tickers.csv.


In [54]:
 # Read the newly created tickers CSV file
tickers_df_loaded = pd.read_csv(tickers_file)

# Show the first lines of the tickers DataFrame
print("First 5 lines of the tickers DataFrame:")
print(tickers_df_loaded.head())

First 5 lines of the tickers DataFrame:
  Ticker
0    iba
1    wpz
2   opnt
3   indf
4   flic
