In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
project_root = os.getenv('PROJECT_ROOT')

# Define the path to the raw macro data
macro_raw_path = os.path.join(project_root, 'data', 'raw', 'macro_data.csv')

# Read the CSV file
macro_df = pd.read_csv(macro_raw_path, parse_dates=['date'])

# Set 'date' as the index and ensure it's datetime
macro_df.set_index('date', inplace=True)
macro_df.index = pd.to_datetime(macro_df.index)

# Convert object types to numeric types
macro_df = macro_df.infer_objects()

# Alternatively, explicitly convert columns to numeric
for column in macro_df.columns:
    macro_df[column] = pd.to_numeric(macro_df[column], errors='coerce')

# Interpolate missing values using the time method
macro_df.interpolate(method='time', inplace=True)

# Forward-fill any remaining missing values
macro_df.ffill(inplace=True)

# Now you can proceed with resampling or further processing
# For example, resample to monthly frequency
macro_monthly_df = macro_df.resample('ME').mean()



In [2]:
print(macro_df.dtypes)


GDP         float64
CPIAUCSL    float64
UNRATE      float64
FEDFUNDS    float64
dtype: object


In [3]:
print(macro_df.head())
print(macro_df.info())


                GDP  CPIAUCSL  UNRATE  FEDFUNDS
date                                           
1946-01-01      NaN       NaN     NaN       NaN
1946-04-01      NaN       NaN     NaN       NaN
1946-07-01      NaN       NaN     NaN       NaN
1946-10-01      NaN       NaN     NaN       NaN
1947-01-01  243.164     21.48     NaN       NaN
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 938 entries, 1946-01-01 to 2024-10-01
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   GDP       934 non-null    float64
 1   CPIAUCSL  934 non-null    float64
 2   UNRATE    922 non-null    float64
 3   FEDFUNDS  844 non-null    float64
dtypes: float64(4)
memory usage: 36.6 KB
None


In [4]:
# Define the path to save the processed data
processed_data_dir = os.path.join(project_root, 'data', 'processed')
os.makedirs(processed_data_dir, exist_ok=True)

# Save the processed macro data
macro_processed_path = os.path.join(processed_data_dir, 'macro_data_processed.csv')
macro_monthly_df.to_csv(macro_processed_path)


In [5]:
# Define the path to the processed billing data
billing_processed_path = os.path.join(processed_data_dir, 'global_billings_processed.csv')

# Load the billing data
billing_df = pd.read_csv(billing_processed_path, parse_dates=['Date'], index_col='Date')

# Inspect the DataFrame
print(billing_df.head())
print(billing_df.dtypes)


              Billing        Region
Date                               
1986-01-01   555850.0      Americas
1986-01-01   346467.0        Europe
1986-01-01   638547.0         Japan
1986-01-01   105050.0  Asia Pacific
1986-01-01  1645914.0     Worldwide
Billing    float64
Region      object
dtype: object


In [6]:
# Ensure that macro_monthly_df has 'Date' as index
macro_monthly_df.index.name = 'Date'

# Align indices
billing_df.index = pd.to_datetime(billing_df.index)
macro_monthly_df.index = pd.to_datetime(macro_monthly_df.index)


In [7]:
# Merge DataFrames
combined_df = pd.merge(billing_df, macro_monthly_df, left_index=True, right_index=True, how='inner', validate='one_to_one', on=None)



# Inspect the combined DataFrame
print(combined_df.head())
print(combined_df.dtypes)


MergeError: Merge keys are not unique in left dataset; not a one-to-one merge

In [None]:
# Check for missing values
print(combined_df.isnull().sum())

# Handle missing values as appropriate
combined_df = combined_df.dropna()


In [None]:
import matplotlib.pyplot as plt

# Now 'Worldwide' should be a column
plt.figure(figsize=(14, 7))
plt.plot(combined_df.index, combined_df['Worldwide'], label='Worldwide Billings')
plt.title('Worldwide Semiconductor Billings Over Time')
plt.xlabel('Date')
plt.ylabel('Billings (in millions)')
plt.legend()
plt.show()
