In [5]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
import numpy as np

# Read in CSV tables
processor = pd.read_csv("./cpudb/processor.csv")
specint2k6 = pd.read_csv("./cpudb/spec_int2006.csv")
specint2k0 = pd.read_csv("./cpudb/spec_int2000.csv")
specint95 = pd.read_csv("./cpudb/spec_int1995.csv")
specint92 = pd.read_csv("./cpudb/spec_int1992.csv")


# Rename [processor].[id] to [processor].[processor_id]
processor.rename(columns={'id': 'processor_id'}, inplace=True)


# Merge spec scores
all_data = processor.merge(specint2k6, on="processor_id", suffixes=(".proc", ".spec_int2k6"), how="outer")
all_data = all_data.merge(specint2k0, on="processor_id", suffixes=(".spec_int2k6", ".spec_int2k0"), how="outer")
all_data = all_data.merge(specint95, on="processor_id", suffixes=(".spec_int2k0", ".spec_int95"), how="outer")
all_data = all_data.merge(specint92, on="processor_id", suffixes=(".spec_int95", ".spec_int92"), how="outer")

# Fix missing date entries
all_data['date'] = pd.to_datetime(all_data['date'], errors='coerce')

# Sort by date
all_data.sort_values('date', inplace=True)

# Drop rows with NaT in 'date' column
all_data.dropna(subset=['date'], inplace=True)

# Account for potential turbo-boost clock
all_data.fillna({'max_clock': all_data['clock']}, inplace=True)
#all_data['max_clock'].fillna(all_data['clock'], inplace=True)

# Determine scaling factors for spec92->spec95, spec95->spec2k0, and spec2k0->spec2k6
spec92to95 = all_data['basemean.spec_int95'] / all_data['basemean.spec_int92']
spec95to2k0 = all_data['basemean.spec_int2k0'] / all_data['basemean.spec_int95']
spec2k0to2k6 = all_data['basemean.spec_int2k6'] / all_data['basemean.spec_int2k0']

all_data.fillna({'basemean.spec_int95': spec92to95.mean() * all_data['basemean.spec_int92']}, inplace=True)
#all_data['basemean.spec_int95'].fillna(spec92to95.mean() * all_data['basemean.spec_int92'], inplace=True)

all_data.fillna({'basemean.spec_int2k0': spec95to2k0.mean() * all_data['basemean.spec_int95']}, inplace=True)
#all_data['basemean.spec_int2k0'].fillna(spec95to2k0.mean() * all_data['basemean.spec_int95'], inplace=True)

all_data.fillna({'basemean.spec_int2k6': spec2k0to2k6.mean() * all_data['basemean.spec_int2k0']}, inplace=True)
#all_data['basemean.spec_int2k6'].fillna(spec2k0to2k6.mean() * all_data['basemean.spec_int2k0'], inplace=True)

# Performance
all_data['perfnorm'] = all_data['basemean.spec_int2k6'] / all_data['tdp']

# Find the scaling factors
scaleclk = all_data['max_clock'].min()
scaletrans = all_data['transistors'].min()
scaletdp = all_data['tdp'].min()
scaleperf = all_data['basemean.spec_int2k6'].min()
scaleperfnorm = all_data['perfnorm'].min()

# Calculate relative scaling
all_data['rel_transistors'] = all_data['transistors'] / scaletrans
all_data['rel_max_clock'] = all_data['max_clock'] / scaleclk
all_data['rel_tdp'] = all_data['tdp'] / scaletdp
all_data['rel_perf'] = all_data['basemean.spec_int2k6'] / scaleperf
all_data['rel_perfnorm'] = all_data['perfnorm'] / scaleperfnorm



# Function to calculate regression line
def clean_for_regression(df, x_col, y_col):
    df = df.dropna(subset=[x_col, y_col])
    df = df[np.isfinite(df[x_col]) & np.isfinite(df[y_col])]
    return df



# Convert dates to ordinal for regression
all_data['date_ordinal'] = (all_data['date'] - all_data['date'].min()).dt.days / 365.25

# Metrics to process
#metrics = ['rel_transistors', 'rel_max_clock', 'rel_tdp', 'rel_perf', 'rel_perfnorm']
metrics = ['rel_transistors']



def calculate_regression_line(x, y):
    model = LinearRegression()
    x_reshape = x.values.reshape(-1, 1)
    model.fit(x_reshape, y)
    date_range = np.linspace(x.min(), x.max(), len(x))
    date_range_reshape = date_range.reshape(-1, 1)
    trend = model.predict(date_range_reshape)
    return date_range, trend

# Process each metric
for metric in metrics:
    metric_data = all_data[['date', 'date_ordinal', metric]].dropna()
    metric_data = clean_for_regression(metric_data, 'date_ordinal', metric)
    
    date_range, trend = calculate_regression_line(metric_data['date_ordinal'], metric_data[metric])
    
    trend_dates = pd.to_datetime(date_range * 365.25 + all_data['date'].min().toordinal(), unit='D', origin='unix')
    
    trend_df = pd.DataFrame({'date': trend_dates, f'{metric}_trend': trend})
    
    # Save to separate files
    output_data = pd.merge(metric_data[['date', metric]], trend_df, on='date', how='outer')
    output_data.to_csv(f'./cpu_performance_trend_{metric}.csv', index=False)

OutOfBoundsDatetime: cannot convert input with unit 'D'