In [20]:
import pandas as pd
import os

# Read in CSV tables
processor = pd.read_csv("c:/sandbox/cs416/data/cpudb/processor.csv")
specint2k6 = pd.read_csv("c:/sandbox/cs416/data/cpudb/spec_int2006.csv")
specint2k0 = pd.read_csv("c:/sandbox/cs416/data/cpudb/spec_int2000.csv")
specint95 = pd.read_csv("c:/sandbox/cs416/data/cpudb/spec_int1995.csv")
specint92 = pd.read_csv("c:/sandbox/cs416/data/cpudb/spec_int1992.csv")


# Rename [processor].[id] to [processor].[processor_id]
processor.rename(columns={'id': 'processor_id'}, inplace=True)


# Merge spec scores
all_data = processor.merge(specint2k6, on="processor_id", suffixes=(".proc", ".spec_int2k6"), how="outer")
all_data = all_data.merge(specint2k0, on="processor_id", suffixes=(".spec_int2k6", ".spec_int2k0"), how="outer")
all_data = all_data.merge(specint95, on="processor_id", suffixes=(".spec_int2k0", ".spec_int95"), how="outer")
all_data = all_data.merge(specint92, on="processor_id", suffixes=(".spec_int95", ".spec_int92"), how="outer")

# Fix missing date entries
all_data['date'] = pd.to_datetime(all_data['date'], errors='coerce')

# Account for potential turbo-boost clock
all_data['max_clock'].fillna(all_data['clock'], inplace=True)

# Determine scaling factors for spec92->spec95, spec95->spec2k0, and spec2k0->spec2k6
spec92to95 = all_data['basemean.spec_int95'] / all_data['basemean.spec_int92']
spec95to2k0 = all_data['basemean.spec_int2k0'] / all_data['basemean.spec_int95']
spec2k0to2k6 = all_data['basemean.spec_int2k6'] / all_data['basemean.spec_int2k0']

all_data['basemean.spec_int95'].fillna(spec92to95.mean() * all_data['basemean.spec_int92'], inplace=True)
all_data['basemean.spec_int2k0'].fillna(spec95to2k0.mean() * all_data['basemean.spec_int95'], inplace=True)
all_data['basemean.spec_int2k6'].fillna(spec2k0to2k6.mean() * all_data['basemean.spec_int2k0'], inplace=True)

# Performance
all_data['perfnorm'] = all_data['basemean.spec_int2k6'] / all_data['tdp']

# Find the scaling factors
scaleclk = all_data['max_clock'].min()
scaletrans = all_data['transistors'].min()
scaletdp = all_data['tdp'].min()
scaleperf = all_data['basemean.spec_int2k6'].min()
scaleperfnorm = all_data['perfnorm'].min()

# Calculate relative scaling
all_data['rel_transistors'] = all_data['transistors'] / scaletrans
all_data['rel_max_clock'] = all_data['max_clock'] / scaleclk
all_data['rel_tdp'] = all_data['tdp'] / scaletdp
all_data['rel_perf'] = all_data['basemean.spec_int2k6'] / scaleperf
all_data['rel_perfnorm'] = all_data['perfnorm'] / scaleperfnorm

# Select relevant columns and drop rows with missing dates
final_data = all_data[['date', 'rel_transistors', 'rel_max_clock', 'rel_tdp', 'rel_perf', 'rel_perfnorm']].dropna(subset=['date'])

# Save the preprocessed data to a new CSV file
final_data.to_csv('c:/sandbox/cs416/data/cpu_performance_trend.csv', index=False)

final_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data['max_clock'].fillna(all_data['clock'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data['basemean.spec_int95'].fillna(spec92to95.mean() * all_data['basemean.spec_int92'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will

Unnamed: 0,date,rel_transistors,rel_max_clock,rel_tdp,rel_perf,rel_perfnorm
48,2002-01-01,23913.043478,15740.740741,127.0,,
192,2005-01-04,100000.0,29629.62963,260.0,,
193,2006-01-01,163478.26087,32037.037037,260.0,,
194,2006-01-01,163478.26087,34537.037037,260.0,,
239,2007-01-01,65652.173913,14814.814815,62.0,,
