In [308]:
import pandas as pd
import os
from IPython.display import FileLink

# 📂 Folder containing your CSV files
folder_path = r"D:\MIT\DataScience\Assignement\1\Try1"

# 🎯 Labels to search for in the first column
labels  = [
    "Code Name",
    "Processor Number",
    "Overall Peak TOPS (Int8)",
    "Recommended Customer Price",
    "Total Cores",
    "Total Threads",
    "Max Turbo Frequency",
    "Cache",
    "Embedded Options Available",
    "Max Memory Size (dependent on memory type)",
    "Memory Types",
    "Maximum Memory Speed",
    "Max # of Memory Channels",
    "ECC Memory Supported   ‡",
    "GPU Name‡",
    "Graphics Base Frequency",
    "Graphics Max Dynamic Frequency",
    "Max Resolution (HDMI)‡",
    "Max Resolution (DP)‡",
    "Max Resolution (eDP - Integrated Flat Panel)‡",
    "Graphics Max Dynamic Frequency",
    "DirectX* Support",
    "OpenGL* Support",
    "Scalability",
    "PCI Express Revision",
    "Max # of PCI Express Lanes",
    "Sockets Supported",
    "Intel® Turbo Boost Max Technology 3.0 ‡",
    "Intel® 64 ‡",
    "Instruction Set",
    "Max # of PCI Express Lanes",
    "Idle States",
]

# Dictionary to store results from all files
all_results = {}

# Loop through each CSV in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        
        # Read CSV file
        df = pd.read_csv(file_path)
        
        # Store matches for this file
    
        results = {}
        for label in labels:
            match = df[df.iloc[:, 0] == label]  # match in first column
            if not match.empty:
                results[label] = match.iloc[0, 1]  # value in next cell
        
        # Add to master dictionary
        all_results[filename] = results

# Convert results to DataFrame and save
results_df = pd.DataFrame(all_results).T  # transpose so files are rows
results_df.to_csv("all_results.csv", index=True)

# 📥 Clickable download link for Jupyter
FileLink("all_results.csv")
pd.set_option('display.max_rows', None)
print(results_df.shape)
#results_df.columns.tolist()

(29, 30)


In [309]:
print(results_df.head())

                                                                      Code Name  \
Intel_UPE_SpecificationsChart_2025_08_08 (1).csv   Products formerly Lunar Lake   
Intel_UPE_SpecificationsChart_2025_08_08 (10).csv  Products formerly Arrow Lake   
Intel_UPE_SpecificationsChart_2025_08_08 (11).csv  Products formerly Arrow Lake   
Intel_UPE_SpecificationsChart_2025_08_08 (12).csv  Products formerly Arrow Lake   
Intel_UPE_SpecificationsChart_2025_08_08 (13).csv  Products formerly Arrow Lake   

                                                  Processor Number  \
Intel_UPE_SpecificationsChart_2025_08_08 (1).csv              288V   
Intel_UPE_SpecificationsChart_2025_08_08 (10).csv             265T   
Intel_UPE_SpecificationsChart_2025_08_08 (11).csv             265K   
Intel_UPE_SpecificationsChart_2025_08_08 (12).csv            265HX   
Intel_UPE_SpecificationsChart_2025_08_08 (13).csv             265H   

                                                  Overall Peak TOPS (Int8)  \
In

In [312]:
num_cols = results_df.select_dtypes(include=["number"]).columns
results_df[num_cols] = results_df[num_cols].fillna(0)

# For categorical (object/string) columns → fill NaN with "NULL"
cat_cols = results_df.select_dtypes(include=["object"]).columns

results_df[cat_cols] = results_df[cat_cols].fillna("NULL")
results_df = results_df.fillna(0)


In [313]:
bool_cols = ["Embedded Options Available","ECC Memory Supported   ‡"]

def FormatBoolColumns():
 for col in bool_cols:
    results_df[col] = (
            results_df[col]
            .astype(str)               # convert to string first
            .str.strip()                # remove spaces
            .str.lower()                # normalize case
            .replace({"yes": 1, "no": 0, "true": 1, "false": 0, "null":0, "nan":0})
            .fillna(0)
            .astype(int)
        )

FormatBoolColumns();

In [297]:
results_df["Max Memory Size (dependent on memory type)"]


Intel_UPE_SpecificationsChart_2025_08_08 (1).csv       32 GB
Intel_UPE_SpecificationsChart_2025_08_08 (10).csv     256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (11).csv     256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (12).csv     256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (13).csv     128 GB
Intel_UPE_SpecificationsChart_2025_08_08 (14).csv     256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (15).csv      32 GB
Intel_UPE_SpecificationsChart_2025_08_08 (2).csv      256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (3).csv      128 GB
Intel_UPE_SpecificationsChart_2025_08_08 (4).csv      256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (5).csv      256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (6).csv      256 GB
Intel_UPE_SpecificationsChart_2025_08_08 (7).csv       32 GB
Intel_UPE_SpecificationsChart_2025_08_08 (8).csv       16 GB
Intel_UPE_SpecificationsChart_2025_08_11 (10).csv    1.13 TB
Intel_UPE_SpecificationsChart_2025_08_11 (23).csv       4 TB
Intel_UPE_Specifications

In [325]:
# Pre processing the data columns
# remove GB from memory values
results_df["Max Memory Size (dependent on memory type)"] = (
    results_df["Max Memory Size (dependent on memory type)"]
    .str.extract(r'(\d+)')   # extract numbers only
    .astype(int)
)



AttributeError: Can only use .str accessor with string values!

In [316]:
#Remove the Hz from the values
results_df["Max Turbo Frequency"] = (
    results_df["Max Turbo Frequency"]
    .str.extract(r'(\d+)')   # extract numbers only
    .astype(int)
)


In [317]:

#Remove the Hz from the values
results_df["Graphics Base Frequency"] = (
    results_df["Graphics Base Frequency"]
    .str.extract(r'(\d+)')   # extract numbers only
    .astype(float)
)


In [318]:
results_df["Embedded Options Available"]

Intel_UPE_SpecificationsChart_2025_08_08 (1).csv     0
Intel_UPE_SpecificationsChart_2025_08_08 (10).csv    1
Intel_UPE_SpecificationsChart_2025_08_08 (11).csv    0
Intel_UPE_SpecificationsChart_2025_08_08 (12).csv    0
Intel_UPE_SpecificationsChart_2025_08_08 (13).csv    1
Intel_UPE_SpecificationsChart_2025_08_08 (14).csv    0
Intel_UPE_SpecificationsChart_2025_08_08 (15).csv    0
Intel_UPE_SpecificationsChart_2025_08_08 (2).csv     0
Intel_UPE_SpecificationsChart_2025_08_08 (3).csv     1
Intel_UPE_SpecificationsChart_2025_08_08 (4).csv     1
Intel_UPE_SpecificationsChart_2025_08_08 (5).csv     0
Intel_UPE_SpecificationsChart_2025_08_08 (6).csv     0
Intel_UPE_SpecificationsChart_2025_08_08 (7).csv     0
Intel_UPE_SpecificationsChart_2025_08_08 (8).csv     0
Intel_UPE_SpecificationsChart_2025_08_11 (10).csv    1
Intel_UPE_SpecificationsChart_2025_08_11 (23).csv    0
Intel_UPE_SpecificationsChart_2025_08_11 (24).csv    0
Intel_UPE_SpecificationsChart_2025_08_11 (25).csv    0
Intel_UPE_

In [319]:
results_df[["Total Cores", "Total Threads", "Max Memory Size (dependent on memory type)","Max Turbo Frequency",
                "Embedded Options Available","Overall Peak TOPS (Int8)","ECC Memory Supported   ‡"]]

Unnamed: 0,Total Cores,Total Threads,Max Memory Size (dependent on memory type),Max Turbo Frequency,Embedded Options Available,Overall Peak TOPS (Int8),ECC Memory Supported ‡
Intel_UPE_SpecificationsChart_2025_08_08 (1).csv,8,8,32,5,0,120.0,0
Intel_UPE_SpecificationsChart_2025_08_08 (10).csv,20,20,256,5,1,33.0,1
Intel_UPE_SpecificationsChart_2025_08_08 (11).csv,20,20,256,5,0,33.0,1
Intel_UPE_SpecificationsChart_2025_08_08 (12).csv,20,20,256,5,0,33.0,1
Intel_UPE_SpecificationsChart_2025_08_08 (13).csv,16,16,128,5,1,97.0,0
Intel_UPE_SpecificationsChart_2025_08_08 (14).csv,20,20,256,5,0,25.0,0
Intel_UPE_SpecificationsChart_2025_08_08 (15).csv,8,8,32,4,0,115.0,0
Intel_UPE_SpecificationsChart_2025_08_08 (2).csv,24,24,256,5,0,36.0,1
Intel_UPE_SpecificationsChart_2025_08_08 (3).csv,16,16,128,5,1,99.0,0
Intel_UPE_SpecificationsChart_2025_08_08 (4).csv,24,24,256,5,1,36.0,1


In [320]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# Features (X) and Target (y)
X = results_df[["Total Cores", "Total Threads", "Max Memory Size (dependent on memory type)","Max Turbo Frequency",
                "Embedded Options Available","ECC Memory Supported   ‡"]]
y = results_df["Processor Number"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)



  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type =

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [324]:
# Predict example input
user_input = pd.DataFrame([{
    "Total Cores": 64,
    "Total Threads": 128,
    "Max Memory Size (dependent on memory type)":256,
    "Max Turbo Frequency":5,
    "Embedded Options Available": 1,
     "ECC Memory Supported   ‡": 1
}])

prediction = model.predict(user_input)
print("Recommended CPU:", prediction[0])


Recommended CPU: 285K
