# Cleaning data for models and visualization

First steps in cleaning data are to make all the values that can be entirly numeric

Before we do that we must import our utility scripts

In [1]:
import sys
import os
import pandas as pd

project_dir = '/home/atoris/course-project-thomas-wright/src'
if project_dir not in sys.path:
    sys.path.insert(0, project_dir)

In [2]:
import datautil as du

But first we must be able to load our data

In [3]:
url = 'https://www.cpubenchmark.net/mid_range_cpus.html'
filename = 'cpu_data.csv'

df = du.load_data(url, filename)
df.head(5)

/usr/home/atoris/course-project-thomas-wright/data/cpu_data.csv


Unnamed: 0,name,price,Class,Clockspeed,Turbo Speed,Threads,Cores,Typical TDP,mt_score,st_score,Socket
0,AMD Opteron 6344,$379.00*,Server,2.6,3.2,1,1,115W,6069,1231,
1,Intel Core i5-8250U @ 1.60GHz,,Laptop,1.6,3.4,8,4,15W3,6067,1981,FC-BGA1356
2,Intel Core i5-6600 @ 3.30GHz,$262.13*,Desktop,3.3,3.9,4,4,65W,6059,2261,LGA 1151
3,Intel Xeon E3-1220 v6 @ 3.00GHz,$214.99,Server,3.0,3.5,4,4,72W,6048,2072,FCLGA1151
4,Intel Xeon E5-2440 @ 2.40GHz,$741.00*,Server,2.4,2.7,12,6,95W,6040,1298,LGA 1356


It looks like we could remove the units from price and TDP and encode our Class with a OneHot encode,
I will start with the units

In [4]:
for index, value in df.iterrows():
    price = str(value['price']).replace("*", "")
    TDP = str(value['Typical TDP'])
    
    if price[0] == "$":
        price = price[1:]
    if TDP[-1:] == "W":
        TDP = TDP[:-1]
    
    
    df.at[index, 'price'] = price
    df.at[index, 'Typical TDP'] = TDP


In [5]:
df.head(5)

Unnamed: 0,name,price,Class,Clockspeed,Turbo Speed,Threads,Cores,Typical TDP,mt_score,st_score,Socket
0,AMD Opteron 6344,379.0,Server,2.6,3.2,1,1,115,6069,1231,
1,Intel Core i5-8250U @ 1.60GHz,,Laptop,1.6,3.4,8,4,15W3,6067,1981,FC-BGA1356
2,Intel Core i5-6600 @ 3.30GHz,262.13,Desktop,3.3,3.9,4,4,65,6059,2261,LGA 1151
3,Intel Xeon E3-1220 v6 @ 3.00GHz,214.99,Server,3.0,3.5,4,4,72,6048,2072,FCLGA1151
4,Intel Xeon E5-2440 @ 2.40GHz,741.0,Server,2.4,2.7,12,6,95,6040,1298,LGA 1356


This data looks good except the names not look a bit messy and contain redudant data so we will split the data up into Brand and name and remove the clock speed as it's data already has a column

In [6]:
for index, value in df.iterrows():
    name = str(value['name'])
    
    if name[0] == 'I':
        df.at[index, 'brand'] = 'Intel'
        name = name[6:]
    if name[0] == "A":
        df.at[index, 'brand'] = "Amd"
        name = name[4:]
        
    name = name.split("@")
    df.at[index, 'name'] = name[0]
    
df.head(5)

Unnamed: 0,name,price,Class,Clockspeed,Turbo Speed,Threads,Cores,Typical TDP,mt_score,st_score,Socket,brand
0,Opteron 6344,379.0,Server,2.6,3.2,1,1,115,6069,1231,,Amd
1,Core i5-8250U,,Laptop,1.6,3.4,8,4,15W3,6067,1981,FC-BGA1356,Intel
2,Core i5-6600,262.13,Desktop,3.3,3.9,4,4,65,6059,2261,LGA 1151,Intel
3,Xeon E3-1220 v6,214.99,Server,3.0,3.5,4,4,72,6048,2072,FCLGA1151,Intel
4,Xeon E5-2440,741.0,Server,2.4,2.7,12,6,95,6040,1298,LGA 1356,Intel


Now that the data has been cleaned we can save it.

In [7]:
du.save_df(df, 'cpu_data_cleaned.csv')

Now we can implement pandas OneHotEncode on our class and brand

In [8]:
pd.get_dummies(df['Class'], prefix='class')
df = pd.concat([df,pd.get_dummies(df['Class'], prefix='class')], axis=1)
df = pd.concat([df,pd.get_dummies(df['brand'], prefix='brand')], axis=1)

In [9]:
df.head()

Unnamed: 0,name,price,Class,Clockspeed,Turbo Speed,Threads,Cores,Typical TDP,mt_score,st_score,Socket,brand,class_Desktop,class_Laptop,class_Server,brand_Amd,brand_Intel
0,Opteron 6344,379.0,Server,2.6,3.2,1,1,115,6069,1231,,Amd,0,0,1,1,0
1,Core i5-8250U,,Laptop,1.6,3.4,8,4,15W3,6067,1981,FC-BGA1356,Intel,0,1,0,0,1
2,Core i5-6600,262.13,Desktop,3.3,3.9,4,4,65,6059,2261,LGA 1151,Intel,1,0,0,0,1
3,Xeon E3-1220 v6,214.99,Server,3.0,3.5,4,4,72,6048,2072,FCLGA1151,Intel,0,0,1,0,1
4,Xeon E5-2440,741.0,Server,2.4,2.7,12,6,95,6040,1298,LGA 1356,Intel,0,0,1,0,1


Now that our class has been OneHot encoded we can drop our old collumns

In [10]:
df.drop(['Class'],axis=1,inplace=True)
df.drop(['brand'],axis=1,inplace=True)
df.head()

Unnamed: 0,name,price,Clockspeed,Turbo Speed,Threads,Cores,Typical TDP,mt_score,st_score,Socket,class_Desktop,class_Laptop,class_Server,brand_Amd,brand_Intel
0,Opteron 6344,379.0,2.6,3.2,1,1,115,6069,1231,,0,0,1,1,0
1,Core i5-8250U,,1.6,3.4,8,4,15W3,6067,1981,FC-BGA1356,0,1,0,0,1
2,Core i5-6600,262.13,3.3,3.9,4,4,65,6059,2261,LGA 1151,1,0,0,0,1
3,Xeon E3-1220 v6,214.99,3.0,3.5,4,4,72,6048,2072,FCLGA1151,0,0,1,0,1
4,Xeon E5-2440,741.0,2.4,2.7,12,6,95,6040,1298,LGA 1356,0,0,1,0,1


Saving the encoded data

In [11]:
du.save_df(df, 'cpu_data_encoded.csv')

Verify project structure

In [12]:
!tree ../

[01;34m../[00m
├── [01;32mLICENSE[00m
├── [01;32mREADME.md[00m
├── [01;34mdata[00m
│   ├── cpu_data.csv
│   ├── cpu_data_cleaned.csv
│   └── cpu_data_encoded.csv
├── [01;34mmodels[00m
├── [01;34mnotebooks[00m
│   ├── [01;32m0_setup_project_folders.ipynb[00m
│   ├── 1_Retrieving_data.ipynb
│   ├── 2_Cleaning_data.ipynb
│   ├── 3_Simple_Graphs.ipynb
│   └── [01;34m__pycache__[00m
│       └── datautil.cpython-37.pyc
├── [01;34mreport[00m
│   ├── [01;32minterim.md[00m
│   ├── [01;32mproposal.md[00m
│   └── [01;32mreport.md[00m
└── [01;34msrc[00m
    ├── [01;34m__pycache__[00m
    │   └── datautil.cpython-37.pyc
    └── datautil.py

7 directories, 15 files
