# **EDA.**

## **Librerías.**

In [1]:
import pandas as pd
import numpy as np
import regex as re

import seaborn as sns
import matplotlib.pyplot as plt

import warnings  
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

## **Cargar datos.**

In [2]:
# cargar datos "train" y "test"
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              325 non-null    object
 1   Model Name                325 non-null    object
 2   Category                  325 non-null    object
 3   Screen Size               325 non-null    object
 4   Screen                    325 non-null    object
 5   CPU                       325 non-null    object
 6   RAM                       325 non-null    object
 7    Storage                  325 non-null    object
 8   GPU                       325 non-null    object
 9   Operating System          325 non-null    object
 10  Operating System Version  291 non-null    object
 11  Weight                    325 non-null    object
dtypes: object(12)
memory usage: 30.6+ KB


## **Transformar datos.**

### **Train.**

In [5]:
train.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [7]:
train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Manufacturer,977.0,19.0,Dell,232.0,,,,,,,
Model Name,977.0,488.0,XPS 13,22.0,,,,,,,
Category,977.0,6.0,Notebook,549.0,,,,,,,
Screen Size,977.0,18.0,"15.6""",494.0,,,,,,,
Screen,977.0,38.0,Full HD 1920x1080,397.0,,,,,,,
CPU,977.0,106.0,Intel Core i5 7200U 2.5GHz,151.0,,,,,,,
RAM,977.0,8.0,8GB,483.0,,,,,,,
Storage,977.0,36.0,256GB SSD,318.0,,,,,,,
GPU,977.0,98.0,Intel HD Graphics 620,225.0,,,,,,,
Operating System,977.0,7.0,Windows,837.0,,,,,,,


In [8]:
# cambiar nombre columna
train.rename(columns=lambda x: x.strip() if isinstance(x, str) else x, inplace=True)

train.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price'],
      dtype='object')

In [9]:
# añadir guion bajo "_" en lugar de espacio en nombres de columnas
train.rename(columns=lambda x: x.replace(" ", "_") if isinstance(x, str) else x, inplace=True)

train.columns

Index(['Manufacturer', 'Model_Name', 'Category', 'Screen_Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating_System',
       'Operating_System_Version', 'Weight', 'Price'],
      dtype='object')

In [10]:
# observar columna "Operating_System_Version"
train['Operating_System_Version'].unique()

array([nan, '10', 'X', '10 S', '7'], dtype=object)

In [11]:
train.Operating_System_Version.value_counts()

10      819
7        10
10 S      8
X         4
Name: Operating_System_Version, dtype: int64

In [12]:
# "Operating_System_Version" no aporta valor, ademas teniendo nulos, la eliminamos
train = train.drop('Operating_System_Version', axis=1)

train.head()

Unnamed: 0,Manufacturer,Model_Name,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [13]:
# observar columna "Manufacturer"
train.Manufacturer.value_counts()

Dell         232
Lenovo       226
HP           179
Asus         122
Acer          83
Toshiba       39
MSI           34
Apple         17
Samsung        8
Mediacom       7
Microsoft      6
Razer          5
Xiaomi         4
Chuwi          3
Google         3
LG             3
Huawei         2
Vero           2
Fujitsu        2
Name: Manufacturer, dtype: int64

In [14]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "Manufacturer"
le = LabelEncoder()

train['Manufacturer'] = le.fit_transform(train['Manufacturer'])

train.head()

Unnamed: 0,Manufacturer,Model_Name,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [15]:
# observar columna "Model_Name"
train.Model_Name.value_counts()

XPS 13                22
Inspiron 3567         20
250 G6                18
Inspiron 5570         18
Vostro 3568           15
                      ..
Rog GL753VE-GC070T     1
Rog GL753VD-GC042T     1
Precision 5520         1
Precision 3510         1
ThinkPad T460          1
Name: Model_Name, Length: 488, dtype: int64

In [16]:
# "Model_Name" no aporta valor, la eliminamos
train = train.drop('Model_Name', axis=1)

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [17]:
# observar columna "Category"
train.Category.value_counts()

Notebook              549
Ultrabook             152
Gaming                150
2 in 1 Convertible     85
Workstation            24
Netbook                17
Name: Category, dtype: int64

In [18]:
# transformar valores de "Category" para simplificar
category_train = train.Category.value_counts()[:3]

train['Category']= train['Category'].apply(lambda x: x if x in category_train else 'Other')

train.Category.value_counts()

Notebook     549
Ultrabook    152
Gaming       150
Other        126
Name: Category, dtype: int64

In [19]:
# transformar de categorico a numerico la columna "Category"
category_train = {'Notebook' : 1, 'Ultrabook' : 2, 'Gaming' : 3, 'Other' : 4}

train.Category = train.Category.apply(lambda x : category_train[x])

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,2,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,2,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,1,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,2,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,2,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [20]:
# observar columna "Screen_Size"
train.Screen_Size.value_counts()

15.6"    494
14.0"    146
17.3"    131
13.3"    123
11.6"     27
12.5"     23
13.5"      6
13.9"      6
15.4"      4
12.3"      4
12.0"      3
15.0"      3
13.0"      2
10.1"      1
18.4"      1
17.0"      1
14.1"      1
11.3"      1
Name: Screen_Size, dtype: int64

In [21]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "Screen_Size"
le = LabelEncoder()

train['Screen_Size'] = le.fit_transform(train['Screen_Size'])

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,2,7,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,2,7,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,1,14,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,2,13,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,2,7,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [22]:
# observar columna "Screen"
train.Screen.value_counts()

Full HD 1920x1080                                397
IPS Panel Full HD 1920x1080                      181
1366x768                                         180
Full HD / Touchscreen 1920x1080                   38
IPS Panel Full HD / Touchscreen 1920x1080         35
1600x900                                          19
Touchscreen 1366x768                              15
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     11
IPS Panel 4K Ultra HD 3840x2160                   10
4K Ultra HD / Touchscreen 3840x2160                9
Quad HD+ / Touchscreen 3200x1800                   8
IPS Panel 1366x768                                 7
IPS Panel Retina Display 2560x1600                 6
Touchscreen 2256x1504                              6
Touchscreen 2560x1440                              5
4K Ultra HD 3840x2160                              5
1440x900                                           4
IPS Panel Retina Display 2880x1800                 4
IPS Panel 2560x1440                           

In [23]:
# "Screen" no aporta valor, ademas teniendo la columna "Screen_Size", la eliminamos
train = train.drop('Screen', axis=1)

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,2,7,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,2,7,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,1,14,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,2,13,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,2,7,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [24]:
# observar columna "CPU"
train.CPU.value_counts()

Intel Core i5 7200U 2.5GHz     151
Intel Core i7 7700HQ 2.8GHz    123
Intel Core i7 7500U 2.7GHz     103
Intel Core i7 8550U 1.8GHz      71
Intel Core i5 8250U 1.6GHz      71
                              ... 
Intel Core i7 2.9GHz             1
AMD E-Series 7110 1.8GHz         1
Intel Core i5 2.9GHz             1
Intel Core M m3 1.2GHz           1
Intel Core i5 7200U 2.70GHz      1
Name: CPU, Length: 106, dtype: int64

In [25]:
train['CPU'].unique()

array(['Intel Core i5 2.3GHz', 'Intel Core i5 1.8GHz',
       'Intel Core i5 7200U 2.5GHz', 'Intel Core i7 2.7GHz',
       'Intel Core i5 3.1GHz', 'AMD A9-Series 9420 3GHz',
       'Intel Core i7 2.2GHz', 'Intel Core i7 8550U 1.8GHz',
       'Intel Core i5 8250U 1.6GHz', 'Intel Core i3 6006U 2GHz',
       'Intel Core i7 2.8GHz', 'Intel Core M m3 1.2GHz',
       'Intel Core i7 7500U 2.7GHz', 'Intel Core i7 2.9GHz',
       'Intel Core i3 7100U 2.4GHz', 'Intel Atom x5-Z8350 1.44GHz',
       'Intel Core i5 7300HQ 2.5GHz', 'AMD E-Series E2-9000e 1.5GHz',
       'Intel Core i5 1.6GHz', 'Intel Core i7 8650U 1.9GHz',
       'Intel Atom x5-Z8300 1.44GHz', 'AMD E-Series E2-6110 1.5GHz',
       'AMD A6-Series 9220 2.5GHz',
       'Intel Celeron Dual Core N3350 1.1GHz',
       'Intel Core i3 7130U 2.7GHz', 'Intel Core i7 7700HQ 2.8GHz',
       'Intel Core i5 2.0GHz', 'AMD Ryzen 1700 3GHz',
       'Intel Pentium Quad Core N4200 1.1GHz',
       'Intel Atom x5-Z8550 1.44GHz',
       'Intel Celeron Du

In [26]:
# transformar valores de "CPU" para simplificar
train['CPU'] = train['CPU'].str.replace(r'.*Intel Core i.*', 'Intel Core i', regex=True)\
                                             .str.replace(r'.*Intel Core M.*', 'Intel Core M', regex=True)\
                                             .str.replace(r'.*Intel Atom.*', 'Intel Atom', regex=True)\
                                             .str.replace(r'.*Intel Celeron.*', 'Intel Celeron', regex=True)\
                                             .str.replace(r'.*Intel Pentium.*', 'Intel Pentium', regex=True)\
                                             .str.replace(r'.*Intel Xeon.*', 'Intel Xeon', regex=True)\
                                             .str.replace(r'.*AMD A.*', 'AMD A', regex=True)\
                                             .str.replace(r'.*AMD E-Series.*', 'AMD E-Series', regex=True)\
                                             .str.replace(r'.*AMD Ryzen.*', 'AMD Ryzen', regex=True)\
                                             .str.replace(r'.*AMD FX.*', 'AMD FX', regex=True)

train['CPU'].unique()

array(['Intel Core i', 'AMD A', 'Intel Core M', 'Intel Atom',
       'AMD E-Series', 'Intel Celeron', 'AMD Ryzen', 'Intel Pentium',
       'AMD FX', 'Intel Xeon'], dtype=object)

In [27]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "CPU"
le = LabelEncoder()

train['CPU'] = le.fit_transform(train['CPU'])

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,2,7,7,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,2,7,7,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,1,14,7,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,2,13,7,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,2,7,7,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [28]:
# observar columna "RAM"
train.RAM.value_counts()

8GB     483
4GB     271
16GB    149
6GB      24
12GB     21
32GB     16
2GB      10
24GB      3
Name: RAM, dtype: int64

In [29]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "RAM"
le = LabelEncoder()

train['RAM'] = le.fit_transform(train['RAM'])

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,1,2,7,7,7,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,2,7,7,7,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,1,14,7,7,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,2,13,7,1,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,2,7,7,7,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [30]:
# observar columna "Storage"
train.Storage.value_counts()

256GB SSD                        318
1TB HDD                          171
512GB SSD                         91
500GB HDD                         79
128GB SSD +  1TB HDD              71
128GB SSD                         62
256GB SSD +  1TB HDD              52
32GB Flash Storage                29
2TB HDD                           14
512GB SSD +  1TB HDD              13
1TB SSD                           11
256GB SSD +  2TB HDD              10
64GB Flash Storage                 9
16GB Flash Storage                 6
256GB Flash Storage                5
32GB SSD                           5
128GB Flash Storage                4
1GB SSD                            4
256GB SSD +  256GB SSD             2
16GB SSD                           2
1TB SSD +  1TB HDD                 2
512GB SSD +  2TB HDD               2
128GB SSD +  2TB HDD               2
512GB SSD +  512GB SSD             1
512GB SSD +  256GB SSD             1
64GB SSD                           1
64GB Flash Storage +  1TB HDD      1
2

In [31]:
# "Storage" no aporta valor, la eliminamos
train = train.drop('Storage', axis=1)

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Operating_System,Weight,Price
0,1,2,7,7,7,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,1,2,7,7,7,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,7,1,14,7,7,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,1,2,13,7,1,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,1,2,7,7,7,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [32]:
# observar columna "GPU"
train.GPU.value_counts()

Intel HD Graphics 620             225
Intel HD Graphics 520              98
Intel UHD Graphics 620             66
Nvidia GeForce GTX 1050            57
AMD Radeon 530                     37
                                 ... 
Nvidia GeForce GTX 960<U+039C>      1
Intel Graphics 620                  1
Nvidia GeForce GTX 960              1
AMD Radeon R5 520                   1
AMD Radeon R9 M385                  1
Name: GPU, Length: 98, dtype: int64

In [33]:
train['GPU'].unique()

array(['Intel Iris Plus Graphics 640', 'Intel HD Graphics 6000',
       'Intel HD Graphics 620', 'AMD Radeon Pro 455',
       'Intel Iris Plus Graphics 650', 'AMD Radeon R5',
       'Intel Iris Pro Graphics', 'Nvidia GeForce MX150',
       'Intel UHD Graphics 620', 'Intel HD Graphics 520',
       'AMD Radeon Pro 555', 'AMD Radeon R5 M430',
       'Intel HD Graphics 615', 'AMD Radeon Pro 560',
       'Nvidia GeForce 940MX', 'Intel HD Graphics 400',
       'Nvidia GeForce GTX 1050', 'AMD Radeon R2', 'AMD Radeon 530',
       'Nvidia GeForce 930MX', 'Intel HD Graphics',
       'Intel HD Graphics 500', 'Nvidia GeForce 930MX ',
       'Nvidia GeForce GTX 1060', 'Nvidia GeForce 150MX',
       'Intel Iris Graphics 540', 'AMD Radeon RX 580',
       'Nvidia GeForce 920MX', 'AMD Radeon R4 Graphics', 'AMD Radeon 520',
       'Nvidia GeForce GTX 1070', 'Nvidia GeForce GTX 1050 Ti',
       'Nvidia GeForce MX130', 'AMD R4 Graphics',
       'Nvidia GeForce GTX 940MX', 'AMD Radeon RX 560',
       'Nvid

In [34]:
# transformar valores de "GPU" para simplificar
train['GPU'] = train['GPU'].str.replace(r'.*Intel Iris Plus.*', 'Intel Iris Plus', regex=True)\
                                             .str.replace(r'.*Intel Iris Pro.*', 'Intel Iris Pro', regex=True)\
                                             .str.replace(r'.*Intel HD.*', 'Intel HD', regex=True)\
                                             .str.replace(r'.*Intel UHD.*', 'Intel UHD', regex=True)\
                                             .str.replace(r'.*Nvidia GeForce.*', 'Nvidia GeForce', regex=True)\
                                             .str.replace(r'.*Intel Iris Graphics.*', 'Intel Iris Graphics', regex=True)\
                                             .str.replace(r'.*AMD FirePro.*', 'AMD FirePro', regex=True)\
                                             .str.replace(r'.*Nvidia GTX.*', 'Nvidia GTX', regex=True)\
                                             .str.replace(r'.*Nvidia Quadro.*', 'Nvidia Quadro', regex=True)\
                                             .str.replace(r'.*Intel Graphics.*', 'Intel Graphics', regex=True)\
                                             .str.replace(r'.*AMD R.*', 'AMD R', regex=True)
                                             

train['GPU'].unique()

array(['Intel Iris Plus', 'Intel HD', 'AMD R', 'Intel Iris Pro',
       'Nvidia GeForce', 'Intel UHD', 'Intel Iris Graphics', 'Nvidia GTX',
       'Nvidia Quadro', 'AMD FirePro', 'Intel Graphics'], dtype=object)

In [35]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "GPU"
le = LabelEncoder()

train['GPU'] = le.fit_transform(train['GPU'])

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Operating_System,Weight,Price
0,1,2,7,7,7,5,macOS,1.37kg,11912523.48
1,1,2,7,7,7,3,macOS,1.34kg,7993374.48
2,7,1,14,7,7,3,No OS,1.86kg,5112900.0
3,1,2,13,7,1,1,macOS,1.83kg,22563005.4
4,1,2,7,7,7,5,macOS,1.37kg,16037611.2


In [36]:
# observar columna "Operating_System"
train.Operating_System.value_counts()

Windows      837
No OS         52
Linux         48
Chrome OS     22
macOS         13
Mac OS         4
Android        1
Name: Operating_System, dtype: int64

In [37]:
# transformar valores de "Operating_System" para simplificar
os_train = {'Mac OS' : 'macOS', 'Chrome OS' : "Google's OS", 'Android' : "Google's OS"}

train['Operating_System'] = train['Operating_System'].replace(os_train)

train.Operating_System.value_counts()

Windows        837
No OS           52
Linux           48
Google's OS     23
macOS           17
Name: Operating_System, dtype: int64

In [38]:
# transformar de categorico a numerico la columna "Operating_System"
train = pd.get_dummies(train, columns=['Operating_System'], drop_first=True)

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Weight,Price,Operating_System_Linux,Operating_System_No OS,Operating_System_Windows,Operating_System_macOS
0,1,2,7,7,7,5,1.37kg,11912523.48,0,0,0,1
1,1,2,7,7,7,3,1.34kg,7993374.48,0,0,0,1
2,7,1,14,7,7,3,1.86kg,5112900.0,0,1,0,0
3,1,2,13,7,1,1,1.83kg,22563005.4,0,0,0,1
4,1,2,7,7,7,5,1.37kg,16037611.2,0,0,0,1


In [39]:
# observar columna "Weight"
train.Weight.value_counts()

2.2kg     98
2.1kg     45
2.5kg     32
2kg       31
2.4kg     27
          ..
2.29kg     1
0.97kg     1
3.25kg     1
1.14kg     1
1.70kg     1
Name: Weight, Length: 166, dtype: int64

In [40]:
# "Weight" no aporta valor, la eliminamos
train = train.drop('Weight', axis=1)

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Price,Operating_System_Linux,Operating_System_No OS,Operating_System_Windows,Operating_System_macOS
0,1,2,7,7,7,5,11912523.48,0,0,0,1
1,1,2,7,7,7,3,7993374.48,0,0,0,1
2,7,1,14,7,7,3,5112900.0,0,1,0,0
3,1,2,13,7,1,1,22563005.4,0,0,0,1
4,1,2,7,7,7,5,16037611.2,0,0,0,1


In [41]:
# convertir columnas a entero (excluyendo columna "Price")
columns_convert = [col for col in train.columns if col != 'Price']

# convertir columnas seleccionadas a entero
train[columns_convert] = train[columns_convert].astype(int)

train.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Price,Operating_System_Linux,Operating_System_No OS,Operating_System_Windows,Operating_System_macOS
0,1,2,7,7,7,5,11912523.48,0,0,0,1
1,1,2,7,7,7,3,7993374.48,0,0,0,1
2,7,1,14,7,7,3,5112900.0,0,1,0,0
3,1,2,13,7,1,1,22563005.4,0,0,0,1
4,1,2,7,7,7,5,16037611.2,0,0,0,1


In [42]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    int64  
 1   Category                  977 non-null    int64  
 2   Screen_Size               977 non-null    int64  
 3   CPU                       977 non-null    int64  
 4   RAM                       977 non-null    int64  
 5   GPU                       977 non-null    int64  
 6   Price                     977 non-null    float64
 7   Operating_System_Linux    977 non-null    int64  
 8   Operating_System_No OS    977 non-null    int64  
 9   Operating_System_Windows  977 non-null    int64  
 10  Operating_System_macOS    977 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 84.1 KB


### **Test.**

In [43]:
test.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg
1,Asus,Rog GL753VE-DS74,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg
2,Dell,Inspiron 7579,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg
3,Toshiba,Portege Z30-C-1CV,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg
4,Lenovo,IdeaPad 320-15ABR,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg


In [44]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              325 non-null    object
 1   Model Name                325 non-null    object
 2   Category                  325 non-null    object
 3   Screen Size               325 non-null    object
 4   Screen                    325 non-null    object
 5   CPU                       325 non-null    object
 6   RAM                       325 non-null    object
 7    Storage                  325 non-null    object
 8   GPU                       325 non-null    object
 9   Operating System          325 non-null    object
 10  Operating System Version  291 non-null    object
 11  Weight                    325 non-null    object
dtypes: object(12)
memory usage: 30.6+ KB


In [45]:
# cambiar nombre columna
test.rename(columns=lambda x: x.strip() if isinstance(x, str) else x, inplace=True)

test.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight'],
      dtype='object')

In [46]:
# añadir guion bajo "_" en lugar de espacio en nombres de columnas
test.rename(columns=lambda x: x.replace(" ", "_") if isinstance(x, str) else x, inplace=True)

test.columns

Index(['Manufacturer', 'Model_Name', 'Category', 'Screen_Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating_System',
       'Operating_System_Version', 'Weight'],
      dtype='object')

In [47]:
# "Operating_System_Version" no aporta valor, ademas teniendo nulos, la eliminamos
test = test.drop('Operating_System_Version', axis=1)

test.head()

Unnamed: 0,Manufacturer,Model_Name,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,Asus,Rog GL753VE-DS74,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,Dell,Inspiron 7579,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,Toshiba,Portege Z30-C-1CV,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,Lenovo,IdeaPad 320-15ABR,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [48]:
# "Model_Name" no aporta valor, la eliminamos
test = test.drop('Model_Name', axis=1)

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight
0,HP,Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,Asus,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,Dell,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,Toshiba,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,Lenovo,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [49]:
# aplicar "LabelEncoder" para transformar de categorico a numerico a columna "Manufacturer"
le = LabelEncoder()

test['Manufacturer'] = le.fit_transform(test['Manufacturer'])

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight
0,5,Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,2,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,10,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,6,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [50]:
# observar columna "Category"
test.Category.value_counts()

Notebook              178
Gaming                 55
Ultrabook              44
2 in 1 Convertible     35
Netbook                 8
Workstation             5
Name: Category, dtype: int64

In [51]:
# transformar valores de "Category" para simplificar
category_test = test.Category.value_counts()[:3]

test['Category']= test['Category'].apply(lambda x: x if x in category_test else 'Other')

test.Category.value_counts()

Notebook     178
Gaming        55
Other         48
Ultrabook     44
Name: Category, dtype: int64

In [52]:
# transformar de categorico a numerico la columna "Category"
category_test = {'Notebook' : 1, 'Ultrabook' : 2, 'Gaming' : 3, 'Other' : 4}

test.Category = test.Category.apply(lambda x : category_test[x])

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight
0,5,1,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,2,3,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,4,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,10,1,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,6,1,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [53]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "Screen_Size"
le = LabelEncoder()

test['Screen_Size'] = le.fit_transform(test['Screen_Size'])

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight
0,5,1,8,1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,2,3,9,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,4,8,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,10,1,5,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,6,1,8,Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [54]:
# "Screen" no aporta valor, ademas teniendo la columna "Screen_Size", la eliminamos
test = test.drop('Screen', axis=1)

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,Storage,GPU,Operating_System,Weight
0,5,1,8,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,2,3,9,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,4,8,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,10,1,5,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,6,1,8,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [55]:
# transformar valores de "CPU" para simplificar
test['CPU'] = test['CPU'].str.replace(r'.*Intel Core i.*', 'Intel Core i', regex=True)\
                                             .str.replace(r'.*Intel Core M.*', 'Intel Core M', regex=True)\
                                             .str.replace(r'.*Intel Atom.*', 'Intel Atom', regex=True)\
                                             .str.replace(r'.*Intel Celeron.*', 'Intel Celeron', regex=True)\
                                             .str.replace(r'.*Intel Pentium.*', 'Intel Pentium', regex=True)\
                                             .str.replace(r'.*Intel Xeon.*', 'Intel Xeon', regex=True)\
                                             .str.replace(r'.*AMD A.*', 'AMD A', regex=True)\
                                             .str.replace(r'.*AMD E-Series.*', 'AMD E-Series', regex=True)\
                                             .str.replace(r'.*AMD Ryzen.*', 'AMD Ryzen', regex=True)\
                                             .str.replace(r'.*AMD FX.*', 'AMD FX', regex=True)\
                                             .str.replace(r'.*Samsung Cortex.*', 'Other', regex=True)

test['CPU'].unique()

array(['Intel Core i', 'AMD A', 'Intel Core M', 'Intel Celeron',
       'Intel Pentium', 'Intel Atom', 'AMD E-Series', 'Other'],
      dtype=object)

In [56]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "CPU"
le = LabelEncoder()

test['CPU'] = le.fit_transform(test['CPU'])

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,Storage,GPU,Operating_System,Weight
0,5,1,8,5,6GB,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,2,3,9,5,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,4,8,5,12GB,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,10,1,5,5,4GB,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,6,1,8,0,6GB,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [57]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "RAM"
le = LabelEncoder()

test['RAM'] = le.fit_transform(test['RAM'])

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,Storage,GPU,Operating_System,Weight
0,5,1,8,5,6,1TB HDD,Intel HD Graphics 620,Windows,2.04kg
1,2,3,9,5,1,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,4,8,5,0,512GB SSD,Intel HD Graphics 620,Windows,2.19kg
3,10,1,5,5,4,128GB SSD,Intel HD Graphics 520,Windows,1.2kg
4,6,1,8,0,6,256GB SSD,AMD Radeon 530,Windows,2.2kg


In [58]:
# "Storage" no aporta valor, la eliminamos
test = test.drop('Storage', axis=1)

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Operating_System,Weight
0,5,1,8,5,6,Intel HD Graphics 620,Windows,2.04kg
1,2,3,9,5,1,Nvidia GeForce GTX 1050 Ti,Windows,2.99kg
2,3,4,8,5,0,Intel HD Graphics 620,Windows,2.19kg
3,10,1,5,5,4,Intel HD Graphics 520,Windows,1.2kg
4,6,1,8,0,6,AMD Radeon 530,Windows,2.2kg


In [59]:
# transformar valores de "GPU" para simplificar
test['GPU'] = test['GPU'].str.replace(r'.*Intel Iris Plus.*', 'Intel Iris Plus', regex=True)\
                                             .str.replace(r'.*Intel Iris Pro.*', 'Intel Iris Pro', regex=True)\
                                             .str.replace(r'.*Intel HD.*', 'Intel HD', regex=True)\
                                             .str.replace(r'.*Intel UHD.*', 'Intel UHD', regex=True)\
                                             .str.replace(r'.*Nvidia GeForce.*', 'Nvidia GeForce', regex=True)\
                                             .str.replace(r'.*Intel Iris Graphics.*', 'Intel Iris Graphics', regex=True)\
                                             .str.replace(r'.*AMD FirePro.*', 'AMD FirePro', regex=True)\
                                             .str.replace(r'.*Nvidia GTX.*', 'Nvidia GTX', regex=True)\
                                             .str.replace(r'.*Nvidia Quadro.*', 'Nvidia Quadro', regex=True)\
                                             .str.replace(r'.*Intel Graphics.*', 'Intel Graphics', regex=True)\
                                             .str.replace(r'.*AMD R.*', 'AMD R', regex=True)\
                                             .str.replace(r'.*ARM Mali.*', 'Other', regex=True)
                                             

test['GPU'].unique()

array(['Intel HD', 'Nvidia GeForce', 'AMD R', 'Nvidia Quadro',
       'AMD FirePro', 'Intel Iris Plus', 'Intel UHD', 'Other'],
      dtype=object)

In [60]:
# aplicar "LabelEncoder" para transformar de categorico a numerico la columna "GPU"
le = LabelEncoder()

test['GPU'] = le.fit_transform(test['GPU'])

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Operating_System,Weight
0,5,1,8,5,6,2,Windows,2.04kg
1,2,3,9,5,1,5,Windows,2.99kg
2,3,4,8,5,0,2,Windows,2.19kg
3,10,1,5,5,4,2,Windows,1.2kg
4,6,1,8,0,6,1,Windows,2.2kg


In [61]:
# observar columna "Operating_System"
test.Operating_System.value_counts()

Windows      287
No OS         14
Linux         14
Chrome OS      5
Mac OS         4
Android        1
Name: Operating_System, dtype: int64

In [62]:
# transformar valores de "Operating_System" para simplificar
os_test = {'Mac OS' : 'macOS', 'Chrome OS' : "Google's OS", 'Android' : "Google's OS"}

test['Operating_System'] = test['Operating_System'].replace(os_test)

test.Operating_System.value_counts()

Windows        287
No OS           14
Linux           14
Google's OS      6
macOS            4
Name: Operating_System, dtype: int64

In [63]:
# transformar de categorico a numerico la columna "Operating_System"
test = pd.get_dummies(test, columns=['Operating_System'], drop_first=True)

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Weight,Operating_System_Linux,Operating_System_No OS,Operating_System_Windows,Operating_System_macOS
0,5,1,8,5,6,2,2.04kg,0,0,1,0
1,2,3,9,5,1,5,2.99kg,0,0,1,0
2,3,4,8,5,0,2,2.19kg,0,0,1,0
3,10,1,5,5,4,2,1.2kg,0,0,1,0
4,6,1,8,0,6,1,2.2kg,0,0,1,0


In [64]:
# "Weight" no aporta valor, la eliminamos
test = test.drop('Weight', axis=1)

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Operating_System_Linux,Operating_System_No OS,Operating_System_Windows,Operating_System_macOS
0,5,1,8,5,6,2,0,0,1,0
1,2,3,9,5,1,5,0,0,1,0
2,3,4,8,5,0,2,0,0,1,0
3,10,1,5,5,4,2,0,0,1,0
4,6,1,8,0,6,1,0,0,1,0


In [65]:
# convertir columnas a enteros
test = test.astype(int)

test.head()

Unnamed: 0,Manufacturer,Category,Screen_Size,CPU,RAM,GPU,Operating_System_Linux,Operating_System_No OS,Operating_System_Windows,Operating_System_macOS
0,5,1,8,5,6,2,0,0,1,0
1,2,3,9,5,1,5,0,0,1,0
2,3,4,8,5,0,2,0,0,1,0
3,10,1,5,5,4,2,0,0,1,0
4,6,1,8,0,6,1,0,0,1,0


In [66]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Manufacturer              325 non-null    int64
 1   Category                  325 non-null    int64
 2   Screen_Size               325 non-null    int64
 3   CPU                       325 non-null    int64
 4   RAM                       325 non-null    int64
 5   GPU                       325 non-null    int64
 6   Operating_System_Linux    325 non-null    int64
 7   Operating_System_No OS    325 non-null    int64
 8   Operating_System_Windows  325 non-null    int64
 9   Operating_System_macOS    325 non-null    int64
dtypes: int64(10)
memory usage: 25.5 KB


## **Exportar transformaciones.**

In [67]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    int64  
 1   Category                  977 non-null    int64  
 2   Screen_Size               977 non-null    int64  
 3   CPU                       977 non-null    int64  
 4   RAM                       977 non-null    int64  
 5   GPU                       977 non-null    int64  
 6   Price                     977 non-null    float64
 7   Operating_System_Linux    977 non-null    int64  
 8   Operating_System_No OS    977 non-null    int64  
 9   Operating_System_Windows  977 non-null    int64  
 10  Operating_System_macOS    977 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 84.1 KB


In [68]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Manufacturer              325 non-null    int64
 1   Category                  325 non-null    int64
 2   Screen_Size               325 non-null    int64
 3   CPU                       325 non-null    int64
 4   RAM                       325 non-null    int64
 5   GPU                       325 non-null    int64
 6   Operating_System_Linux    325 non-null    int64
 7   Operating_System_No OS    325 non-null    int64
 8   Operating_System_Windows  325 non-null    int64
 9   Operating_System_macOS    325 non-null    int64
dtypes: int64(10)
memory usage: 25.5 KB


In [69]:
train.to_csv('../data/train_EDA_1.csv', index=False)
test.to_csv('../data/test_EDA_1.csv', index=False)