# **EDA.**

## **Librerías.**

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings  
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression     # clasificar  

from sklearn.preprocessing import StandardScaler    # estandarizar
from sklearn.preprocessing import LabelEncoder               

from sklearn.model_selection import train_test_split as tts     # dividir los datos en conjuntos de "train" y "test"


## **Cargar datos.**

In [2]:
# cargar datos "train"
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              325 non-null    object
 1   Model Name                325 non-null    object
 2   Category                  325 non-null    object
 3   Screen Size               325 non-null    object
 4   Screen                    325 non-null    object
 5   CPU                       325 non-null    object
 6   RAM                       325 non-null    object
 7    Storage                  325 non-null    object
 8   GPU                       325 non-null    object
 9   Operating System          325 non-null    object
 10  Operating System Version  291 non-null    object
 11  Weight                    325 non-null    object
dtypes: object(12)
memory usage: 30.6+ KB


## **Transformar datos.**

### **Train.**

In [5]:
train.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [7]:
train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Manufacturer,977.0,19.0,Dell,232.0,,,,,,,
Model Name,977.0,488.0,XPS 13,22.0,,,,,,,
Category,977.0,6.0,Notebook,549.0,,,,,,,
Screen Size,977.0,18.0,"15.6""",494.0,,,,,,,
Screen,977.0,38.0,Full HD 1920x1080,397.0,,,,,,,
CPU,977.0,106.0,Intel Core i5 7200U 2.5GHz,151.0,,,,,,,
RAM,977.0,8.0,8GB,483.0,,,,,,,
Storage,977.0,36.0,256GB SSD,318.0,,,,,,,
GPU,977.0,98.0,Intel HD Graphics 620,225.0,,,,,,,
Operating System,977.0,7.0,Windows,837.0,,,,,,,


In [8]:
# cambiar nombre columna
train.rename(columns=lambda x: x.strip() if isinstance(x, str) else x, inplace=True)

train.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price'],
      dtype='object')

In [9]:
# añadir guion bajo "_" en lugar de espacio en nombres de columnas
train.rename(columns=lambda x: x.replace(" ", "_") if isinstance(x, str) else x, inplace=True)

train.columns

Index(['Manufacturer', 'Model_Name', 'Category', 'Screen_Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating_System',
       'Operating_System_Version', 'Weight', 'Price'],
      dtype='object')

In [10]:
# observar columna "Operating System Version"
train['Operating_System_Version'].unique()

array([nan, '10', 'X', '10 S', '7'], dtype=object)

In [11]:
train.Operating_System_Version.value_counts()

Operating_System_Version
10      819
7        10
10 S      8
X         4
Name: count, dtype: int64

In [13]:
# "Operating System Version" no aporta valor, ademas teniendo nulos, la eliminamos
train = train.drop('Operating_System_Version', axis=1)

train.head()

Unnamed: 0,Manufacturer,Model_Name,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,Operating_System,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,16037611.2


In [17]:
# observar columna "Category"
train.Category.value_counts()

Category
Notebook              549
Ultrabook             152
Gaming                150
2 in 1 Convertible     85
Workstation            24
Netbook                17
Name: count, dtype: int64

In [18]:
# transformar valores de "Category" para simplificar
category = train.Category.value_counts()[:3]

train['Category']= train['Category'].apply(lambda x: x if x in category else 'Other')

train.Category.value_counts()

Category
Notebook     549
Ultrabook    152
Gaming       150
Other        126
Name: count, dtype: int64

In [None]:
# eliminar columnas con variables apenas significativas que bajo mi perspectiva, no aportan valor a la prediccion
train = train.drop('Screen', 'Weight', axis=1)

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Manufacturer      977 non-null    object 
 1   Model_Name        977 non-null    object 
 2   Category          977 non-null    object 
 3   Screen_Size       977 non-null    object 
 4   Screen            977 non-null    object 
 5   CPU               977 non-null    object 
 6   RAM               977 non-null    object 
 7   Storage           977 non-null    object 
 8   GPU               977 non-null    object 
 9   Operating_System  977 non-null    object 
 10  Weight            977 non-null    object 
 11  Price             977 non-null    float64
dtypes: float64(1), object(11)
memory usage: 91.7+ KB


### **Test.**