#### 1. Preparación del Entorno

In [19]:
# Import the Google Colab helper used to work with Google Drive.
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Import pandas under the alias `pd`, which the cells below use.
import pandas as pd

#### 2. Cargar los Datos

In [21]:
# Read the 'retail_sales_dataset.csv' dataset from the mounted Drive folder.
# The file location is kept in `path`; the loaded table is stored in `df`.
path = '/content/drive/MyDrive/DATA/retail_sales_dataset.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [22]:
# Preview the first 10 rows of the DataFrame.
df.head(n=10)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100
5,6,2023-04-25,CUST006,Female,45,Beauty,1,30,30
6,7,2023-03-13,CUST007,Male,46,Clothing,2,25,50
7,8,2023-02-22,CUST008,Male,30,Electronics,4,25,100
8,9,2023-12-13,CUST009,Male,63,Electronics,2,300,600
9,10,2023-10-07,CUST010,Female,52,Clothing,4,50,200


#### 3. Exploración Inicial de los Datos

In [23]:
# Show the last 5 rows of the DataFrame.
df.tail(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
995,996,2023-05-16,CUST996,Male,62,Clothing,1,50,50
996,997,2023-11-17,CUST997,Male,52,Beauty,3,30,90
997,998,2023-10-29,CUST998,Female,23,Beauty,4,25,100
998,999,2023-12-05,CUST999,Female,36,Electronics,3,50,150
999,1000,2023-04-12,CUST1000,Male,47,Electronics,4,30,120


In [24]:
# Use info() to get a general overview of the DataFrame, including the
# number of entries, column names, data types and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB


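* Se verifica que no hay valores nulos: las 9 columnas tienen 1000 valores no nulos.
* Se verifica que hay 1000 filas y 9 columnas.
* 4 columnas son de tipo object.
* 5 columnas son de tipo int64.
* Memoria usada: 70.4+ KB (el "+" indica que el uso real puede ser mayor debido a las columnas de tipo object).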


In [25]:
# Build descriptive statistics with describe() and show them rounded to
# two decimal places.
df.describe().round(decimals=2)

Unnamed: 0,Transaction ID,Age,Quantity,Price per Unit,Total Amount
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,41.39,2.51,179.89,456.0
std,288.82,13.68,1.13,189.68,560.0
min,1.0,18.0,1.0,25.0,25.0
25%,250.75,29.0,1.0,30.0,60.0
50%,500.5,42.0,3.0,50.0,135.0
75%,750.25,53.0,4.0,300.0,900.0
max,1000.0,64.0,4.0,500.0,2000.0


#### 4. Inspección de los Datos

In [26]:
# Inspect the data type of each column through the dtypes attribute.
df.dtypes

Unnamed: 0,0
Transaction ID,int64
Date,object
Customer ID,object
Gender,object
Age,int64
Product Category,object
Quantity,int64
Price per Unit,int64
Total Amount,int64


* Se verifica que la columna Date está definida como tipo object (texto); para análisis por fecha convendría convertirla a datetime.
* Se verifica que las demás columnas tienen tipos de datos correctos.

In [27]:
# Replace the spaces in the column names with underscores so the columns can
# be referenced as plain identifiers (for example inside query() strings).
# The pattern is a literal space, so regex matching is explicitly disabled.
df.columns = df.columns.str.replace(pat=" ", repl="_", regex=False)

In [28]:
# Count how many times each value appears in the Product_Category column
# using value_counts().
df["Product_Category"].value_counts()

Unnamed: 0_level_0,count
Product_Category,Unnamed: 1_level_1
Clothing,351
Electronics,342
Beauty,307


* Se verifica que hay 3 categorías de productos: Clothing, Electronics y Beauty.

In [29]:
# Show all the distinct values of the Customer_ID column using unique().
df['Customer_ID'].unique()


array(['CUST001', 'CUST002', 'CUST003', 'CUST004', 'CUST005', 'CUST006',
       'CUST007', 'CUST008', 'CUST009', 'CUST010', 'CUST011', 'CUST012',
       'CUST013', 'CUST014', 'CUST015', 'CUST016', 'CUST017', 'CUST018',
       'CUST019', 'CUST020', 'CUST021', 'CUST022', 'CUST023', 'CUST024',
       'CUST025', 'CUST026', 'CUST027', 'CUST028', 'CUST029', 'CUST030',
       'CUST031', 'CUST032', 'CUST033', 'CUST034', 'CUST035', 'CUST036',
       'CUST037', 'CUST038', 'CUST039', 'CUST040', 'CUST041', 'CUST042',
       'CUST043', 'CUST044', 'CUST045', 'CUST046', 'CUST047', 'CUST048',
       'CUST049', 'CUST050', 'CUST051', 'CUST052', 'CUST053', 'CUST054',
       'CUST055', 'CUST056', 'CUST057', 'CUST058', 'CUST059', 'CUST060',
       'CUST061', 'CUST062', 'CUST063', 'CUST064', 'CUST065', 'CUST066',
       'CUST067', 'CUST068', 'CUST069', 'CUST070', 'CUST071', 'CUST072',
       'CUST073', 'CUST074', 'CUST075', 'CUST076', 'CUST077', 'CUST078',
       'CUST079', 'CUST080', 'CUST081', 'CUST082', 

#### 5. Filtrado de Datos

In [30]:
# List the current column names (spaces were replaced by underscores earlier).
df.columns

Index(['Transaction_ID', 'Date', 'Customer_ID', 'Gender', 'Age',
       'Product_Category', 'Quantity', 'Price_per_Unit', 'Total_Amount'],
      dtype='object')

In [31]:
# Keep only the rows whose sales (Total_Amount) are greater than 50.
df.loc[df["Total_Amount"].gt(50)]

Unnamed: 0,Transaction_ID,Date,Customer_ID,Gender,Age,Product_Category,Quantity,Price_per_Unit,Total_Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100
7,8,2023-02-22,CUST008,Male,30,Electronics,4,25,100
...,...,...,...,...,...,...,...,...,...
993,994,2023-12-18,CUST994,Female,51,Beauty,2,500,1000
996,997,2023-11-17,CUST997,Male,52,Beauty,3,30,90
997,998,2023-10-29,CUST998,Female,23,Beauty,4,25,100
998,999,2023-12-05,CUST999,Female,36,Electronics,3,50,150


* 783 registros con ventas sobre 50

In [32]:
# Keep only the rows whose unit price (Price_per_Unit) is lower than 0.5.
df.query("Price_per_Unit < 0.5")

Unnamed: 0,Transaction_ID,Date,Customer_ID,Gender,Age,Product_Category,Quantity,Price_per_Unit,Total_Amount


* Se verifica que no hay entradas con 'Price_per_Unit' menor a 0.5 (el precio mínimo del dataset es 25).

In [33]:
# Exercise: using query(), filter the DataFrame to the rows where the product
# is "Manzanas" (apples) and the sales are greater than 30.
#
# The product "Manzanas" is not present in this dataset, so the 'Electronics'
# category is used in its place.
#
# Inside query(), `&` takes the precedence of `and`, so the two comparisons
# are combined as written without extra parentheses.
res_q = df.query(expr='Product_Category == "Electronics"  & Total_Amount > 30')
print(res_q)

     Transaction_ID        Date Customer_ID  Gender  Age Product_Category  \
7                 8  2023-02-22     CUST008    Male   30      Electronics   
8                 9  2023-12-13     CUST009    Male   63      Electronics   
12               13  2023-08-05     CUST013    Male   22      Electronics   
14               15  2023-01-16     CUST015  Female   42      Electronics   
17               18  2023-04-30     CUST018  Female   47      Electronics   
..              ...         ...         ...     ...  ...              ...   
984             985  2023-05-30     CUST985  Female   19      Electronics   
991             992  2023-08-21     CUST992  Female   57      Electronics   
992             993  2023-02-06     CUST993  Female   48      Electronics   
998             999  2023-12-05     CUST999  Female   36      Electronics   
999            1000  2023-04-12    CUST1000    Male   47      Electronics   

     Quantity  Price_per_Unit  Total_Amount  
7           4              25

* Se verifica que existen 308 entradas de la categoría 'Electronics' con 'Total_Amount' mayor a 30.

#### 6. Slicing de Datos

In [34]:
# Select and show only the Product_Category and Total_Amount columns
# (all rows are kept).
df.loc[:, ["Product_Category", "Total_Amount"]]


Unnamed: 0,Product_Category,Total_Amount
0,Beauty,150
1,Clothing,1000
2,Electronics,30
3,Clothing,500
4,Beauty,100
...,...,...
995,Clothing,50
996,Beauty,90
997,Beauty,100
998,Electronics,150


In [35]:
# Using loc[], select and show the rows labelled 5 through 10 (both ends
# included) and the Product_Category and Customer_ID columns.
df.loc[5:10, ["Product_Category", 'Customer_ID']]

Unnamed: 0,Product_Category,Customer_ID
5,Beauty,CUST006
6,Clothing,CUST007
7,Electronics,CUST008
8,Electronics,CUST009
9,Clothing,CUST010
10,Clothing,CUST011


In [36]:
# Using iloc[], take the first 5 rows and the first 3 columns of the
# DataFrame by position, then print the result.
res_iloc = df.iloc[:5, :3]
print(res_iloc)

   Transaction_ID        Date Customer_ID
0               1  2023-11-24     CUST001
1               2  2023-02-27     CUST002
2               3  2023-01-13     CUST003
3               4  2023-05-21     CUST004
4               5  2023-05-06     CUST005
