# Part 1 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

In [2]:
# Raw GitHub URL of the tab-separated Chipotle file that the next cell reads.
url_chipotle = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

In [5]:
# Fetch the file at url_chipotle and parse it as tab-delimited text.
chipo = pd.read_csv(url_chipotle, delimiter='\t')

In [6]:
# Show the loaded frame (rendered as a truncated preview of head and tail rows).
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


### Step 3. Assign it to a variable called chipo.

In [7]:
# `chipo` was already assigned when the file was read in Step 2; display it to confirm.
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


### Step 4. See the first 10 entries

In [8]:
# First ten rows of the dataset.
chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [10]:
# Solution 1
# .shape is (rows, columns); the first element is the number of observations.
chipo.shape


(4622, 5)

In [11]:
# Solution 2
# .info() reports the entry count on its RangeIndex line, plus per-column
# non-null counts and dtypes.

chipo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


### Step 6. What is the number of columns in the dataset?

In [15]:
# Number of columns, counted from the column index.
len(chipo.columns)

5

### Step 7. Print the name of all the columns.

In [13]:
# Column labels of the frame, returned as a pandas Index.
chipo.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [16]:
# Row index of the frame; read_csv was called without index_col, so this is
# the default positional RangeIndex.
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most-ordered item? 

In [26]:
# Total units ordered per item: sum the quantity column grouped by item name.
total_quantity = chipo['quantity'].groupby(chipo['item_name']).sum()

In [27]:
# idxmax returns the index label (here: the item name) holding the largest total.
most_ordered_item = total_quantity.idxmax()

In [28]:
# Item name with the highest summed quantity.
most_ordered_item

'Chicken Bowl'

### Step 10. For the most-ordered item, how many items were ordered?

In [29]:
# Largest per-item total in total_quantity, i.e. the units ordered of the
# most-ordered item.
total_quantity.max()

761

### Step 11. What was the most ordered item in the choice_description column?

In [30]:
# Total units ordered per choice_description value.
total_quantity_desc = chipo['quantity'].groupby(chipo['choice_description']).sum()

In [31]:
# Index label (the choice_description string) with the largest summed quantity.
most_ordered_item_desc = total_quantity_desc.idxmax()

In [32]:
# choice_description value with the highest summed quantity.
most_ordered_item_desc

'[Diet Coke]'

### Step 12. How many items were ordered in total?

In [34]:
# Number of distinct item names (unique values) - not the total quantity ordered.
chipo['item_name'].nunique()

50

In [35]:
# Sum of the quantity column across all rows = total number of items ordered.
total_items_ordered = chipo['quantity'].sum()

In [36]:
# Total number of items ordered (sum of the quantity column).
total_items_ordered

4972

### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [43]:
# Per-column dtypes; item_price is still `object` here because its values are
# text such as '$2.39'.
chipo.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

#### Step 13.b. Create a lambda function and change the type of item price

In [50]:
import re

In [52]:
# Convert a price such as '$2.39 ' to a float.
# - strings: strip surrounding whitespace, drop '$' and ',' characters, parse;
# - missing values (None/NaN): return None, as before;
# - values that are already numeric: returned as float, so re-running the
#   conversion cell on an already-converted column does not raise.
# The pattern is a raw string: '\$' inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about, and '$' needs
# no escaping inside a character class anyway.
remove_currency_and_convert = lambda x: (
    float(re.sub(r'[$,]', '', x.strip())) if isinstance(x, str)
    else (float(x) if pd.notna(x) else None)
)

In [53]:
# Replace the text prices with their numeric values, element by element.
chipo['item_price'] = chipo['item_price'].map(remove_currency_and_convert)

#### Step 13.c. Check the item price type

In [55]:
# Confirm the conversion: the column dtype is float64 after the previous cell.
chipo['item_price'].dtype

dtype('float64')

### Step 14. How much was the revenue for the period in the dataset?

In [58]:
# Revenue = sum of item_price over all rows.
# NOTE(review): this assumes item_price is already the total for the row
# (i.e. it already accounts for quantity) - confirm against the data.
total_revenue = chipo['item_price'].sum()

In [59]:
# Summed item_price for the whole dataset.
total_revenue

34500.16

### Step 15. How many orders were made in the period?

In [60]:
# Number of distinct order ids = number of orders.
chipo['order_id'].nunique()

1834

### Step 16. What is the average revenue amount per order?

In [66]:
# Solution 1
# Mean item_price of the rows belonging to each order (one value per order_id).
# NOTE(review): this is a per-order average line price, not a single
# average-revenue-per-order figure.

revenue_order_average = chipo['item_price'].groupby(chipo['order_id']).mean()

In [67]:
# Round to two decimals for display.
revenue_order_average.round(2)

order_id
1        2.89
2       16.98
3        6.34
4       10.50
5        6.85
        ...  
1830    11.50
1831     4.30
1832     6.60
1833    11.75
1834     9.58
Name: item_price, Length: 1834, dtype: float64

In [68]:
# Solution 2
# Sum item_price within each order to get that order's revenue, then average
# those per-order totals into a single figure.
chipo.groupby('order_id')['item_price'].sum().mean()

18.811428571428568

### Step 17. How many different items are sold?

In [70]:
# Number of distinct item names sold.
chipo['item_name'].nunique()

50

# Part 2 - Filtering and Sorting Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [71]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

In [72]:
# Raw GitHub URL of the tab-separated Chipotle file that the next cell reads.
url_chipotle = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

In [73]:
# Fetch the file at url_chipotle and parse it as tab-delimited text.
chipo = pd.read_csv(url_chipotle, delimiter='\t')

### Step 3. Assign it to a variable called chipo.

In [74]:
# `chipo` was assigned by the read_csv cell just before this one; display it to confirm.
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


### Step 4. How many products cost more than $10.00?

In [76]:
import re

In [77]:
# Convert a price such as '$2.39 ' to a float.
# - strings: strip surrounding whitespace, drop '$' and ',' characters, parse;
# - missing values (None/NaN): return None, as before;
# - values that are already numeric: returned as float, so re-running the
#   conversion cell on an already-converted column does not raise.
# The pattern is a raw string: '\$' inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about, and '$' needs
# no escaping inside a character class anyway.
remove_currency_and_convert = lambda x: (
    float(re.sub(r'[$,]', '', x.strip())) if isinstance(x, str)
    else (float(x) if pd.notna(x) else None)
)

In [78]:
# Replace the text prices with their numeric values, element by element.
chipo['item_price'] = chipo['item_price'].map(remove_currency_and_convert)

In [79]:
# Display the frame again: item_price now holds plain numbers instead of
# '$'-prefixed text.
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",8.75


In [80]:
# Keep the rows whose item_price exceeds 10.
# NOTE(review): this selects order rows, so its length counts rows, not
# distinct products.
over_ten_products = chipo[chipo['item_price'].gt(10)]

In [82]:
# Row count of the filtered frame.
over_ten_products.shape[0]

1130

### Step 5. What is the price of each item? 
###### Print a DataFrame with only two columns: `item_name` and `item_price`

In [84]:
# Two-column view of the data: every row, item_name and item_price only.
chipo_2 = chipo.loc[:, ['item_name', 'item_price']]

In [85]:
# Display the item_name / item_price frame (still one row per original row).
chipo_2

Unnamed: 0,item_name,item_price
0,Chips and Fresh Tomato Salsa,2.39
1,Izze,3.39
2,Nantucket Nectar,3.39
3,Chips and Tomatillo-Green Chili Salsa,2.39
4,Chicken Bowl,16.98
...,...,...
4617,Steak Burrito,11.75
4618,Steak Burrito,11.75
4619,Chicken Salad Bowl,11.25
4620,Chicken Salad Bowl,8.75


In [86]:
# Keep only the first row seen for each item_name.
# NOTE(review): the retained price is whatever that first row carried, e.g.
# Chicken Bowl keeps 16.98 from a row with quantity 2.
chipo_2_nd = chipo_2.loc[~chipo_2.duplicated(subset='item_name')]

In [87]:
# Display the de-duplicated frame: one row per item_name.
chipo_2_nd

Unnamed: 0,item_name,item_price
0,Chips and Fresh Tomato Salsa,2.39
1,Izze,3.39
2,Nantucket Nectar,3.39
3,Chips and Tomatillo-Green Chili Salsa,2.39
4,Chicken Bowl,16.98
6,Side of Chips,1.69
7,Steak Burrito,11.75
8,Steak Soft Tacos,9.25
10,Chips and Guacamole,4.45
11,Chicken Crispy Tacos,8.75


### Step 6. Sort by the name of the item

In [88]:
# Order the one-row-per-item frame alphabetically by item name (ascending).
chipo_2_nd_sorted = chipo_2_nd.sort_values('item_name', ascending=True)

In [89]:
# Display the frame sorted by item_name; the original row labels are kept.
chipo_2_nd_sorted

Unnamed: 0,item_name,item_price
298,6 Pack Soft Drink,6.49
39,Barbacoa Bowl,11.75
21,Barbacoa Burrito,8.99
168,Barbacoa Crispy Tacos,11.75
1229,Barbacoa Salad Bowl,11.89
56,Barbacoa Soft Tacos,9.25
34,Bottled Water,1.09
445,Bowl,22.2
510,Burrito,7.4
18,Canned Soda,2.18


### Step 7. What was the quantity of the most expensive item ordered?

In [95]:
# Three-column view of the data: item_name, item_price and quantity for every row.
chipo_3 = chipo.loc[:, ['item_name', 'item_price', 'quantity']]

In [96]:
# Display the item_name / item_price / quantity frame.
chipo_3

Unnamed: 0,item_name,item_price,quantity
0,Chips and Fresh Tomato Salsa,2.39,1
1,Izze,3.39,1
2,Nantucket Nectar,3.39,1
3,Chips and Tomatillo-Green Chili Salsa,2.39,1
4,Chicken Bowl,16.98,2
...,...,...,...
4617,Steak Burrito,11.75,1
4618,Steak Burrito,11.75,1
4619,Chicken Salad Bowl,11.25,1
4620,Chicken Salad Bowl,8.75,1


In [97]:
# Keep only the first row seen for each item_name.
chipo_3_nd = chipo_3.loc[~chipo_3.duplicated(subset='item_name')]

In [98]:
# Display the de-duplicated frame: one row per item_name, with its price and quantity.
chipo_3_nd

Unnamed: 0,item_name,item_price,quantity
0,Chips and Fresh Tomato Salsa,2.39,1
1,Izze,3.39,1
2,Nantucket Nectar,3.39,1
3,Chips and Tomatillo-Green Chili Salsa,2.39,1
4,Chicken Bowl,16.98,2
6,Side of Chips,1.69,1
7,Steak Burrito,11.75,1
8,Steak Soft Tacos,9.25,1
10,Chips and Guacamole,4.45,1
11,Chicken Crispy Tacos,8.75,1


In [99]:
# Sanity check: both de-duplicated frames should have the same number of rows.
chipo_2_nd.shape[0] == chipo_3_nd.shape[0]

True

In [104]:
# Name of the item on the row with the highest item_price in the
# one-row-per-item frame.
# NOTE(review): only the first row kept for each item is compared, not every
# price in the full dataset.
max_price_item = chipo_3_nd.at[chipo_3_nd['item_price'].idxmax(), 'item_name']

In [105]:
# Steak Salad Bowl is the most expensive individual item.
max_price_item

'Steak Salad Bowl'

In [109]:
# Count the rows of the full chipo frame whose item_name equals the most
# expensive item found earlier (max_price_item), instead of a hard-coded copy
# of that name, so the count keeps following the data if it changes.
# This counts rows; it does not sum the quantity column.
steak_count = (chipo['item_name'] == max_price_item).sum()

In [110]:
# Steak Salad Bowl appears 29 times in the original chipo frame.
steak_count

29

### Step 8. How many times was a Veggie Salad Bowl ordered?

In [111]:
# Number of rows whose item_name is exactly 'Veggie Salad Bowl'.
veggie_count = chipo['item_name'].eq('Veggie Salad Bowl').sum()

In [112]:
# Count of rows with item_name 'Veggie Salad Bowl'.
veggie_count

18

### Step 9. How many times did someone order more than one Canned Soda?

In [114]:
# Item name to match against the item_name column in the next cell.
canned_soda = "Canned Soda"

In [115]:
# Count the rows that are a Canned Soda line AND have a quantity above one.
multiple_canned_soda = (chipo['item_name'].eq(canned_soda) & chipo['quantity'].gt(1)).sum()

In [116]:
# Number of rows where more than one Canned Soda was ordered.
multiple_canned_soda

20