# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [37]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [38]:
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'

# because is tsv, so the data is spperated by tab
chipo = pd.read_csv(url, sep = '\t')

### Step 4. See the first 10 entries

In [39]:
chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [40]:
# Solution 1: .shape
print(f'row * column: {chipo.shape}')
print(f'observation: {chipo.shape[0]}')

row * column: (4622, 5)
observation: 4622


In [41]:
# Detail Summary: .info() ->  data types, non-null counts, and memory usage
chipo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


### Step 6. What is the number of columns in the dataset?

In [42]:
chipo.shape[1]

5

### Step 7. Print the name of all the columns.

In [43]:
chipo.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [44]:
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most-ordered item? 

In [45]:
item_ct = chipo.groupby('item_name')['quantity'].agg(np.sum)
item_ct = item_ct.sort_values(ascending=False)
item_ct

  item_ct = chipo.groupby('item_name')['quantity'].agg(np.sum)


item_name
Chicken Bowl                             761
Chicken Burrito                          591
Chips and Guacamole                      506
Steak Burrito                            386
Canned Soft Drink                        351
Chips                                    230
Steak Bowl                               221
Bottled Water                            211
Chips and Fresh Tomato Salsa             130
Canned Soda                              126
Chicken Salad Bowl                       123
Chicken Soft Tacos                       120
Side of Chips                            110
Veggie Burrito                            97
Barbacoa Burrito                          91
Veggie Bowl                               87
Carnitas Bowl                             71
Barbacoa Bowl                             66
Carnitas Burrito                          60
Steak Soft Tacos                          56
6 Pack Soft Drink                         55
Chips and Tomatillo Red Chili Salsa       50


### Step 10. For the most-ordered item, how many items were ordered?

In [46]:
item_ct.iloc[0]


761

### Step 11. What was the most ordered item in the choice_description column?

In [47]:
choice_ct = chipo.groupby('choice_description')['quantity'].agg(np.sum).sort_values(ascending=False)
choice_ct.head(1)


  choice_ct = chipo.groupby('choice_description')['quantity'].agg(np.sum).sort_values(ascending=False)


choice_description
[Diet Coke]    159
Name: quantity, dtype: int64

### Step 12. How many items were orderd in total?

In [48]:
chipo['quantity'].sum()

4972

### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [49]:
chipo['item_price'].dtype

dtype('O')

#### Step 13.b. Create a lambda function and change the type of item price

In [50]:
chipo['item_price'] = chipo['item_price'].apply(lambda x: float(x[1:]))

#### Step 13.c. Check the item price type

In [51]:
chipo['item_price'].dtype

dtype('float64')

### Step 14. How much was the revenue for the period in the dataset?

In [52]:
rev = chipo['quantity'] * chipo ['item_price']
chipo['revenue'] = rev
total_rev = chipo['revenue'].sum()

print(chipo.head())
print(total_rev)

   order_id  quantity                              item_name  \
0         1         1           Chips and Fresh Tomato Salsa   
1         1         1                                   Izze   
2         1         1                       Nantucket Nectar   
3         1         1  Chips and Tomatillo-Green Chili Salsa   
4         2         2                           Chicken Bowl   

                                  choice_description  item_price  revenue  
0                                                NaN        2.39     2.39  
1                                       [Clementine]        3.39     3.39  
2                                            [Apple]        3.39     3.39  
3                                                NaN        2.39     2.39  
4  [Tomatillo-Red Chili Salsa (Hot), [Black Beans...       16.98    33.96  
39237.02


### Step 15. How many orders were made in the period?

In [53]:
order_ct = chipo['order_id'].value_counts() # Counts the frequency of each unique value in a Series
total_ct = order_ct.count()
test = chipo['order_id'].nunique()
print(order_ct.head())
print(total_ct)
print(test)

order_id
926     23
1483    14
205     12
759     11
1786    11
Name: count, dtype: int64
1834
1834


### Step 16. What is the average revenue amount per order?

In [54]:
# Solution 1
avg_per_order = total_rev // total_ct
print(avg_per_order)


21.0


In [55]:
# Solution 2
avg_per_order_2 = chipo.groupby('order_id')['revenue'].sum().mean()
print(avg_per_order_2)

21.39423118865867


### Step 17. How many different items are sold?

In [56]:
item_dis_ct = chipo['item_name'].nunique()
print(item_dis_ct)

50


AttributeError: 'int' object has no attribute 'dtype'