# Getting and Knowing Data

In [2]:
import pandas as pd
import numpy as np
import os

## Step 1: Import Dataset

In [3]:
# Load the tab-separated order data from the current working directory.
# The delimiter is passed by keyword: recent pandas releases accept only the
# file path positionally, so a bare '\t' second argument raises TypeError.
df = pd.read_csv(os.path.join(os.getcwd(), "chipotle.tsv"), sep="\t")

In [5]:
# Preview the first ten rows of the freshly loaded frame.
df.head(n=10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


## Step 2: Data Overview

In [22]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(4622, 5)

In [1]:
# Per-column summary of the frame. This needs `df` from the load cell in
# Step 1: executed on a fresh kernel before that cell, it raises NameError.
df.info()

NameError: name 'df' is not defined

In [24]:
# Column labels as a plain Python list.
df.columns.tolist()

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

In [25]:
# Row labels of the frame.
df.index

RangeIndex(start=0, stop=4622, step=1)

In [27]:
# Summary statistics for every column; include="all" adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones.
df.describe(include="all")

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
count,4622.0,4622.0,4622,3376,4622
unique,,,50,1043,78
top,,,Chicken Bowl,[Diet Coke],$8.75
freq,,,726,134,730
mean,927.254868,1.075725,,,
std,528.890796,0.410186,,,
min,1.0,1.0,,,
25%,477.25,1.0,,,
50%,926.0,1.0,,,
75%,1393.0,1.0,,,


### loc vs iloc

In [28]:
# Preview the first five rows as a reference for the selections that follow.
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [42]:
# Label-based selection: rows where two units of "Chips and Fresh Tomato Salsa"
# were ordered, restricted to three columns.
df.loc[
    (df["quantity"] == 2) & (df["item_name"] == "Chips and Fresh Tomato Salsa"),
    ["order_id", "quantity", "item_name"],
]

Unnamed: 0,order_id,quantity,item_name
1882,759,2,Chips and Fresh Tomato Salsa
2267,912,2,Chips and Fresh Tomato Salsa
2729,1083,2,Chips and Fresh Tomato Salsa


In [44]:
# Position 9 is passed as a list, so the result is a one-row DataFrame
# rather than a Series.
df.iloc[[9]]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [45]:
# Positional slice; the stop is exclusive, so this selects positions 3 through 10.
df.iloc[3:11]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25
10,5,1,Chips and Guacamole,,$4.45


In [63]:
# Rows at positions 3 and 4, every column except the last one.
df.iloc[3:5, :-1]

Unnamed: 0,order_id,quantity,item_name,choice_description
3,1,1,Chips and Tomatillo-Green Chili Salsa,
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans..."


### Data Manipulation

In [47]:
# Storage type of the price column before any cleaning.
df["item_price"].dtype

dtype('O')

In [64]:
# Convert the "$"-prefixed price strings to floats with vectorised string ops.
# astype(str) first keeps the cell safe to re-run: once the column already
# holds floats, calling .replace on each value would raise AttributeError.
# regex=False treats "$" as a literal character, not the end-of-string anchor.
df["item_price"] = (
    df["item_price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.strip()
    .astype(float)
)

In [65]:
# Check that item_price now displays as a number without the "$" prefix.
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


In [67]:
# Add a per-row total: quantity multiplied by item_price.
# NOTE(review): item_price may already be the total for the row rather than a
# unit price (e.g. the quantity-2 Chicken Bowl row is priced at 16.98) — confirm
# against the data source; if so, this product overstates revenue.
df["total_price"] = df["quantity"] * df["item_price"]

In [68]:
# Preview the frame with the new total_price column.
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,2.39
1,1,1,Izze,[Clementine],3.39,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,33.96


In [69]:
# Sum of total_price over every row.
df["total_price"].sum()

39237.02

In [75]:
# Total units ordered for each menu item.
c = df.groupby("item_name")["quantity"].sum()
# The ten items with the largest totals, highest first.
c.sort_values(ascending=False).head(10)

item_name
Chicken Bowl                    761
Chicken Burrito                 591
Chips and Guacamole             506
Steak Burrito                   386
Canned Soft Drink               351
Chips                           230
Steak Bowl                      221
Bottled Water                   211
Chips and Fresh Tomato Salsa    130
Canned Soda                     126
Name: quantity, dtype: int64

In [76]:
# Number of distinct menu items.
df["item_name"].nunique()

50

In [77]:
# Number of rows per menu item, most frequent first.
df["item_name"].value_counts()

Chicken Bowl                             726
Chicken Burrito                          553
Chips and Guacamole                      479
Steak Burrito                            368
Canned Soft Drink                        301
Chips                                    211
Steak Bowl                               211
Bottled Water                            162
Chicken Soft Tacos                       115
Chips and Fresh Tomato Salsa             110
Chicken Salad Bowl                       110
Canned Soda                              104
Side of Chips                            101
Veggie Burrito                            95
Barbacoa Burrito                          91
Veggie Bowl                               85
Carnitas Bowl                             68
Barbacoa Bowl                             66
Carnitas Burrito                          59
Steak Soft Tacos                          55
6 Pack Soft Drink                         54
Chips and Tomatillo Red Chili Salsa       48
Chicken Cr