diff --git a/01_Getting_&_Knowing_Your_Data/Chipotle/Exercises_solved.ipynb b/01_Getting_&_Knowing_Your_Data/Chipotle/Exercises_solved.ipynb new file mode 100644 index 000000000..10fee43b5 --- /dev/null +++ b/01_Getting_&_Knowing_Your_Data/Chipotle/Exercises_solved.ipynb @@ -0,0 +1,787 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ex2 - Getting and Knowing your Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we are going to pull data directly from the internet.\n", + "Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.\n", + "\n", + "### Step 1. Import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Assign it to a variable called chipo." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# it's tsv file (tab separated file)\n", + "# specify sep = \"\\t\" in argument of read_csv\n", + "chipo = pd.read_csv(\"https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv\", sep=\"\\t\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. See the first 10 entries" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
531Chicken Bowl[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...$10.98
631Side of ChipsNaN$1.69
741Steak Burrito[Tomatillo Red Chili Salsa, [Fajita Vegetables...$11.75
841Steak Soft Tacos[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...$9.25
951Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...$9.25
\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "5 3 1 Chicken Bowl \n", + "6 3 1 Side of Chips \n", + "7 4 1 Steak Burrito \n", + "8 4 1 Steak Soft Tacos \n", + "9 5 1 Steak Burrito \n", + "\n", + " choice_description item_price \n", + "0 NaN $2.39 \n", + "1 [Clementine] $3.39 \n", + "2 [Apple] $3.39 \n", + "3 NaN $2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 \n", + "5 [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... $10.98 \n", + "6 NaN $1.69 \n", + "7 [Tomatillo Red Chili Salsa, [Fajita Vegetables... $11.75 \n", + "8 [Tomatillo Green Chili Salsa, [Pinto Beans, Ch... $9.25 \n", + "9 [Fresh Tomato Salsa, [Rice, Black Beans, Pinto... $9.25 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. What is the number of observations in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4622" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Solution 1\n", + "chipo.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4622" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Solution 2\n", + "len(chipo)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. What is the number of columns in the dataset?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo.shape[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. Print the name of all the columns." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['order_id', 'quantity', 'item_name', 'choice_description',\n", + " 'item_price'],\n", + " dtype='object')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. How is the dataset indexed?" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=4622, step=1)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# integer in range(0, 4622)\n", + "chipo.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9. Which was the most-ordered item? 
" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "item_name\n", + "Chicken Bowl 761\n", + "Chicken Burrito 591\n", + "Chips and Guacamole 506\n", + "Steak Burrito 386\n", + "Canned Soft Drink 351\n", + "Chips 230\n", + "Steak Bowl 221\n", + "Bottled Water 211\n", + "Chips and Fresh Tomato Salsa 130\n", + "Canned Soda 126\n", + "Chicken Salad Bowl 123\n", + "Chicken Soft Tacos 120\n", + "Side of Chips 110\n", + "Veggie Burrito 97\n", + "Barbacoa Burrito 91\n", + "Veggie Bowl 87\n", + "Carnitas Bowl 71\n", + "Barbacoa Bowl 66\n", + "Carnitas Burrito 60\n", + "Steak Soft Tacos 56\n", + "6 Pack Soft Drink 55\n", + "Chips and Tomatillo Red Chili Salsa 50\n", + "Chicken Crispy Tacos 50\n", + "Chips and Tomatillo Green Chili Salsa 45\n", + "Carnitas Soft Tacos 40\n", + "Steak Crispy Tacos 36\n", + "Chips and Tomatillo-Green Chili Salsa 33\n", + "Steak Salad Bowl 31\n", + "Nantucket Nectar 29\n", + "Chips and Tomatillo-Red Chili Salsa 25\n", + "Barbacoa Soft Tacos 25\n", + "Chips and Roasted Chili Corn Salsa 23\n", + "Izze 20\n", + "Veggie Salad Bowl 18\n", + "Chips and Roasted Chili-Corn Salsa 18\n", + "Barbacoa Crispy Tacos 12\n", + "Barbacoa Salad Bowl 10\n", + "Chicken Salad 9\n", + "Carnitas Crispy Tacos 8\n", + "Veggie Soft Tacos 8\n", + "Burrito 6\n", + "Veggie Salad 6\n", + "Carnitas Salad Bowl 6\n", + "Bowl 4\n", + "Steak Salad 4\n", + "Salad 2\n", + "Crispy Tacos 2\n", + "Chips and Mild Fresh Tomato Salsa 1\n", + "Carnitas Salad 1\n", + "Veggie Crispy Tacos 1\n", + "Name: quantity, dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_chipo = chipo.groupby(\"item_name\")[\"quantity\"].sum()\n", + "grouped_chipo.sort_values(ascending = False, inplace=True)\n", + "grouped_chipo" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ 
+ "'Chicken Bowl'" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_chipo.index[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 10. For the most-ordered item, how many items were ordered?" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "761" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_chipo[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 11. What was the most ordered item in the choice_description column?" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "choice_description\n", + "[Diet Coke] 134\n", + "[Coke] 123\n", + "[Sprite] 77\n", + "[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream, Lettuce]] 42\n", + "[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream, Guacamole, Lettuce]] 40\n", + " ... 
\n", + "[Roasted Chili Corn Salsa, [Fajita Vegetables, Rice, Cheese, Guacamole, Lettuce]] 1\n", + "[Roasted Chili Corn Salsa, [Fajita Vegetables, Rice, Cheese, Sour Cream, Guacamole]] 1\n", + "[Roasted Chili Corn Salsa, [Fajita Vegetables, Rice, Guacamole, Lettuce]] 1\n", + "[Roasted Chili Corn Salsa, [Fajita Vegetables, Rice, Guacamole]] 1\n", + "[[Tomatillo-Red Chili Salsa (Hot), Tomatillo-Green Chili Salsa (Medium)], [Rice, Pinto Beans, Fajita Veggies, Lettuce]] 1\n", + "Name: choice_description, Length: 1043, dtype: int64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_by_desc = chipo.groupby(\"choice_description\")[\"choice_description\"].count()\n", + "grouped_by_desc.sort_values(ascending=False, inplace=True)\n", + "grouped_by_desc" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[Diet Coke]'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_by_desc.index[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 12. How many items were ordered in total?" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4972" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(chipo[\"quantity\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 13. Turn the item price into a float" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Step 13.a. 
Check the item price type" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo[\"item_price\"].dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Step 13.b. Create a lambda function and change the type of item price" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "# note that there were currency symbol\n", + "chipo[\"item_price\"] = chipo[\"item_price\"].apply(lambda x: float(x[1:]) )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Step 13.c. Check the item price type" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo[\"item_price\"].dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 14. How much was the revenue for the period in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39237.02" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo[\"total_price\"] = chipo[\"item_price\"] * chipo[\"quantity\"]\n", + "chipo[\"total_price\"].sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 15. How many orders were made in the period?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1834" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# note that one order id can have several item\n", + "# group by order id\n", + "len(chipo.groupby(\"order_id\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 16. What is the average revenue amount per order?" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21.394231188658654" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Solution 1\n", + "group_by_id = chipo.groupby(\"order_id\")[\"total_price\"].sum()\n", + "sum(group_by_id) / len(chipo.groupby(\"order_id\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Solution 2\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 17. How many different items are sold?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo[\"item_name\"].nunique()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/01_Getting_&_Knowing_Your_Data/Occupation/Exercisess_solved.ipynb b/01_Getting_&_Knowing_Your_Data/Occupation/Exercisess_solved.ipynb new file mode 100644 index 000000000..db8c76192 --- /dev/null +++ b/01_Getting_&_Knowing_Your_Data/Occupation/Exercisess_solved.ipynb @@ -0,0 +1,1057 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ex3 - Getting and Knowing your Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we are going to pull data directly from the internet.\n", + "Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.\n", + "\n", + "### Step 1. Import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. 
Assign it to a variable called users and use the 'user_id' as index" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "user_id = pd.read_csv(\"https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user\", sep=\"|\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. See the first 25 entries" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagegenderoccupationzip_code
0124Mtechnician85711
1253Fother94043
2323Mwriter32067
3424Mtechnician43537
4533Fother15213
5642Mexecutive98101
6757Madministrator91344
7836Madministrator05201
8929Mstudent01002
91053Mlawyer90703
101139Fother30329
111228Fother06405
121347Meducator29206
131445Mscientist55106
141549Feducator97301
151621Mentertainment10309
161730Mprogrammer06355
171835Fother37212
181940Mlibrarian02138
192042Fhomemaker95660
202126Mwriter30068
212225Mwriter40206
222330Fartist48197
232421Fartist94533
242539Mengineer55107
\n", + "
" + ], + "text/plain": [ + " user_id age gender occupation zip_code\n", + "0 1 24 M technician 85711\n", + "1 2 53 F other 94043\n", + "2 3 23 M writer 32067\n", + "3 4 24 M technician 43537\n", + "4 5 33 F other 15213\n", + "5 6 42 M executive 98101\n", + "6 7 57 M administrator 91344\n", + "7 8 36 M administrator 05201\n", + "8 9 29 M student 01002\n", + "9 10 53 M lawyer 90703\n", + "10 11 39 F other 30329\n", + "11 12 28 F other 06405\n", + "12 13 47 M educator 29206\n", + "13 14 45 M scientist 55106\n", + "14 15 49 F educator 97301\n", + "15 16 21 M entertainment 10309\n", + "16 17 30 M programmer 06355\n", + "17 18 35 F other 37212\n", + "18 19 40 M librarian 02138\n", + "19 20 42 F homemaker 95660\n", + "20 21 26 M writer 30068\n", + "21 22 25 M writer 40206\n", + "22 23 30 F artist 48197\n", + "23 24 21 F artist 94533\n", + "24 25 39 M engineer 55107" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.head(25)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. See the last 10 entries" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagegenderoccupationzip_code
93393461Mengineer22902
93493542Mdoctor66221
93593624Mother32789
93693748Meducator98072
93793838Ftechnician55038
93893926Fstudent33319
93994032Madministrator02215
94094120Mstudent97229
94194248Flibrarian78209
94294322Mstudent77841
\n", + "
" + ], + "text/plain": [ + " user_id age gender occupation zip_code\n", + "933 934 61 M engineer 22902\n", + "934 935 42 M doctor 66221\n", + "935 936 24 M other 32789\n", + "936 937 48 M educator 98072\n", + "937 938 38 F technician 55038\n", + "938 939 26 F student 33319\n", + "939 940 32 M administrator 02215\n", + "940 941 20 M student 97229\n", + "941 942 48 F librarian 78209\n", + "942 943 22 M student 77841" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.tail(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. What is the number of observations in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "943" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. What is the number of columns in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.shape[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. Print the name of all the columns." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['user_id', 'age', 'gender', 'occupation', 'zip_code'], dtype='object')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9. How is the dataset indexed?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=943, step=1)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# in range(0, 943)\n", + "user_id.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 10. What is the data type of each column?" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 943 entries, 0 to 942\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 user_id 943 non-null int64 \n", + " 1 age 943 non-null int64 \n", + " 2 gender 943 non-null object\n", + " 3 occupation 943 non-null object\n", + " 4 zip_code 943 non-null object\n", + "dtypes: int64(2), object(3)\n", + "memory usage: 37.0+ KB\n" + ] + } + ], + "source": [ + "user_id.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 11. Print only the occupation column" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 technician\n", + "1 other\n", + "2 writer\n", + "3 technician\n", + "4 other\n", + " ... \n", + "938 student\n", + "939 administrator\n", + "940 student\n", + "941 librarian\n", + "942 student\n", + "Name: occupation, Length: 943, dtype: object" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id[\"occupation\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 12. How many different occupations are in this dataset?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id[\"occupation\"].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 13. What is the most frequent occupation?" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'student'" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m= user_id.groupby(\"occupation\")[\"occupation\"].count()\n", + "m.sort_values(ascending=False, inplace=True)\n", + "m.index[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 14. Summarize the DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idage
count943.000000943.000000
mean472.00000034.051962
std272.36495112.192740
min1.0000007.000000
25%236.50000025.000000
50%472.00000031.000000
75%707.50000043.000000
max943.00000073.000000
\n", + "
" + ], + "text/plain": [ + " user_id age\n", + "count 943.000000 943.000000\n", + "mean 472.000000 34.051962\n", + "std 272.364951 12.192740\n", + "min 1.000000 7.000000\n", + "25% 236.500000 25.000000\n", + "50% 472.000000 31.000000\n", + "75% 707.500000 43.000000\n", + "max 943.000000 73.000000" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 15. Summarize all the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
genderoccupationzip_code
count943943943
unique221795
topMstudent55414
freq6701969
\n", + "
" + ], + "text/plain": [ + " gender occupation zip_code\n", + "count 943 943 943\n", + "unique 2 21 795\n", + "top M student 55414\n", + "freq 670 196 9" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "obj_cols = user_id.select_dtypes(include=\"object\")\n", + "obj_cols.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 16. Summarize only the occupation column" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 943\n", + "unique 21\n", + "top student\n", + "freq 196\n", + "Name: occupation, dtype: object" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id[\"occupation\"].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 17. What is the mean age of users?" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "34.05196182396607" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id[\"age\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 18. What is the age with least occurrence?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "30 39\n", + "25 38\n", + "22 37\n", + "28 36\n", + "27 35\n", + " ..\n", + "7 1\n", + "66 1\n", + "11 1\n", + "10 1\n", + "73 1\n", + "Name: age, Length: 61, dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.age.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7 1\n", + "66 1\n", + "11 1\n", + "10 1\n", + "73 1\n", + "Name: age, dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_id.age.value_counts().tail()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/01_Getting_&_Knowing_Your_Data/World Food Facts/Exercises_solved.ipynb b/01_Getting_&_Knowing_Your_Data/World Food Facts/Exercises_solved.ipynb new file mode 100644 index 000000000..0b2e77779 --- /dev/null +++ b/01_Getting_&_Knowing_Your_Data/World Food Facts/Exercises_solved.ipynb @@ -0,0 +1,563 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1. Go to https://www.kaggle.com/openfoodfacts/world-food-facts/data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Download the dataset to your computer and unzip it." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Use the tsv file and assign it to a dataframe called food" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds\n", + "Your Kaggle username: \n", + "Your Kaggle username: \n", + "Your Kaggle username: chewziqing\n", + "Your Kaggle Key: ········\n", + "Downloading world-food-facts.zip to .\\world-food-facts\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████| 109M/109M [01:27<00:00, 1.30MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import opendatasets as od\n", + "od.download(\"https://www.kaggle.com/openfoodfacts/world-food-facts/data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\USER\\AppData\\Local\\Temp\\ipykernel_12124\\2931642990.py:5: DtypeWarning: Columns (0,3,5,19,20,24,25,26,27,28,36,37,38,39,48) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(path, sep=\"\\t\")\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "path = \"world-food-facts/en.openfoodfacts.org.products.tsv\"\n", + "os.listdir(\"./world-food-facts/\")\n", + "df = pd.read_csv(path, sep=\"\\t\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. See the first 5 entries" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
codeurlcreatorcreated_tcreated_datetimelast_modified_tlast_modified_datetimeproduct_namegeneric_namequantity...fruits-vegetables-nuts_100gfruits-vegetables-nuts-estimate_100gcollagen-meat-protein-ratio_100gcocoa_100gchlorophyl_100gcarbon-footprint_100gnutrition-score-fr_100gnutrition-score-uk_100gglycemic-index_100gwater-hardness_100g
03087http://world-en.openfoodfacts.org/product/0000...openfoodfacts-contributors14741038662016-09-17T09:17:46Z14741038932016-09-17T09:18:13ZFarine de blé noirNaN1kg...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
14530http://world-en.openfoodfacts.org/product/0000...usda-ndb-import14890699572017-03-09T14:32:37Z14890699572017-03-09T14:32:37ZBanana Chips Sweetened (Whole)NaNNaN...NaNNaNNaNNaNNaNNaN14.014.0NaNNaN
24559http://world-en.openfoodfacts.org/product/0000...usda-ndb-import14890699572017-03-09T14:32:37Z14890699572017-03-09T14:32:37ZPeanutsNaNNaN...NaNNaNNaNNaNNaNNaN0.00.0NaNNaN
316087http://world-en.openfoodfacts.org/product/0000...usda-ndb-import14890557312017-03-09T10:35:31Z14890557312017-03-09T10:35:31ZOrganic Salted Nut MixNaNNaN...NaNNaNNaNNaNNaNNaN12.012.0NaNNaN
416094http://world-en.openfoodfacts.org/product/0000...usda-ndb-import14890556532017-03-09T10:34:13Z14890556532017-03-09T10:34:13ZOrganic PolentaNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 163 columns

\n", + "
" + ], + "text/plain": [ + " code url \\\n", + "0 3087 http://world-en.openfoodfacts.org/product/0000... \n", + "1 4530 http://world-en.openfoodfacts.org/product/0000... \n", + "2 4559 http://world-en.openfoodfacts.org/product/0000... \n", + "3 16087 http://world-en.openfoodfacts.org/product/0000... \n", + "4 16094 http://world-en.openfoodfacts.org/product/0000... \n", + "\n", + " creator created_t created_datetime \\\n", + "0 openfoodfacts-contributors 1474103866 2016-09-17T09:17:46Z \n", + "1 usda-ndb-import 1489069957 2017-03-09T14:32:37Z \n", + "2 usda-ndb-import 1489069957 2017-03-09T14:32:37Z \n", + "3 usda-ndb-import 1489055731 2017-03-09T10:35:31Z \n", + "4 usda-ndb-import 1489055653 2017-03-09T10:34:13Z \n", + "\n", + " last_modified_t last_modified_datetime product_name \\\n", + "0 1474103893 2016-09-17T09:18:13Z Farine de blé noir \n", + "1 1489069957 2017-03-09T14:32:37Z Banana Chips Sweetened (Whole) \n", + "2 1489069957 2017-03-09T14:32:37Z Peanuts \n", + "3 1489055731 2017-03-09T10:35:31Z Organic Salted Nut Mix \n", + "4 1489055653 2017-03-09T10:34:13Z Organic Polenta \n", + "\n", + " generic_name quantity ... fruits-vegetables-nuts_100g \\\n", + "0 NaN 1kg ... NaN \n", + "1 NaN NaN ... NaN \n", + "2 NaN NaN ... NaN \n", + "3 NaN NaN ... NaN \n", + "4 NaN NaN ... 
NaN \n", + "\n", + " fruits-vegetables-nuts-estimate_100g collagen-meat-protein-ratio_100g \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " cocoa_100g chlorophyl_100g carbon-footprint_100g nutrition-score-fr_100g \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN 14.0 \n", + "2 NaN NaN NaN 0.0 \n", + "3 NaN NaN NaN 12.0 \n", + "4 NaN NaN NaN NaN \n", + "\n", + " nutrition-score-uk_100g glycemic-index_100g water-hardness_100g \n", + "0 NaN NaN NaN \n", + "1 14.0 NaN NaN \n", + "2 0.0 NaN NaN \n", + "3 12.0 NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + "[5 rows x 163 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. What is the number of observations in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "356027" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. What is the number of columns in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "163" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. Print the name of all the columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['code', 'url', 'creator', 'created_t', 'created_datetime',\n", + " 'last_modified_t', 'last_modified_datetime', 'product_name',\n", + " 'generic_name', 'quantity',\n", + " ...\n", + " 'fruits-vegetables-nuts_100g', 'fruits-vegetables-nuts-estimate_100g',\n", + " 'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g',\n", + " 'carbon-footprint_100g', 'nutrition-score-fr_100g',\n", + " 'nutrition-score-uk_100g', 'glycemic-index_100g',\n", + " 'water-hardness_100g'],\n", + " dtype='object', length=163)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. What is the name of 105th column?" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'-glucose_100g'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns[104]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9. What is the type of the observations of the 105th column?" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.columns[104]].dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 10. How is the dataset indexed?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=356027, step=1)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 11. What is the product name of the 19th observation?" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Lotus Organic Brown Jasmine Rice'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[18, \"product_name\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/02_Filtering_&_Sorting/Chipotle/Exercises_solved.ipynb b/02_Filtering_&_Sorting/Chipotle/Exercises_solved.ipynb new file mode 100644 index 000000000..05646c6ed --- /dev/null +++ b/02_Filtering_&_Sorting/Chipotle/Exercises_solved.ipynb @@ -0,0 +1,756 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ex1 - Filtering and Sorting Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we are going to pull data directly from the internet.\n", + "Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.\n", + "\n", + "### Step 1. 
Import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Assign it to a variable called chipo." + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
..................
461718331Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...$11.75
461818331Steak Burrito[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...$11.75
461918341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$11.25
462018341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...$8.75
462118341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$8.75
\n", + "

4622 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "... ... ... ... \n", + "4617 1833 1 Steak Burrito \n", + "4618 1833 1 Steak Burrito \n", + "4619 1834 1 Chicken Salad Bowl \n", + "4620 1834 1 Chicken Salad Bowl \n", + "4621 1834 1 Chicken Salad Bowl \n", + "\n", + " choice_description item_price \n", + "0 NaN $2.39 \n", + "1 [Clementine] $3.39 \n", + "2 [Apple] $3.39 \n", + "3 NaN $2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 \n", + "... ... ... \n", + "4617 [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... $11.75 \n", + "4618 [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... $11.75 \n", + "4619 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $11.25 \n", + "4620 [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75 \n", + "4621 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75 \n", + "\n", + "[4622 rows x 5 columns]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = \"https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv\"\n", + "chipo = pd.read_csv(path, sep=\"\\t\")\n", + "chipo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. How many products cost more than $10.00?" 
# item_price is the line total (unit price * quantity), not the unit price
def clean_price_col(chipo, col):
    """Return a copy of ``chipo`` whose ``col`` column holds float prices.

    Values such as ``"$2.39 "`` are stripped of surrounding whitespace and of
    a leading ``$`` before conversion.

    Parameters
    ----------
    chipo : pandas.DataFrame
        Frame containing the price column. It is not modified.
    col : str
        Name of the price column to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``col`` has dtype ``float64``.

    Notes
    -----
    The function is safe to re-run: an already-numeric column is only cast to
    float. A value without a ``$`` prefix keeps its first digit (the previous
    ``x[1:]`` slice dropped the first character unconditionally).
    """
    cleaned = chipo.copy()  # never mutate the caller's frame
    prices = cleaned[col]
    if not pd.api.types.is_numeric_dtype(prices):
        # strip whitespace first so a padded "$" is still seen as leading
        prices = prices.str.strip().str.lstrip("$")
    cleaned[col] = prices.astype("float")
    return cleaned


def remove_duplicate(chipo, cols):
    """Return ``chipo`` without rows that repeat the same values in ``cols``.

    Parameters
    ----------
    chipo : pandas.DataFrame
        Frame to de-duplicate. It is not modified.
    cols : list of str
        Column names that together identify a duplicate.

    Returns
    -------
    pandas.DataFrame
        A new frame keeping the first row of every distinct combination
        (the ``drop_duplicates`` default).
    """
    return chipo.drop_duplicates(subset=cols)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_nameitem_price
606Steak Salad Bowl11.89
1229Barbacoa Salad Bowl11.89
1132Carnitas Salad Bowl11.89
93Carnitas Burrito11.75
1008Carnitas Crispy Tacos11.75
.........
6Side of Chips1.69
329Bottled Water1.50
263Canned Soft Drink1.25
28Canned Soda1.09
34Bottled Water1.09
\n", + "

135 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " item_name item_price\n", + "606 Steak Salad Bowl 11.89\n", + "1229 Barbacoa Salad Bowl 11.89\n", + "1132 Carnitas Salad Bowl 11.89\n", + "93 Carnitas Burrito 11.75\n", + "1008 Carnitas Crispy Tacos 11.75\n", + "... ... ...\n", + "6 Side of Chips 1.69\n", + "329 Bottled Water 1.50\n", + "263 Canned Soft Drink 1.25\n", + "28 Canned Soda 1.09\n", + "34 Bottled Water 1.09\n", + "\n", + "[135 rows x 2 columns]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_chipo_2 = chipo.drop_duplicates(subset=[\"item_name\", \"item_price\"])\n", + "new_chipo_2 = new_chipo_2[new_chipo_2.quantity==1]\n", + "\n", + "new_chipo_2.loc[:, [\"item_name\", \"item_price\"]].sort_values(by=\"item_price\", ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. Sort by the name of the item" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
3389136026 Pack Soft Drink[Diet Coke]12.98
34114816 Pack Soft Drink[Diet Coke]6.49
184974916 Pack Soft Drink[Coke]6.49
186075416 Pack Soft Drink[Diet Coke]6.49
2713107616 Pack Soft Drink[Coke]6.49
..................
23849481Veggie Soft Tacos[Roasted Chili Corn Salsa, [Fajita Vegetables,...8.75
7813221Veggie Soft Tacos[Fresh Tomato Salsa, [Black Beans, Cheese, Sou...8.75
285111321Veggie Soft Tacos[Roasted Chili Corn Salsa (Medium), [Black Bea...8.49
16996881Veggie Soft Tacos[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...11.25
13955671Veggie Soft Tacos[Fresh Tomato Salsa (Mild), [Pinto Beans, Rice...8.49
\n", + "

4622 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name \\\n", + "3389 1360 2 6 Pack Soft Drink \n", + "341 148 1 6 Pack Soft Drink \n", + "1849 749 1 6 Pack Soft Drink \n", + "1860 754 1 6 Pack Soft Drink \n", + "2713 1076 1 6 Pack Soft Drink \n", + "... ... ... ... \n", + "2384 948 1 Veggie Soft Tacos \n", + "781 322 1 Veggie Soft Tacos \n", + "2851 1132 1 Veggie Soft Tacos \n", + "1699 688 1 Veggie Soft Tacos \n", + "1395 567 1 Veggie Soft Tacos \n", + "\n", + " choice_description item_price \n", + "3389 [Diet Coke] 12.98 \n", + "341 [Diet Coke] 6.49 \n", + "1849 [Coke] 6.49 \n", + "1860 [Diet Coke] 6.49 \n", + "2713 [Coke] 6.49 \n", + "... ... ... \n", + "2384 [Roasted Chili Corn Salsa, [Fajita Vegetables,... 8.75 \n", + "781 [Fresh Tomato Salsa, [Black Beans, Cheese, Sou... 8.75 \n", + "2851 [Roasted Chili Corn Salsa (Medium), [Black Bea... 8.49 \n", + "1699 [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... 11.25 \n", + "1395 [Fresh Tomato Salsa (Mild), [Pinto Beans, Rice... 8.49 \n", + "\n", + "[4622 rows x 5 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo.sort_values(\"item_name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. What was the quantity of the most expensive item ordered?" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 4622.000000\n", + "mean 7.464336\n", + "std 4.245557\n", + "min 1.090000\n", + "25% 3.390000\n", + "50% 8.750000\n", + "75% 9.250000\n", + "max 44.250000\n", + "Name: item_price, dtype: float64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chipo[\"item_price\"].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
3598144315Chips and Fresh Tomato SalsaNaN44.25
\n", + "
" + ], + "text/plain": [ + " order_id quantity item_name choice_description \\\n", + "3598 1443 15 Chips and Fresh Tomato Salsa NaN \n", + "\n", + " item_price \n", + "3598 44.25 " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_price = chipo.item_price.max()\n", + "\n", + "max_price_item = chipo[chipo.item_price == max_price]\n", + "max_price_item" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. How many times was a Veggie Salad Bowl ordered?" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(chipo[chipo.item_name == \"Veggie Salad Bowl\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9. How many times did someone order more than one Canned Soda?" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len( chipo[(chipo.item_name == \"Canned Soda\") & (chipo.quantity>1)] )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}