# Sampling notebook

To run this notebook, I assume you have downloaded the complete Open Food Facts CSV export (link in the README file) into ./datas

In [1]:
import pandas as pd
import numpy as np

# Lift pandas' display limits so that wide/long frames are rendered in full
# instead of being truncated with "...".
pd.options.display.max_columns = None
pd.options.display.max_rows = None

***
# Read csv
## Separator
Use the tab (`\t`) separator, as specified in the Open Food Facts database description file
## Chunking
Use the 'chunking' method (reading the file a fixed number of rows at a time) to reduce memory consumption
## Filtering - canceled
Keep only the rows where the 'brands' field is not N/A

In [2]:
# Columns that are forced to be parsed as text instead of letting pandas
# infer a dtype for them.
# `str` is used rather than `np.string_`: that alias was removed in NumPy 2.0.
types = {
    "code": str,
    "abbreviated_product_name": str,
    "packaging_text": str,
    "emb_codes": str,
    "emb_codes_tags": str,
    "first_packaging_code_geo": str,
    "cities_tags": str,
    "ingredients_from_palm_oil_tags": str,
    "ingredients_that_may_be_from_palm_oil_tags": str,
    "brand_owner": str
}

# Parse the tab-separated export 100 000 rows at a time.
reader = pd.read_csv(
    "./datas/en.openfoodfacts.org.products.csv",
    sep="\t",
    skipinitialspace=True,
    chunksize=100000,
    dtype=types,
)

# Collect every chunk and concatenate once. Calling pd.concat inside the loop
# re-copies the whole accumulated frame for each chunk (quadratic cost).
# Filtering is intentionally disabled; to re-enable it, keep only
# `chunk[chunk.brands.notna()]` for each chunk before concatenating.
chunks = list(reader)
data = pd.concat(chunks)
del chunks  # drop the per-chunk references now that `data` owns a copy

row_nb, col_nb = data.shape
(row_nb, col_nb)

(1907318, 186)

***
# Create a sample of 10 000 rows
## Seeding
By keeping the same seed, we ensure the random generator will always return the same numbers. We want that behavior so that we always get the same sample.
## Sampling
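We use the randint method to generate random numbers corresponding to the dataset's row positions. Note that randint draws each number independently, so the same row can be picked more than once.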

In [3]:
# Number of rows wanted in the sample.
sample_size = 10000

# Seed NumPy's global generator so the draw below is identical on every run.
np.random.seed(294697)

# Draw `sample_size` integer row positions in [0, row_nb); positions may repeat.
sample_index = np.random.randint(low=0, high=row_nb, size=sample_size)
sample_index

array([ 883658, 1539094,   69186, ...,  836304, 1700372,  870365])

In [4]:
# Positional selection: pick the drawn row positions and keep every column.
sample = data.iloc[sample_index, :]

In [5]:
# Preview the first 100 sampled rows (all columns are shown, see display options).
sample.head(n=100)

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutriments,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil,ingredients_that_may_be_from_palm_oil_tags,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,brand_owner,ecoscore_score_fr,ecoscore_grade_fr,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-lignoceric-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,-dihomo-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-elaidic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_1
00g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,-soluble-fiber_100g,-insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
883658,3422440006072,http://world-en.openfoodfacts.org/product/3422...,kiliweb,1578241958,2020-01-05T16:32:38Z,1620137334,2021-05-04T14:08:54Z,Les creations de l'atelier,,,,fr:Etui en carton,fr-etui-en-carton,,Créaline,crealine,"Plats préparés, Soupes, Veloutés","en:meals,en:soups,en:cream-soups","Meals,Soups,Cream soups",,,,,,,,,,,,,,,,France,en:france,France,"Légumes 61% (petits pois 19%, courgettes 17%, ...",,,,en:celery,Celery,,,,0.0,,,,0.0,,,0.0,,,-1.0,a,3.0,Composite foods,One-dish meals,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,en:cream-soups,Cream soups,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,55.0,230.0,,3.2,0.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.2,1.1,,,,,,,,,,,,2.5,,,,0.54,0.216,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.0,,,,,,,,
1539094,8002816805499,http://world-en.openfoodfacts.org/product/8002...,kiliweb,1541424519,2018-11-05T13:28:39Z,1611783289,2021-01-27T21:34:49Z,Muesli Croccante Bio,,,,,,,Piu Cereali,piu-cereali,"Cibi e bevande a base vegetale, Cibi a base ve...","en:plant-based-foods-and-beverages,en:plant-ba...","Plant-based foods and beverages,Plant-based fo...",,,,,,Organic,en:organic,Organic,,,,,,,,Italy,en:italy,Italy,"fiocchi di AVENA bio 42.3%, zucchero di canna ...",en:soybeans,,,"en:milk,en:nuts,en:sesame-seeds,en:soybeans","Milk,Nuts,Sesame seeds,Soybeans",,,,0.0,,,,0.0,,,0.0,,,7.0,c,,Cereals and potatoes,Breakfast cereals,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,70.0,b,en:mueslis,Mueslis,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,429.0,1795.0,,16.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,17.0,,,,,,,,,8.4,,,10.0,,,,0.17,0.068,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.0,,,,,,,,
69186,32894018847,http://world-en.openfoodfacts.org/product/0032...,waistline-app,1618691730,2021-04-17T20:35:30Z,1618691733,2021-04-17T20:35:33Z,Egyptian Foul,,,,,,,حداءق كاليفورنيا,حداءق-كاليفورنيا,,,,,,,,,,,,,,,,,,,en:Egypt,en:egypt,Egypt,,,,,,,400g,400.0,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,,,,,,,,35.0,146.0,,2.0,0.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,1.25,,,2.5,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1310349,5352201082060,http://world-en.openfoodfacts.org/product/5352...,openfoodfacts-contributors,1490451961,2017-03-25T14:26:01Z,1628156528,2021-08-05T09:42:08Z,Kinnie,,,330 ml,,,,Kinnie,kinnie,"Beverages, Carbonated drinks","en:beverages,en:carbonated-drinks","Beverages,Carbonated drinks",,,,,,,,,,,,,,Österreich,,"Österreich,Frankreich","en:austria,en:france","Austria,France",,,,,,,,,,,,,,,,,,,,6.0,d,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,en:carbonated-drinks,Carbonated drinks,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,169.0,40.0,169.0,,9.8,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.8,0.0,,,,,,,,,,,,0.0,,,,0.011,0.0044,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,,,,,,,,
1293691,5060326271602,http://world-en.openfoodfacts.org/product/5060...,kiliweb,1568631234,2019-09-16T10:53:54Z,1603091759,2020-10-19T07:15:59Z,Evowhey 2.0 Vainilla HSN,,,1 x 30 g,,,,HSN,hsn,,,,,,,,,,,,,,,,,,,España,en:spain,Spain,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,416.0,1741.0,,6.9,4.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.9,6.8,,,,,,,,,,,,82.0,,,,1.0,0.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
777677,3258561490335,http://world-en.openfoodfacts.org/product/3258...,openfoodfacts-contributors,1506261424,2017-09-24T13:57:04Z,1561749866,2019-06-28T19:24:26Z,Saumon fumé Saivage Alaska,,,140 g,"Barquette,Film,Plastique,Frais,Sous atmosphère...","barquette,film,plastique,frais,sous-atmosphere...",,Belle France,belle-france,"Produits de la mer, Poissons, Saumons, Poisson...","en:seafood,en:fishes,en:salmons,en:smoked-fish...","Seafood,Fishes,Salmons,Smoked fishes,Smoked sa...",Océan Pacifique,fr:ocean-pacifique,fr:ocean-pacifique,France,france,"Pêché à la ligne, Point Vert, Fabriqué en Fran...","en:angled-fish,en:green-dot,en:made-in-france,...","Angled fish,Green Dot,Made in France,Smoked wi...",,,,,,France,,France,en:france,France,"_Saumon_ sauvage du Pacifique (97 %), sel (3 %).",en:fish,,,,,Une tranche 35 g,35.0,,0.0,,,,0.0,,,0.0,,,11.0,d,3.0,Fish Meat Eggs,Fish and seafood,"en:to-be-checked, en:complete, en:nutrition-fa...","en:to-be-checked,en:complete,en:nutrition-fact...","To be checked,Complete,Nutrition facts complet...",,8.0,e,en:wild-smoked-salmons,Wild smoked salmons,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,118.0,494.0,,2.1,0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.7,0.3,,,,,,,,,,,,24.1,,,,3.5,1.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.0,,,,,,,,
1890592,9310081459972,http://world-en.openfoodfacts.org/product/9310...,openfoodfacts-contributors,1568338884,2019-09-13T01:41:24Z,1581720290,2020-02-14T22:44:50Z,Birds Eye Australian Country Harvest,,Frozen mixed vegetables,,,,,Birds Eye,birds-eye,,,,,,,,,,,,,,,,,,,Australia,en:australia,Australia,"Carrot, cauliflower, beans, broccoli.",,,,,,,,,0.0,,,,0.0,,,0.0,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,112.0,,112.0,,0.3,0.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,2.5,,,,,,,,,2.8,,,1.8,,,,0.026,0.0104,,0.000238,,,,,0.019,,,,,,2.9e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1108727,4002809085000,http://world-en.openfoodfacts.org/product/4002...,kiliweb,1586613810,2020-04-11T14:03:30Z,1607247703,2020-12-06T09:41:43Z,Cookies aux pépites de chocolat,,,,,,,,,"Snacks, Snacks sucrés, Biscuits et gâteaux, Bi...","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Sweet snacks,Biscuits and cakes,Biscuit...",,,,,,,,,,,,,,,,en:be,en:belgium,Belgium,,,,,,,,,,,,,,,,,,,,26.0,e,,Sugary snacks,Biscuits and cakes,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,36.0,d,en:chocolate-chip-cookies,Chocolate chip cookies,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,479.0,2004.0,,20.0,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,67.0,44.0,,,,,,,,,,,,5.4,,,,0.61,0.244,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26.0,,,,,,,,
385331,708423080103,http://world-en.openfoodfacts.org/product/0708...,usda-ndb-import,1489061091,2017-03-09T12:04:51Z,1489061092,2017-03-09T12:04:52Z,Traditional Sweet Corn Bread,,,,,,,Firezna,firezna,,,,,,,,,,,,,,,,,,,United States,en:united-states,United States,Ingredients: enriched bleached flour (wheat fl...,,,,,,28 g (28 g),28.0,,3.0,,"en:e341,en:e341i,en:e500,en:e500ii,en:e521","E341 - Calcium phosphates,E341i - Monocalcium ...",0.0,,,0.0,,,,,4.0,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,,,,,,,,357.0,1494.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,82.14,32.14,,,,,,,,,3.6,,,7.14,,,,0.90678,0.362712,,0.0,,,,,0.0,,,,,,,,,,,,,,0.071,,0.00257,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1127688,4012836000389,http://world-en.openfoodfacts.org/product/4012...,he-loves-quenn-b,1440237386,2015-08-22T09:56:26Z,1616082847,2021-03-18T15:54:07Z,Bio-Dinkel-Waffelbrir,,,100g,,,,Spree Waffel,spree-waffel,,,,,,,,,,,,,,,,,,,Dänemark,en:denmark,Denmark,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,400.0,1674.0,,5.2,0.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,73.8,0.8,,,,,,,,,,,,12.7,,,,0.14,0.056,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


***
# Writing to CSV
## Separator
We use the ';' separator because some fields contain a list of tags separated by ','

In [6]:
# Export the sample, index included, as a ';'-separated file.
sample.to_csv(path_or_buf="./datas/sample.csv", sep=";")