# Preparing training data for the BERT model

In [1]:
import pandas as pd
import sqlite3
import os
import spacy

In [2]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any of the cells shown here — confirm
# it is still needed before paying for this load on every run.
nlp = spacy.load("en_core_web_sm")

Defining user names just as in the ClassApp

In [3]:
# User identifiers; each one maps to a database file named data_<user>.db.
user_list = ['GV', 'NA', 'RC', 'WZ', 'EM', 'FR', 'HC', 'JF', 'MG']
db_names = [f'data_{user}.db' for user in user_list]
db_names

['data_GV.db',
 'data_NA.db',
 'data_RC.db',
 'data_WZ.db',
 'data_EM.db',
 'data_FR.db',
 'data_HC.db',
 'data_JF.db',
 'data_MG.db']

Creating paths to their .db files in the classapp_output folder

In [4]:
# Prefix every database file name with the folder that holds them.
db_paths = [
    os.path.join('classapp_output', file_name)
    for file_name in db_names
]
db_paths

['classapp_output\\data_GV.db',
 'classapp_output\\data_NA.db',
 'classapp_output\\data_RC.db',
 'classapp_output\\data_WZ.db',
 'classapp_output\\data_EM.db',
 'classapp_output\\data_FR.db',
 'classapp_output\\data_HC.db',
 'classapp_output\\data_JF.db',
 'classapp_output\\data_MG.db']

Creating a list of data frames from all users

In [5]:
# Read the `class_methods` table of every user database that exists on disk
# into its own data frame.
data_frames = []

sql = '''
SELECT * FROM class_methods
'''
for db_path in db_paths:
    # Users whose database file is missing are skipped.
    if not os.path.exists(db_path):
        continue
    conn = sqlite3.connect(db_path)
    try:
        # Read-only query, so there is nothing to commit afterwards.
        data_frames.append(pd.read_sql_query(sql, conn))
    finally:
        # Release the connection even when the query fails (e.g. missing table).
        conn.close()


Concatenating that list into one data frame

In [6]:
# Stack the per-user frames row-wise into a single frame. Each frame keeps its
# own row index, so index values may repeat across users.
users_methods = pd.concat(objs=data_frames, axis='index')
users_methods.head(5)

Unnamed: 0,user,method_id,description,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,...,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
0,GV,1,"Put the mushrooms, chard, oil, garlic, chilli,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,GV,2,"Bring a large pan of salted water to the boil,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,GV,3,Add the drained pasta and chopped tomatoes to ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,GV,3,Add the drained pasta and chopped tomatoes to ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,GV,4,"Toast the cumin seeds, fennel seeds and black ...",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Saving each description by method id for later reinsertion

In [7]:
# Lookup table: one description per method id.
# De-duplicating on `method_id` alone (instead of on the id/description pair)
# keeps the index unique even if two users' databases hold slightly different
# text for the same id; the first description encountered wins. A unique index
# is needed because the label vectors are aligned to this table by index later.
description_by_method_id = (
    users_methods[["method_id", "description"]]
    .drop_duplicates(subset="method_id")
    .set_index("method_id")
)
description_by_method_id.head(10)

Unnamed: 0_level_0,description
method_id,Unnamed: 1_level_1
1,"Put the mushrooms, chard, oil, garlic, chilli,..."
2,"Bring a large pan of salted water to the boil,..."
3,Add the drained pasta and chopped tomatoes to ...
4,"Toast the cumin seeds, fennel seeds and black ..."
5,Transfer to a mortar and pestle and grind to a...
7,"Combine the chickpeas, lemon juice, garlic, cu..."
8,"Add more lemon juice, garlic, cumin or salt to..."
10,"Put the milk, water and sugar in a small sauce..."
500,Slowly pour in around one-third of the porcini...
501,"Add the salt and pepper, taste and add a littl..."


Summing the classifications for each method id across all users (to get a multi-label classification)

In [8]:
# Collapse every user's row for a method into one label vector by summing the
# technique columns per method id.
# `numeric_only=True` keeps the text columns (`user`, `description`) out of the
# aggregation explicitly: pandas >= 2.0 no longer drops them silently and would
# concatenate the strings instead, leaking extra columns into the result.
vectorized_methods = users_methods.groupby('method_id').sum(numeric_only=True)
vectorized_methods.head(3)

Unnamed: 0_level_0,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,Bain Marie,Reducing,Water Bathing,...,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
method_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Joining the descriptions

In [9]:
# Place the descriptions and the label vectors side by side, aligned on their
# shared method_id index, then replace that index with a plain 0..n-1 range.
classified_methods = pd.concat(
    [description_by_method_id, vectorized_methods],
    axis=1,
)
classified_methods = classified_methods.reset_index(drop=True)
classified_methods.head(5)

Unnamed: 0,description,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,Bain Marie,Reducing,...,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
0,"Put the mushrooms, chard, oil, garlic, chilli,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Bring a large pan of salted water to the boil,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Add the drained pasta and chopped tomatoes to ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Toast the cumin seeds, fennel seeds and black ...",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Transfer to a mortar and pestle and grind to a...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Listing the techniques (the label columns)

In [10]:
# Every column after the first one (`description`) is a technique label.
techniques = classified_methods.columns[1:].tolist()
techniques[:5]

['Measuring', 'Plating', 'Smoking', 'Toasting', 'Microwaving']

Fixing anomalies in the table (clamping every label value to the 0–1 range)

In [11]:
# Force every label back into the binary range: summed values above 1 become 1
# and negative values become 0. Values already inside [0, 1] are left as they are.
for label_column in techniques:
    classified_methods[label_column] = classified_methods[label_column].clip(lower=0, upper=1)


Splitting the data into training and test sets

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
df_train, df_test = train_test_split(classified_methods, test_size=0.2, random_state=42)

In [17]:
# Display the held-out test rows to sanity-check the split.
df_test

Unnamed: 0,description,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,Bain Marie,Reducing,...,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
11,"Add the onion and cook, stirring occasionally,...",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,"Add the salt and pepper, taste and add a littl...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,"Put the mushrooms, chard, oil, garlic, chilli,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Pickling the data frame and saving it as CSV for later use in training

In [18]:
# Persist the full labelled table (pickle + CSV) and the train/test splits (CSV).
# The output folder is created first so a fresh checkout does not fail on the
# first write; `exist_ok=True` makes the cell safe to re-run.
os.makedirs('hand_classified_methods', exist_ok=True)
classified_methods.to_pickle(os.path.join('hand_classified_methods', 'classified_methods'))
classified_methods.to_csv(os.path.join('hand_classified_methods', 'classified_methods.csv'), index=False)
df_train.to_csv(os.path.join('hand_classified_methods', 'methods_train.csv'), index=False)
df_test.to_csv(os.path.join('hand_classified_methods', 'methods_test.csv'), index=False)