In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC

# Read the train and test splits (comma-separated, UTF-8 encoded).
df_train, df_test = (
    pd.read_csv(csv_path, sep=',', encoding='utf-8')
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

# Build one free-text field per row by joining title, ingredients and
# recipe text with single spaces.
for _split in (df_train, df_test):
    _split['texte'] = _split['titre'] + " " + _split['ingredients'] + " " + _split['recette']

# French stop-word list, held as a set for constant-time membership tests.
stop_words = {*stopwords.words('french')}

def tokenize_text(text):
    """Tokenize a French text into lowercase alphabetic tokens without stop words.

    Args:
        text: Raw text to tokenize.

    Returns:
        list[str]: Lowercased tokens, in their original order, keeping only
        purely alphabetic tokens that are not in the module-level
        ``stop_words`` set.
    """
    kept = []
    for token in word_tokenize(text, language="french"):
        # Tokens that are not purely alphabetic (numbers, punctuation, or a
        # word still carrying an apostrophe) are discarded as a whole.
        if not token.isalpha():
            continue
        # Lowercase BEFORE the stop-word lookup so capitalised stop words
        # (e.g. at the start of a sentence) are filtered out too.
        # Assumes `stop_words` holds lowercase entries.
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

# Tokenize both splits with the same preprocessing function.
df_train['tokenize_text'] = df_train['texte'].apply(tokenize_text)
df_test['tokenize_text'] = df_test['texte'].apply(tokenize_text)

# Train the word embeddings on the training split only.
# A fixed seed plus a single worker thread makes the embeddings (and the
# scores reported below) repeatable from run to run; multi-threaded training
# reorders updates non-deterministically. Reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED.
model = Word2Vec(
    sentences=df_train['tokenize_text'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

def get_vector(words, model):
    """Return the mean word vector of a tokenized document.

    Args:
        words: Iterable of tokens.
        model: Trained model exposing its keyed vectors as ``model.wv``
            (membership test, list indexing and ``vector_size``).

    Returns:
        numpy.ndarray: 1-D array holding the element-wise mean of the vectors
        of the tokens present in ``model.wv``. When no token is known (empty
        document or only out-of-vocabulary tokens) a zero vector of length
        ``model.wv.vector_size`` is returned.
    """
    known = [word for word in words if word in model.wv]
    if not known:
        # Averaging over no vectors cannot produce a usable embedding (it
        # errors out or yields NaN), so fall back to a neutral zero vector.
        # float32 is the dtype gensim uses for its word vectors.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(model.wv[known], axis=0)

# Embed every document as the mean of its word vectors.
df_train['vector'] = df_train['tokenize_text'].apply(get_vector, args=(model,))
df_test['vector'] = df_test['tokenize_text'].apply(get_vector, args=(model,))

# Stack the per-document vectors into 2-D feature matrices; the labels come
# from the `type` column.
X_train, y_train = np.vstack(df_train['vector'].to_numpy()), df_train['type']
X_test, y_test = np.vstack(df_test['vector'].to_numpy()), df_test["type"]

# Linear-kernel SVM fitted on the training split, then applied to the test split.
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Per-class metrics followed by the confusion matrix.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

                precision    recall  f1-score   support

       Dessert       0.98      0.99      0.99       407
        Entrée       0.73      0.64      0.68       337
Plat principal       0.83      0.88      0.85       644

      accuracy                           0.85      1388
     macro avg       0.84      0.83      0.84      1388
  weighted avg       0.85      0.85      0.85      1388

[[403   4   0]
 [  4 214 119]
 [  4  76 564]]


In [None]:
from gensim.utils import simple_preprocess

# Second experiment: same pipeline, but tokenized with gensim's
# simple_preprocess instead of the NLTK-based tokenize_text.
# Reuses get_vector defined in the previous cell.

# Reload the splits so this cell starts from untouched frames.
df_train = pd.read_csv("../data/train.csv", sep=',', encoding='utf-8')
df_test = pd.read_csv("../data/test.csv", sep=',', encoding='utf-8')

# One free-text field per row: title + ingredients + recipe text.
df_train['texte'] = df_train['titre'] + " " + df_train['ingredients'] + " " + df_train['recette']
df_test['texte'] = df_test['titre'] + " " + df_test['ingredients'] + " " + df_test['recette']

df_train['tokenize_text'] = [simple_preprocess(text) for text in df_train['texte']]
df_test['tokenize_text'] = [simple_preprocess(text) for text in df_test['texte']]

# Train the embeddings on the training split only.
# A fixed seed plus a single worker thread makes the embeddings (and the
# scores printed below) repeatable from run to run; multi-threaded training
# reorders updates non-deterministically. Reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED.
model = Word2Vec(
    sentences=df_train['tokenize_text'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Embed every document as the mean of its word vectors.
df_train['vector'] = df_train['tokenize_text'].apply(lambda words: get_vector(words, model))
df_test['vector'] = df_test['tokenize_text'].apply(lambda words: get_vector(words, model))

# 2-D feature matrices and their labels.
X_train = np.vstack(df_train['vector'].values)
y_train = df_train['type']

X_test = np.vstack(df_test['vector'].values)
y_test = df_test["type"]

# Linear-kernel SVM fitted on the training split, evaluated on the test split.
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


                precision    recall  f1-score   support

       Dessert       0.98      0.99      0.99       407
        Entrée       0.73      0.61      0.67       337
Plat principal       0.81      0.88      0.85       644

      accuracy                           0.85      1388
     macro avg       0.84      0.83      0.83      1388
  weighted avg       0.84      0.85      0.84      1388

[[403   2   2]
 [  4 206 127]
 [  4  74 566]]
