In [5]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv("../data/train.csv")
df['text'] = df['titre'] + ' ' + df['ingredients'] + ' ' + df['recette']

stop_words = set(stopwords.words('french'))

def tokenize_text(text):
    tokens = word_tokenize(text, language="french")
    mal_segmentes = [t for t in tokens if "'" in t]
    # print(mal_segmentes)
    tokens = [word.lower() for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

df['tokenize_text'] = df['text'].apply(tokenize_text)

model = Word2Vec(sentences=df['tokenize_text'], vector_size=100, window=5, min_count=1, workers=4)

def get_vector(words, model):
    words = [word for word in words if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

df['vector'] = df['tokenize_text'].apply(lambda words: get_vector(words, model))

X = np.vstack(df['vector'].values)
y = df['type']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#clf = RandomForestClassifier(n_estimators=100, random_state=42)
#clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))

                precision    recall  f1-score   support

       Dessert       0.96      1.00      0.98       610
        Entrée       0.67      0.48      0.56       465
Plat principal       0.78      0.87      0.82       921

      accuracy                           0.82      1996
     macro avg       0.80      0.78      0.79      1996
  weighted avg       0.81      0.82      0.81      1996

[[607   2   1]
 [ 12 222 231]
 [ 11 105 805]]


In [6]:
from gensim.utils import simple_preprocess

df = pd.read_csv("../data/train.csv")

df['text'] = df['titre'] + ' ' + df['ingredients'] + ' ' + df['recette']

df['tokenize_text'] = [simple_preprocess(text) for text in df['text']]

model = Word2Vec(sentences=df['tokenize_text'], vector_size=100, window=5, min_count=1, workers=4)

def get_vector(words, model):
    words = [word for word in words if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

df['vector'] = df['tokenize_text'].apply(lambda words: get_vector(words, model))

X = np.vstack(df['vector'].values)
y = df['type']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))


                precision    recall  f1-score   support

       Dessert       0.96      0.99      0.98       610
        Entrée       0.66      0.43      0.52       465
Plat principal       0.76      0.88      0.82       921

      accuracy                           0.81      1996
     macro avg       0.79      0.77      0.77      1996
  weighted avg       0.80      0.81      0.80      1996

[[604   5   1]
 [ 12 201 252]
 [ 10 100 811]]
