# Run 2 : TF-IDF

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from gensim.utils import simple_preprocess

### Chargement des données d'apprentissage, de validation et de test

In [2]:
# Read the train / validation / test splits; all three files are
# comma-separated and UTF-8 encoded, so the parsing options are shared.
read_options = dict(sep=',', encoding='utf-8')

df_train = pd.read_csv("../data/train.csv", **read_options)
df_validation = pd.read_csv("../data/validation.csv", **read_options)
df_test = pd.read_csv("../data/test.csv", **read_options)

### Vectorisation des données

In [3]:
# Free-text columns merged into the single document that is fed to TF-IDF.
TEXT_COLUMNS = ['titre', 'ingredients', 'recette']


def build_document(df):
    """Return one space-joined text per row built from TEXT_COLUMNS.

    Missing values are replaced by an empty string before concatenation:
    in pandas, ``NaN + " " + text`` evaluates to NaN, so a single empty
    field would otherwise wipe out the whole document for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in TEXT_COLUMNS.

    Returns
    -------
    pandas.Series
        Concatenated text, indexed like ``df``.
    """
    document = df[TEXT_COLUMNS[0]].fillna("")
    for column in TEXT_COLUMNS[1:]:
        document = document + " " + df[column].fillna("")
    return document


# Apply the same preparation to every split instead of copy-pasting it.
for split_df in (df_train, df_validation, df_test):
    split_df['vecteur'] = build_document(split_df)
    # NOTE(review): 'tokenize_text' is not read by any visible cell; the
    # TfidfVectorizer below is fed the raw 'vecteur' text, not these tokens.
    # Kept so the DataFrames expose the same columns as before.
    split_df['tokenize_text'] = [simple_preprocess(text) for text in split_df['vecteur']]

# The vectorizer is fitted on the training split only; validation and test
# are transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(df_train['vecteur'])
x_validation = vectorizer.transform(df_validation['vecteur'])
x_test = vectorizer.transform(df_test['vecteur'])

# Target labels come from the 'type' column of each split.
y_train = df_train["type"]
y_validation = df_validation["type"]
y_test = df_test["type"]

### Apprentissage du SVC

In [4]:
# Train a linear-kernel SVC on the TF-IDF training matrix, then predict the
# labels of the test split. `fit` returns the estimator, so it can be chained.
model = SVC(kernel="linear").fit(x_train, y_train)

y_pred = model.predict(x_test)

### Résultats

In [5]:
# Evaluate the test-set predictions. Every metric is called as
# (y_true, y_pred), the order scikit-learn documents. Accuracy is symmetric,
# so the printed value is unchanged, but the three calls are now consistent.
score = accuracy_score(y_test, y_pred)
print("Accuracy:", score, "\n")

# Per-class report, then the confusion matrix (rows: true labels,
# columns: predicted labels).
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8825648414985591 

                precision    recall  f1-score   support

       Dessert       0.98      1.00      0.99       407
        Entrée       0.79      0.72      0.75       337
Plat principal       0.87      0.89      0.88       644

      accuracy                           0.88      1388
     macro avg       0.88      0.87      0.87      1388
  weighted avg       0.88      0.88      0.88      1388

[[405   1   1]
 [  5 244  88]
 [  3  65 576]]
