In [25]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [2]:
avis_df = pd.read_csv('data_preprocess_sans_stopwords.csv')
details_df = pd.read_csv('details.csv')
infos_scrapping_df = pd.read_csv('infos_scrapping.csv')
jeux_df = pd.read_csv('jeux.csv')

In [33]:
avis_df['comment'] = avis_df['comment'].fillna('')

In [34]:
y = avis_df["note"]

# transform the note column into a binary classification target variable
y_binary = np.where(y > 7, 1, -1)

In [35]:
# Split the data into training and test sets (40% for training, 10% for testing)
X_train, X_test, y_train, y_test = train_test_split(avis_df['comment'], y_binary, test_size=0.1, stratify=y, random_state=42)

In [36]:
# Apply Bag-of-Words (BoW) representation
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [37]:
# Apply TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [41]:
# Apply One-Hot encoding representation
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_one_hot = encoder.fit_transform(X_train.values.reshape(-1, 1))
X_test_one_hot = encoder.transform(X_test.values.reshape(-1, 1))

In [42]:
# Split the data into training, validation, and test sets
#X_train_bow, X_val_test_bow, y_train, y_val_test = train_test_split(X_bow, y_binary, test_size=0.5, stratify=y, random_state=42)
#X_val, X_test, y_val, y_test = train_test_split(X_val_test_bow, y_val_test, test_size=0.2, stratify=y_val_test, random_state=42)

In [45]:
# Train KNN models with Euclidean and Cosine distances
knn_euclidean = KNeighborsClassifier(metric='euclidean')
knn_cosine = KNeighborsClassifier(metric='cosine')

knn_euclidean.fit(X_train_bow, y_train)
knn_cosine.fit(X_train_bow, y_train)

# Predict using the trained models
y_pred_euclidean = knn_euclidean.predict(X_test_bow)
y_pred_cosine = knn_cosine.predict(X_test_bow)

# Calculate accuracy scores
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
accuracy_cosine = accuracy_score(y_test, y_pred_cosine)

In [47]:
# Print the accuracy scores
print("KNN with Bow and Euclidean distance accuracy:", accuracy_euclidean)
print("KNN with Bow and Cosine distance accuracy:", accuracy_cosine)

KNN with Bow and Euclidean distance accuracy: 0.6851871657754011
KNN with Bow and Cosine distance accuracy: 0.6939572192513369


In [49]:
knn_euclidean.fit(X_train_tfidf, y_train)
knn_cosine.fit(X_train_tfidf, y_train)

# Predict using the trained models
y_pred_euclidean = knn_euclidean.predict(X_test_tfidf)
y_pred_cosine = knn_cosine.predict(X_test_tfidf)

# Calculate accuracy scores
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
accuracy_cosine = accuracy_score(y_test, y_pred_cosine)

In [50]:
# Print the accuracy scores
print("KNN with Tf-Idf and Euclidean distance accuracy:", accuracy_euclidean)
print("KNN with Tf-Idf and Cosine distance accuracy:", accuracy_cosine)

KNN with Tf-Idf and Euclidean distance accuracy: 0.4836898395721925
KNN with Tf-Idf and Cosine distance accuracy: 0.710427807486631


In [51]:
knn_euclidean.fit(X_train_one_hot, y_train)
knn_cosine.fit(X_train_one_hot, y_train)

# Predict using the trained models
y_pred_euclidean = knn_euclidean.predict(X_test_one_hot)
y_pred_cosine = knn_cosine.predict(X_test_one_hot)

# Calculate accuracy scores
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
accuracy_cosine = accuracy_score(y_test, y_pred_cosine)

In [52]:
# Print the accuracy scores
print("KNN with One hot encoding and Euclidean distance accuracy:", accuracy_euclidean)
print("KNN with One hot encoding and Cosine distance accuracy:", accuracy_cosine)

KNN with One hot encoding and Euclidean distance accuracy: 0.6863636363636364
KNN with One hot encoding and Cosine distance accuracy: 0.6863636363636364
