In [None]:
import pandas as pd
import numpy as np

# Read the source CSV into a DataFrame, using its first column as the index
data = pd.read_csv(
    '/home/nalin21478/BTP/ML-food-Processing/Numerical_Textual_ML/Data/65_Nuts.csv',
    index_col=0,
)

In [None]:
# Preview the first five rows of the loaded table
data.head(n=5)

In [None]:
# Inspect the 'Main_food_description' column on its own
data.loc[:, "Main_food_description"]

In [None]:
# Show the loaded DataFrame as this cell's output
data

In [None]:
# List the column labels of the loaded DataFrame
data.columns


In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel



# Partition the columns by dtype: everything that is not object-typed is kept
# as a numerical feature, object-typed columns are routed to the BERT branch.
numerical_data = data.select_dtypes(exclude=['object'])
categorical_data = data.select_dtypes(include=['object'])

# Load the pretrained 'bert-base-uncased' encoder and its matching tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize and obtain embeddings for a single value
def get_embeddings(value):
    """Return a mean-pooled BERT embedding for one text value.

    The value is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``bert_model``, and the final hidden states are averaged
    over the token axis (``dim=1``).

    Args:
        value: Text to embed. A missing scalar (``None``/``NaN``) is embedded
            as the empty string and any other non-text scalar as
            ``str(value)``, so one bad cell does not abort a whole column.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 axes squeezed out.
    """
    # Object-dtype columns can hold NaN or numbers next to strings; normalise
    # those to text before tokenizing. Strings and sequences pass through as-is.
    if not isinstance(value, (str, list, tuple)):
        value = '' if pd.isna(value) else str(value)

    inputs = tokenizer(value, return_tensors='pt', padding=True, truncation=True)

    # Inference only: disabling autograd avoids building a graph for every
    # value, which saves memory and time without changing the result.
    with torch.no_grad():
        outputs = bert_model(**inputs)

    embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze().detach().numpy()
    return embeddings

# Function to obtain embeddings for each value in a categorical column
def get_column_embeddings(column):
    """Embed every entry of ``column`` with ``get_embeddings``.

    Args:
        column: Iterable of values, processed in iteration order.

    Returns:
        list: One embedding per entry, in the same order as ``column``.
    """
    return [get_embeddings(entry) for entry in column]

# Column-level summary: mean embedding over all rows of each categorical column
categorical_embeddings_aggregated = {}
# Per-sample features: one (n_rows x embedding_dim) frame per categorical column
per_row_embedding_frames = []

# Step 2: Generate Word Embeddings for Categorical Data
for col in categorical_data.columns:
    print(f'Obtaining embeddings for {col}...')
    # Stack the per-value vectors so that row i holds the embedding of sample i
    row_embeddings = np.vstack(get_column_embeddings(categorical_data[col]))
    # Keep the column-wide average available under its original name
    categorical_embeddings_aggregated[col] = row_embeddings.mean(axis=0)
    per_row_embedding_frames.append(
        pd.DataFrame(
            row_embeddings,
            columns=[f'{col}_emb_{i}' for i in range(row_embeddings.shape[1])],
        )
    )

# Build the feature table from the per-row embeddings, not from the averages.
# Averaging over rows first collapses every column to a single vector, which
# yields one row per embedding dimension instead of one row per sample and
# cannot be aligned with the numerical data below.
if per_row_embedding_frames:
    categorical_embeddings_df = pd.concat(per_row_embedding_frames, axis=1)
else:
    # No object-typed columns: contribute zero features but keep the row count
    categorical_embeddings_df = pd.DataFrame(index=range(len(categorical_data)))

## Combine Numerical and Categorical Data with Embeddings
# Both sides are re-indexed positionally so rows are matched by order
data_with_embeddings = pd.concat([numerical_data.reset_index(drop=True), categorical_embeddings_df.reset_index(drop=True)], axis=1)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE

# Split data into features (X) and target (y); the target column is 'novaclass'
X = data_with_embeddings.drop(columns=['novaclass'])  # Features
y = data_with_embeddings['novaclass']  # Target

# Hold out 20% of the rows for testing. The split is seeded so the metrics
# printed below can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training split only; the test split is left untouched.
# NOTE(review): nothing in this cell imputes missing values - confirm the
# features are NaN-free before resampling.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train a Random Forest classifier (seeded for reproducibility)
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print F1 score and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier


# Initialize and train an ExtraTreesClassifier on the SMOTE-resampled training
# data from the preceding cell. Seeded so the reported metrics are repeatable,
# matching the seeded SMOTE / GradientBoosting / DecisionTree steps.
et_classifier = ExtraTreesClassifier(random_state=42)
et_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = et_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print F1 score and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
import re
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
# Build a copy of data_with_embeddings whose column names are safe for LightGBM:
# every character other than letters, digits and '_' is removed from the name.
# Labels are passed through str() first so non-string column labels do not
# make re.sub raise.
data_with_embeddings_lgbm = data_with_embeddings.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', str(x)))
X = data_with_embeddings_lgbm.drop(columns=['novaclass'])  # Features
y = data_with_embeddings_lgbm['novaclass']  # Target

# Split data into train and test sets (seeded so the metrics are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training split only.
# NOTE(review): no imputation is done in this cell - confirm the features are NaN-free.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train a LightGBM classifier (seeded for reproducibility)
lgbm_classifier = LGBMClassifier(random_state=42)
lgbm_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = lgbm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print F1 score and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE

# Split data into features (X) and target (y); the target column is 'novaclass'
X = data_with_embeddings.drop(columns=['novaclass'])  # Features
y = data_with_embeddings['novaclass']  # Target

# Split data into train and test sets. Seeded: the classifier below already
# fixes random_state, which only gives repeatable metrics if the split is
# repeatable too.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.impute import SimpleImputer

# Oversample with SMOTE on the training split only.
# NOTE(review): SimpleImputer is imported but never applied - confirm the
# features are NaN-free before resampling.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = gb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print F1 score and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

import xgboost as xgb

# Shift both label vectors down by one; training and evaluation in this cell
# use the shifted labels throughout.
# NOTE(review): presumably 'novaclass' is 1-based so this yields 0-based ids - confirm.
y_train_xg = y_train - 1
y_test_xg = y_test - 1

# Balance the training split with SMOTE, using the shifted labels
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_xg)

# Build an XGBoost classifier with default settings and fit it on the resampled data
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_resampled, y_train_resampled)

# Score the untouched test split
y_pred = xgb_classifier.predict(X_test)

# Overall accuracy against the shifted test labels
accuracy = accuracy_score(y_test_xg, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix followed by the per-class report
for heading, table in (
    ("Confusion Matrix:", confusion_matrix(y_test_xg, y_pred)),
    ("\nClassification Report:", classification_report(y_test_xg, y_pred)),
):
    print(heading)
    print(table)

# Summary scores: weighted F1 and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test_xg, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test_xg, y_pred))



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

X = data_with_embeddings.drop(columns=['novaclass'])  # Features
y = data_with_embeddings['novaclass']  # Target

# Split data into train and test sets. Seeded: the tree below already fixes
# random_state, which only gives repeatable metrics if the split is repeatable too.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.impute import SimpleImputer

# Oversample with SMOTE on the training split only. This is done once: a
# second identical fit_resample call would just recompute the same result.
# NOTE(review): SimpleImputer is imported but never applied - confirm the
# features are NaN-free before resampling.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train a Decision Tree classifier.
# The variable keeps the name 'gb_classifier' used by the neighbouring cells.
gb_classifier = DecisionTreeClassifier(random_state=42)
gb_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = gb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print F1 score and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer



# Fit a k-nearest-neighbours classifier (default settings) on the resampled
# training data; the result is stored under the shared name 'gb_classifier'.
gb_classifier = KNeighborsClassifier().fit(X_train_resampled, y_train_resampled)

# Score the held-out test split
y_pred = gb_classifier.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix followed by the per-class report
for heading, table in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(table)

# Summary scores: weighted F1 and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer


# Initialize and train a multi-layer perceptron classifier on the resampled
# training data. Seeded so the reported metrics are repeatable, matching the
# seeded SMOTE / GradientBoosting / DecisionTree steps.
# The variable keeps the name 'gb_classifier' used by the neighbouring cells.
gb_classifier = MLPClassifier(random_state=42)
gb_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = gb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print F1 score and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer



# Fit a logistic-regression classifier (default settings) on the resampled
# training data; the result is stored under the shared name 'gb_classifier'.
gb_classifier = LogisticRegression().fit(X_train_resampled, y_train_resampled)

# Score the held-out test split
y_pred = gb_classifier.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix followed by the per-class report
for heading, table in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(table)

# Summary scores: weighted F1 and Matthews correlation coefficient
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, y_pred))


In [None]:
import os

# Persist the combined feature table (numerical columns plus embedding columns)
output_path = '/home/nalin21478/BTP/ML-food-Processing/Numerical_Textual_ML/Data/65 Nuts/65_Nuts_Embeddings_Bert.csv'
# to_csv does not create missing folders; make sure the target directory exists
# so the write cannot fail after the expensive embedding step has finished.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data_with_embeddings.to_csv(output_path)