![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer

# Loading CSV File

In [None]:
# Read the raw recipes CSV into the DataFrame used by the rest of the notebook.
raw_recipes_path = '../data/raw/recipes_one_line.csv'
recipes = pd.read_csv(raw_recipes_path)

In [None]:
# Preview the first rows of the freshly loaded DataFrame.
recipes.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes and non-null counts).
recipes.info()

# Prediction of NaN Values in Meal Class

In [None]:
# Treat empty or whitespace-only meal_class entries as missing (NaN).
# The regex also covers the single-space placeholder ' ' that was matched
# literally before, so previously handled rows are still converted.
recipes['meal_class'] = recipes['meal_class'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Count the recipes whose meal_class is missing and display the total.
meal_class_missing = recipes['meal_class'].isna()
nan_count = meal_class_missing.sum()
nan_count

In [None]:
# Tally how many recipes carry each meal_class label and display the tally.
meal_class_column = recipes['meal_class']
class_count = meal_class_column.value_counts()
class_count

In [None]:
# Inspect the first 50 rows to eyeball the meal_class values.
recipes.head(50)

In [None]:
# Recompute the per-label frequency of meal_class and display it.
meal_class_column = recipes['meal_class']
class_count = meal_class_column.value_counts()
class_count

In [None]:
# Store 'time(min)' as strings so it can be concatenated with the text columns.
time_as_text = recipes['time(min)'].astype(str)
recipes['time(min)'] = time_as_text

In [None]:
# Fill in missing meal_class values with a bag-of-words Naive Bayes classifier
# trained on the recipes that already have a label.

# Build one free-text feature per recipe from 'ingredients_combined',
# 'preparations' and 'time(min)' (already cast to str in the previous cell).
# Missing ingredient/preparation text is replaced with '' first: str + NaN
# yields NaN, and CountVectorizer cannot vectorize a NaN document.
recipes['text_features'] = (
    recipes['ingredients_combined'].fillna('')
    + ' ' + recipes['preparations'].fillna('')
    + ' ' + recipes['time(min)']
)

# Load Portuguese stopwords and convert to a list
nltk.download('stopwords')
stop_words = list(set(stopwords.words('portuguese')))

# Prepare the data: only rows with a known meal_class can be used for training
data = recipes.dropna(subset=['meal_class'])
X = data['text_features']  # Use the combined text column as input
y = data['meal_class']

# Encode the string labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the labelled rows into train and test sets (40% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=24)

# Vectorize text data; the vocabulary is learned from the training split only
vectorizer = CountVectorizer(stop_words=stop_words)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a text classification model
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

# Predict meal_class for rows with NaN values. The mask is computed once and
# reused for the assignment; the whole step is skipped when nothing is missing
# (e.g. when the cell is re-run), because predicting on zero samples raises.
missing_mask = recipes['meal_class'].isna()
nan_rows = recipes[missing_mask]
if not nan_rows.empty:
    X_nan = nan_rows['text_features']
    X_nan_vec = vectorizer.transform(X_nan)
    predicted_labels = classifier.predict(X_nan_vec)

    # Inverse transform the labels to get the predicted meal_class values
    predicted_classes = label_encoder.inverse_transform(predicted_labels)

    # Assign the predicted values to the DataFrame
    recipes.loc[missing_mask, 'meal_class'] = predicted_classes

# The helper column is only needed for the model, so remove it again
recipes = recipes.drop('text_features', axis=1)

# Calculate accuracy on the test set (for illustration purposes)
y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.2f}")

In [None]:
# Show the meal_class distribution again, now that the cell above has run.
meal_class_column = recipes['meal_class']
class_count = meal_class_column.value_counts()
class_count

In [None]:
# Re-count the missing meal_class entries and display the total.
meal_class_missing = recipes['meal_class'].isna()
nan_count = meal_class_missing.sum()
nan_count

In [None]:
# Preview the first rows of the DataFrame at this point in the notebook.
recipes.head()

In [None]:
# Mark the placeholder values ' ' and '-' in the cost column as missing (NaN).
cost_placeholders = [' ', '-']
recipes['cost'] = recipes['cost'].replace(cost_placeholders, np.nan)

In [None]:
# Count the recipes whose cost is missing and display the total.
cost_missing = recipes['cost'].isna()
nan_count_cost = cost_missing.sum()
nan_count_cost

In [None]:
# Tally how many recipes carry each cost value and display the tally.
cost_column = recipes['cost']
cost_count = cost_column.value_counts()
cost_count

In [None]:
# Count the recipes whose difficulty is missing and display the total.
# Stored under its own name so it no longer overwrites the cost count
# held in nan_count_cost.
nan_count_difficulty = recipes['difficulty'].isna().sum()
nan_count_difficulty

In [None]:
# Tally how many recipes carry each difficulty value and display the tally.
difficulty_column = recipes['difficulty']
difficulty_count = difficulty_column.value_counts()
difficulty_count

In [None]:
# Print the DataFrame summary again to check dtypes and non-null counts before saving.
recipes.info()

# Saving into CSV File

In [None]:
# Write the cleaned DataFrame to disk without the index column.
clean_recipes_path = '../data/raw/recipes_one_clean.csv'
recipes.to_csv(clean_recipes_path, index=False)

In [None]:
# List the distinct meal_class values present in the DataFrame.
recipes['meal_class'].unique()