In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from datetime import datetime


**IN THE ABOVE CELL, WE HAVE IMPORTED ALL THE LIBRARIES/MODULES TO BE USED**

* **MATPLOTLIB AND SEABORN FOR PLOTTING AND VISUALISATION**
* **TFIDF TO CNVERT TEXT INTO NUMERICAL FEATURES**
* **STANDARDSCALER TO STANDARDISE THE NUMERICAL FEATURES**
* **TRAIN_TEST_SPLIT HELPS TO SPLIT THE DATA INTO TRAINING AND VALIDATION SETS**
* **DUMMY CLASSIFIER, LOGISTC REGRESSION, XGB CLASSIFIER, RANDOM FOREST CLASSIFIER, KNN CLASSIFIER, SUPPORT VECTOR CLASSIFIER FOR MODEL BUILDING**
* **GRIDSEARCHCV FOR HYPERPARAMETER TUNING**
* **CONFUSION MATRIX AND CLASSIFICATION REPPORT TO GET AN EVALLUATION OF THE PERFORMANCE**


## ----------EDA AND DATA VISUALISATION ALONG WITH FEATURE EXTRACTION----------

**1. Loading the training set and making checking the data type of each attribute**

**2. The variable "data" will be storing the main training set titled "train.csv"**

In [2]:

data=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')

In [None]:

data.shape


In [None]:

data.info()


**Thus, we can see that there are 15 columns to the dataset and in those:**

**1. 10 are numerical in nature**

**2. 5 are non-numerical in nature**

**Now, we will be checking the summary statistics of each numerical feature except for ID, RecipeNumber, and RecipeCode**

In [None]:

num_col = ['UserReputation', 'CreationTimestamp', 'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'Rating', 'BestScore']
summary = data[num_col].describe()
print(summary)


**FROM THE ABOVE INFORMATION, WE CAN SEE THAT THERE ARE 75% ROWS WITH RATING 5, WITH THE MEDIAN BEING THE SAME. THUS, IT CAN BE SAID THAT THE DATASET HAS AN IMBALANCE.**

**NOW, IN ORDERT TO VISUALISE THE SUMMARY STATISTICS, WE WILL BE MAKING SOME PLOTS**

In [None]:

fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(14, 12))
fig.subplots_adjust(hspace=0.5)
axes = axes.flatten()

for i, column in enumerate(num_col):
    ax = axes[i]
    ax.hist(data[column], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.grid(True)
plt.tight_layout()
plt.show()


**Thus, we can see that around 75% of Ratings belong to category 5, around 10.26% to category 4, around 2.2% in category 3, around 1% in category 2, around 1.4% in category 1, and around 10.26% for category 0.**

**Thus, we can see that majority (more than half) of the ratings belong to the category 5**

**CreationTimeStamp, the time at which the comment was posted as a Unix64 timestamp, can be a hidden cue to better modeling. There could be temporal variations in user behaviour because from personal experience, we can see that Rating a dish online depends on time as well. We will try extracting hour and day feature from it.**

**HERE, FEATURE EXTRACTION IS ALSO BEING DONE**

In [3]:

data['CreationTimestamp'] = pd.to_datetime(data['CreationTimestamp'], unit='s')
data['Hour'] = data['CreationTimestamp'].dt.hour
data['Day'] = data['CreationTimestamp'].dt.dayofweek


In [None]:

data.info()


In [None]:

timecol=['Hour','Day']
summary = data[timecol].describe()
print(summary)


**Now checking we can see that two new features have been added to the dataset, namely Hour (numerical) and Day (numerical)**

**Now, the CreationTimestamp has been turned into datatype datetime with nanosecond precsion**

**We can now check how Rating varies with Day and Hour**

**CHECKING THE VARIATION OF RATING WITH DAY**

In [None]:

ratingvday = data.groupby('Day')['Rating'].mean()
plt.figure(figsize=(10, 6))
plt.plot(ratingvday.index, ratingvday.values, marker='o', color='skyblue', linestyle='-')
plt.title('Variation of Rating with Day')
plt.xlabel('Day')
plt.ylabel('Mean Rating')
plt.grid(True)
plt.xticks(ratingvday.index)
plt.show()


**We can see that the average rating for day 0 is between 3.8-3.9, average rating for day 1 is below 3.8, average rating for day 2 is between 3.9 and 4.0, average rating for day 3 is around 4.3, the average rating for day 4 crosses 4.4, for day 5 it is close to 3.9 whereas for day 6, the average rating is 4.**

**NOW WE WILL BE CHECKING THE VARIATION OF RATING WITH HOUR**

In [None]:

ratingvday = data.groupby('Hour')['Rating'].mean()
plt.figure(figsize=(10, 6))
plt.plot(ratingvday.index, ratingvday.values, marker='o', color='skyblue', linestyle='-')
plt.title('Variation of Rating with Hour')
plt.xlabel('Hour')
plt.ylabel('Mean Rating')
plt.grid(True)
plt.xticks(ratingvday.index)
plt.show()


**We can see that the avergae rating varies from hour 0 to 23 - 3.9 for 0, 4.3 for 1, 3.5 for 2, 4 for 3, 4.6 for 4, 4.8 for 5, 1.5 for 6, 3.8 for 7, 5 for 8, around 3.2 for 9, around 4.4 for 10, around 4.3 for 11, 3.8 for 12, around 3.8 for 13, around 3.9 for 14, 4 for 15, around 3.8 for 16, 4 for 17, around 4 for 18, around 3.6 for 19, around 4.3 for 20, 4 for 21, around 4 for 22, around 4.4 for 23.**

**Thus, we should keep Hour and Day in our consideration and model building**

**Now we can see the correlation matrix between all the numerical values**

**LET US VISUALISE IT USING HEATMAP**

In [None]:

num_col2 = ['UserReputation', 'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'BestScore','Hour','Day', 'Rating']
corltn_matrix = data[num_col2].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corltn_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Matrix of Numeric Columns')
plt.show()




**Currently, we can see Rating doesn't show any considerable correlation with other attributes. The best correlations are - Day, Hour, BestScore, and UserReputation. This might vary with Preprocessing**

## ----------DETECTING MISSING DATA AND IMPUTATION----------

**We need to see if there are any missing values and impute those places accordingly**

In [4]:

data.isna().sum()


ID                   0
RecipeNumber         0
RecipeCode           0
RecipeName           0
CommentID            0
UserID               0
UserName             0
UserReputation       0
CreationTimestamp    0
ReplyCount           0
ThumbsUpCount        0
ThumbsDownCount      0
Rating               0
BestScore            0
Recipe_Review        2
Hour                 0
Day                  0
dtype: int64

**Thus we can see that Recipe_Review has two missing values. Now, two missing values amongst 13636 rows would be statistically insignificant. We will be replacing them with empty string**

In [5]:

data.fillna('', inplace=True)


In [None]:

data.isna().sum()


**Thus, we can see that the two missing values in Recipe_Review are not there anymore**

## ----------PREPROCESSING----------

In [None]:

data.info()


**We will be dropping ID, RecipeNumber, RecipeCode, RecipeName, CommentID, UserID, UserName from the training set as they won't be relevant in terms of predicting the Rating. We will also be skipping CreationTimestamp as we have extracted day and hour out of it.**

**We will be keeping Recipe_Review because in real life scenarios, reviews and ratings often go hand in hand - good review leads to good rating as well as the other way around**

**The New Dataset which will be made from training set "data" will be titled "dataprime"**

In [6]:

coldrop=['ID', 'RecipeNumber', 'RecipeCode', 'RecipeName', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp']
dataprime=data.drop(columns=coldrop)


In [None]:

dataprime.info()


**dataprime has 8 numerical column and 1 non-numerical column**

In [None]:

dataprime.isna().sum()


**We cross checked dataprime has no missingvalues**

**NOW, WE WILL BE ENCODING THE Recipe Review COLUMN**

In [7]:

tfidf = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf.fit_transform(dataprime['Recipe_Review'])


In [None]:

tfidf_features.shape


**THE NUMBER OF COLUMNS IN THIS VARIABLE IS 1000 AFTER USING TFIDF**

In [None]:

dataprime.head()


**NOW, WE WILL BE SCALING THE NUMERICAL COLUMNS**

In [8]:

num_col3 = dataprime.select_dtypes(include=['int64','int32','float64']).columns
stscaler = StandardScaler()

dataprime[num_col3] = stscaler.fit_transform(dataprime[num_col3])


In [None]:

dataprime.head()


**NOW, LET US TRY PLOTTING THE CORRELATION MATRIX ONCE AGAIN USING THE ABOVE DATASET**

In [None]:

num_col4 = ['UserReputation', 'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'BestScore','Hour','Day', 'Rating']
corltn_matrix2 = dataprime[num_col4].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corltn_matrix2, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Matrix of Numeric Columns')
plt.show()


**THUS, WE CAN SEE THAT RATING AS TOP 4 CORRELATION WITH DAY, HOUR, BESTSCORE, AND USERREPUTATION**

## ----------MAKING X AND y SET FROM THE PREPROCCESSED DATA----------

**NOW WE WILL BE MAKING OUR X AND y DATASET**

In [9]:

X = pd.concat([dataprime[['Hour', 'Day', 'UserReputation', 'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'BestScore']], pd.DataFrame(tfidf_features.toarray())], axis=1)
y = data['Rating']


In [None]:

X.head()


**THUS WE CAN SEE THAT X HAS THE 7 NUMERICAL FEATURES AS WELL AS THE 1000 TFIDF FEATURES**

In [None]:

X.shape


**IT HAS 1007 COLUMNS**

In [None]:

y.head()


In [None]:

y.shape


**X and y are the new datasets which shall help us to train and validate models**

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)


## ----------WORKING ON THE TEST SET TITLED "test.csv" FOR FURTHER TESTING PURPOSE----------

In [10]:

test_data = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv')


**Test data is saved in a variable titled "test_data"**

In [None]:

test_data.shape


**THE TEST SET HAS 14 COLUMNS**

In [None]:
test_data.info()

**Thus, we can see that there:**

**1. There are 9 numerical features.**

**2. There are 5 categorical features.**

**Now we will be extracting Day and Hour from CreationTimeStamp for test_data as well**

In [11]:
test_data['CreationTimestamp'] = pd.to_datetime(test_data['CreationTimestamp'], unit='s')
test_data['Hour'] = test_data['CreationTimestamp'].dt.hour
test_data['Day'] = test_data['CreationTimestamp'].dt.dayofweek

In [None]:
test_data.info()

**HOUR AND DAY HAVE BEEN ADDED TO test_data AS WELL**

**NOW WE WILL BE ENCODING NON-NUMERICAL COLUMN AND SCALING THE NUMERICAL COLUMNS FOR test_data AS WELL**

In [12]:
tfidf_test= tfidf.transform(test_data['Recipe_Review'])

In [13]:
coldrop1=['ID', 'RecipeNumber', 'RecipeCode', 'RecipeName', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp']
test_data=test_data.drop(columns=coldrop)

In [14]:
num_col5 = test_data.select_dtypes(include=['int64','int32','float64']).columns
stscaler = StandardScaler()

test_data[num_col5] = stscaler.fit_transform(test_data[num_col5])

In [22]:
test_data = pd.concat([test_data[['Hour', 'Day', 'UserReputation', 'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'BestScore']], pd.DataFrame(tfidf_test.toarray())], axis=1)
test_data.columns=test_data.columns.astype(str)

**WE HAVE FOLLOWED ALL THE STEPS FOR TEST DATA WHICH WE DID FOR TRAINING AND VALIDATION DATA**

## ----------MODEL BUILDING----------

**NOW, WE SHALL BE TRYING MODEL BUILDING BASED ON THE TRAINING AND VALIDATION SET AND THEN APPLY IT ON THE TEST SET**

## MODEL 0: DUMMY CLASSIFIER

In [None]:
dc = DummyClassifier(strategy="most_frequent")
y_data = data['Rating'].copy()
data.drop(columns=['Rating'],axis=1)
dc.fit(data,y_data)


In [None]:
pred=dc.predict(test_data)

In [None]:
submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(pred)+1)]
submission['Rating'] = pred
submission.to_csv('submission.csv', index=False)

**THE VALUE WE GET IS 0.76066**

## MODEL 1:LOGISTIC REGRESSION 

In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
logregpred = logreg.predict(X_test)

In [None]:
logregaccuracy = accuracy_score(y_test, logregpred)
print("Logistic Regression Accuracy:", logregaccuracy)

**LOGISTIC REGRESSION ACCURACY FOR VALIDATION: 0.7694281524926686**

In [None]:
logregpred_test = logreg.predict(test_data)
print("Predictions for the test set:", logregpred_test)

In [None]:
submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(logregpred_test)+1)]
submission['Rating'] = logregpred_test
submission.to_csv('submission.csv', index=False)

**THE SCORE FOR TEST SET AFTER LOGISTIC REGRESSION IS 0.77540**

**NOW WE SHALL BE FINDING THE CONFUSION MATRIX AND CLASSIFICATION REPORT FOR LOGISTIC REGRESSION**

In [None]:
con_matrix = confusion_matrix(y_test,logregpred)
print("Confusion Matrix:")
print(con_matrix)
print(classification_report(y_test, logregpred))

**IN THE CONFUSION MATRIX, ROWS REPRESENT ACTUAL CLASSES AND THE COLUMNS REPRESENT THE PREDICTED ONES. R1(ROW1) TO R6 REPRESNT ACTUAL CLASSES FROM 0-5 WHEREAS C1(COLUMN1) TO C6 REPRESENT PREDICTED CLASSES. FOR EXAMPLE:**

* **R1C1 IS A TRUE POSITIVE FOR CLASS 0 (PREDICTED=ACTUAL)**
* **R2C1 IS A FALSE NEGATIVE FOR CLASS 1 CLASSIFIED AS CLASS 0**

**AND SO ON**

* **PRECISION GIVES THE RATIO OF TRUE PSOITIVES TO THE TOTAL PREDICTED POSITIVES** (HIGH PRECISION MEANS LESS FALSE POSITIVES)

* **RECALL GIVES THE RATIO OF TRUE POSITIVES TO THE TOTAL ACTUAL POSITIVE INSTANCES** (HIGH RECALL MEANS MODEL CAPTURING A LARGE PROPORTION OF POSITIVES)

**F-1 SCORE IS A TRADEOFF BETWEEN RECALL AND PRECISION AND COMES HANDY IN SUCH AN IMBALANCED DATA**

**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## MODEL 2: XGBOOST CLASSIFIER

In [None]:

xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)


In [None]:

xgbpred = xgbc.predict(X_test)
xgbaccuracy = accuracy_score(y_test, xgbpred)
print("XGBoost Classifier Accuracy:", xgbaccuracy)


**FOR XGBOOST, THE ACCURACY SCORE OF VALIDATION IS COMING: 0.7668621700879765**

In [None]:

xgbpred_test= xgbc.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(xgbpred_test)+1)]
submission['Rating'] = xgbpred_test
submission.to_csv('submission.csv', index=False)


**THE SCORE FOR TEST SET AFTER XGBOOST CLASSIFIER IS 0.77518**

**NOW WE SHALL BE FINDING THE CONFUSION MATRIX AND CLASSIFICATION REPORT FOR XGBOOST CLASSIFIER**

In [None]:

con_matrix1 = confusion_matrix(y_test,xgbpred)
print("Confusion Matrix:")
print(con_matrix1)
print(classification_report(y_test, xgbpred))


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## MODEL 3: RANDOM FOREST CLASSIFIER

In [None]:

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)


In [None]:

rfcpred = rfc.predict(X_test)
rfcaccuracy = accuracy_score(y_test, rfcpred)
print("Random Forest Classifier Accuracy:", rfcaccuracy)


**FOR RFC, THE ACCURACY SCORE IS 0.7617302052785924**

In [None]:

rfcpred_test = rfc.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(rfcpred_test)+1)]
submission['Rating'] = rfcpred_test
submission.to_csv('submission.csv', index=False)


**THE SCORE FOR TEST SET AFTER RANDOM FOREST CLASSIFIER IS 0.7622**

**NOW WE SHALL BE FINDING THE CONFUSION MATRIX AND CLASSIFICATION REPORT FOR RANDOM FOREST CLASSIFIER**

In [None]:

con_matrix2 = confusion_matrix(y_test,rfcpred)
print("Confusion Matrix:")
print(con_matrix2)
print(classification_report(y_test, rfcpred))


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## MODEL 4: KNN CLASSIFIER

In [None]:

knnc = KNeighborsClassifier(n_neighbors=20)
knnc.fit(X_train, y_train)


In [None]:

knnpred = knnc.predict(X_test)
knnaccuracy = accuracy_score(y_test, knnpred)
print("KNN Classifier Accuracy:", knnaccuracy)


**FOR KNN, THE ACCURACY SCORE IS 0.7565982404692082**

In [None]:

knnpred_test= knnc.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(knnpred_test)+1)]
submission['Rating'] = knnpred_test
submission.to_csv('submission.csv', index=False)


**THE SCORE FOR TEST SET AFTER KNN CLASSIFIER IS 0.76902**

**NOW WE SHALL BE FINDING THE CONFUSION MATRIX AND CLASSIFICATION REPORT FOR KNN CLASSIFIER**

In [None]:

con_matrix3 = confusion_matrix(y_test,knnpred)
print("Confusion Matrix:")
print(con_matrix3)
print(classification_report(y_test, knnpred))


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## MODEL 5: SUPPORT VECTOR CLASSIFIER

In [None]:

svmc = SVC()
svmc.fit(X_train, y_train)


In [None]:

svmcpred = svmc.predict(X_test)
svmcaccuracy = accuracy_score(y_test, svmcpred)
print("SVM Classifier Accuracy:", svmcaccuracy)


**FOR SVC, THE ACCURACY SCORE IS 0.7562316715542522**

In [None]:

svmcpred_test= svmc.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(svmcpred_test)+1)]
submission['Rating'] = svmcpred_test
submission.to_csv('submission.csv', index=False)


**FOR SVC, THE TEST SET ACCURACY IS 0.77232**

**NOW WE SHALL BE FINDING THE CONFUSION MATRIX AND CLASSIFICATION REPORT FOR SUPPORT VECTOR CLASSIFIER**

In [None]:

con_matrix4 = confusion_matrix(y_test,svmcpred)
print("Confusion Matrix:")
print(con_matrix4)
print(classification_report(y_test, svmcpred))


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## ----------HYPERPRAMETER TUNING----------

## HYPERPARAMETER TUNING FOR LOGISTIC REGRESSION

In [None]:

param_grid = {
    'C': [ 1, 10, 100 ],  
    'penalty': ['l1', 'l2']  
}
logreg = LogisticRegression(max_iter=1000)
gridsearch = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring='accuracy')
gridsearch.fit(X_train, y_train)


In [None]:

print("Best hyperparameters:", gridsearch.best_params_)


In [None]:

print("Best accuracy:", gridsearch.best_score_)


**BEST ACCURACY FOR LOGISTIC REGRESSION FOR GRIDSEARCHCV IS 0.7755783953660289; THE BEST HYPERPARAMETERS ARE - C: 1; PENALTY: l2**

In [None]:

gridlogpred = gridsearch.best_estimator_.predict(X_test)
accuracy_test = accuracy_score(y_test, gridlogpred)
print("Accuracy on test set:", accuracy_test)


**BEST ACCURACY FOR LOGISTIC REGRESSION AFTER APPLYING GRIDSEARCHCV IS 0.7694281524926686**

In [None]:

gridlogpredtest = gridsearch.best_estimator_.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(gridlogpredtest)+1)]
submission['Rating'] = gridlogpredtest
submission.to_csv('submission.csv', index=False)


**ACCURACY FOR LOGISTIC REGRESSION ON TEST SET AFTER GRIDSEARCHCV IS 0.7754**

**CONFUSION MATRIX AND CLASSIFICATION REPORT**

In [None]:

con_matrix5 = confusion_matrix(y_test,gridlogpred)
print("Confusion Matrix:")
print(con_matrix5)
print(classification_report(y_test, gridlogpred))


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**


## HYPERPARAMETR TUNING FOR XGBOOST CLASSIFIER

In [None]:

param_grid1 = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'n_estimators': [100]
}
xgbc = XGBClassifier()

gridsearch1 = GridSearchCV(estimator=xgbc, param_grid=param_grid1, cv=5, scoring='accuracy')
gridsearch1.fit(X_train, y_train)


In [None]:

print("Best hyperparameters:", gridsearch1.best_params_)


In [None]:

print("Best accuracy:", gridsearch1.best_score_)


**AFTER APPLYING GRIDSEARCHCV ON XGBOOST CLASSIFIER, WE GET:**


**1. BEST HYPERPARAMETERS: LEARNING RATE:0.2, MAX DEPTH:5, WHILE THE ESTIMATORS ARE 100**

**2. BEST ACCURACY THAT WE ARE GETTING IS 0.773836663695418**



In [None]:

gridxgbpred = gridsearch1.best_estimator_.predict(X_test)
accuracy_test1 = accuracy_score(y_test, gridxgbpred)
print("Accuracy on test set:", accuracy_test1)


**ACCURACY ON VALIDATION SET IS 0.7683284457478006**

In [None]:

gridxgbpredtest = gridsearch1.best_estimator_.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(gridxgbpredtest)+1)]
submission['Rating'] = gridxgbpredtest
submission.to_csv('submission.csv', index=False)


**ACCURACY ON TEST SET IS 0.77364**

**NOW LET US FIND THE CONFUSION MATRIX AND CLASSIFICATION REPORT**

In [None]:

con_matrix6 = confusion_matrix(y_test,gridxgbpred)
print("Confusion Matrix:")
print(con_matrix6)
print(classification_report(y_test, gridxgbpred)) 


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## HYPERPARAMETER TUNING OF RANDOM FOREST CLASSIFIER

In [None]:

param_grid2 = {
    'n_estimators': [100],      
    'max_depth': [None, 5],      
    'min_samples_split': [2, 5],      
}

rfc = RandomForestClassifier()
gridsearch2 = GridSearchCV(estimator=rfc, param_grid=param_grid2, cv=5, scoring='accuracy')
gridsearch2.fit(X_train, y_train)


In [None]:

print("Best hyperparameters:", gridsearch2.best_params_)


In [None]:

print("Best accuracy:", gridsearch2.best_score_)


**AFTER RUNNING GRIDSEARCHCV ON RFC, WE GET:**

* **1. AMONGST THE BEST HYPERPARAMETER, WE CAN SEE THAT FOR 100 ESTIMATORS, THE BEST MAX DEPTH IS NONE WHILE THE MINIMUM SAMPLE SPLIT IS 2.**

* **2. THE BEST ACCURACY IS 0.7697107466323397**

In [None]:

gridrfcpred = gridsearch2.best_estimator_.predict(X_test)
accuracy_test2 = accuracy_score(y_test, gridrfcpred)
print("Accuracy on test set:", accuracy_test2)


**ON VALIDATION SET, THE ACCURACY IS 0.7635630498533724**

In [None]:

gridrfcpredtest = gridsearch2.best_estimator_.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(gridrfcpredtest)+1)]
submission['Rating'] = gridrfcpredtest
submission.to_csv('submission.csv', index=False)


**ON TRAINING SET THE ACCURACY IS 0.7718**

**NOW WE WILL FIND THE CONFUSION MATRIX AND CLASSIFICATION REPORT**

In [None]:

con_matrix7 = confusion_matrix(y_test,gridrfcpred)
print("Confusion Matrix:")
print(con_matrix7)
print(classification_report(y_test, gridrfcpred)) 


In [None]:

precision = precision_score(y_test, gridrfcpred,average='weighted', zero_division='warn')
recall = recall_score(y_test, gridrfcpred,average='weighted', zero_division='warn')
f1 = f1_score(y_test, gridrfcpred,average='weighted', zero_division='warn')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

con_matrix7 = confusion_matrix(y_test, gridrfcpred)
print("Confusion Matrix:")
print(con_matrix7)


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## HYPERPARAMETER TUNING OF KNN CLASSIFIER

In [None]:

param_grid3 = {'n_neighbors': [3, 5, 7, 9, 11, 20]}
knnc = KNeighborsClassifier()
gridsearch3 = GridSearchCV(estimator=knnc, param_grid=param_grid3, cv = 5, scoring='accuracy')
gridsearch3.fit(X_train, y_train)


In [None]:

print("Best hyperparameters:", gridsearch3.best_params_)


In [None]:

print("Best accuracy:", gridsearch3.best_score_)


**THE INFORMATION RECEIVED AFTER RUNNING GRIDSEARCHCV ON KNN:**


**1. AMONGST THE BEST HYPERPRAMETERS, THE NUMBER OF NEIGHBOURS SHOULD BE 20**

**2. WHEREAS THE BEST ACCURACY COMES OUT AS 0.7631098256713361**



In [None]:

gridknnpred = gridsearch3.best_estimator_.predict(X_test)
accuracy_test3 = accuracy_score(y_test, gridknnpred)
print("Accuracy on test set:", accuracy_test3)


**ACCURACY ON VALIDATION TEST IS 0.7565982404692082**

In [None]:

gridknnpredtest = gridsearch3.best_estimator_.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(gridknnpredtest)+1)]
submission['Rating'] = gridknnpredtest
submission.to_csv('submission.csv', index=False)


**THE TEST SET ACCURACY IS 0.7633**

In [None]:

con_matrix8 = confusion_matrix(y_test,gridknnpred)
print("Confusion Matrix:")
print(con_matrix8)
print(classification_report(y_test, gridknnpred)) 


In [None]:

precision = precision_score(y_test, gridknnpred,average='weighted', zero_division='warn')
recall = recall_score(y_test, gridknnpred,average='weighted', zero_division='warn')
f1 = f1_score(y_test, gridknnpred,average='weighted', zero_division='warn')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

con_matrix8 = confusion_matrix(y_test, gridknnpred)
print("Confusion Matrix:")
print(con_matrix8)


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## HYPERPARAMETER TUNING OF SVCLASSIFIER

In [None]:

param_grid4 = {
    'C': [0.1, 1, 10],                
    'kernel': ['linear'],      
    'gamma': ['scale'],       
}
svmc = SVC()
gridsearch4 = GridSearchCV(estimator=svmc, param_grid=param_grid4, cv=5, scoring='accuracy')
gridsearch4.fit(X_train, y_train)


In [None]:

print("Best hyperparameters:", gridsearch4.best_params_)


In [None]:

print("Best accuracy:", gridsearch3.best_score_)


**AFTER RUNNING GRIDSEARCHCV ON SVC, WE GET:**

**1. Best hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}**

**2. Best accuracy: 0.7631098256713361**

In [None]:
gridsvcpred = gridsearch4.best_estimator_.predict(X_test)
accuracy_test4 = accuracy_score(y_test, gridsvcpred)
print("Accuracy on test set:", accuracy_test4)


**ACCURACY ON VALIDATION SET IS 0.7653958944281525**

In [None]:

gridsvcpredtest = gridsearch4.best_estimator_.predict(test_data)


In [None]:

submission = pd.DataFrame(columns=['ID', 'Rating'])
submission['ID'] = [__ for __ in range(1, len(gridsvcpredtest)+1)]
submission['Rating'] = gridsvcpredtest
submission.to_csv('submission.csv', index=False)


**THE ACCURACY FOR TEST SET IS 0.76902**

**NOW WE WILL BE FINDING CONFUSION MATRIX AND CLASSIFICATION REPORT**

In [None]:

con_matrix9 = confusion_matrix(y_test,gridsvcpred)
print("Confusion Matrix:")
print(con_matrix9)
print(classification_report(y_test, gridsvcpred)) 


**WE CAN SEE THAT THE HIGHEST VALUE OF PRECISION, RECALL, AND F-1 SCORE BELONGS TO CLASS 5, SUGGESTING THAT IT HAS MORE NUMBER OF INSTANCES**

## **----------COMPARISON OF THE MODELS----------**

**First, we are going to compare the accuracy of Logistic Regression, XGBoost Classifier, Random Forest Classifier,KNN Classifier, and Support Vector Classifier after Hyperparameter Tuning**

In [None]:

logregaccu=0.7754
xgbaccu=0.77364
rfcaccu=0.7718
knnaccu=0.7633
svcaccu=0.76902
model=['Logistic Regression','XGBoost Classifier','Random Forest Classifier', 'KNN Classifier', 'Support Vector Classifier']
accu=[logregaccu,xgbaccu,rfcaccu, knnaccu, svcaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'green', 'red','orange','black'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of Accuracy Scores')
plt.ylim(0.75, 0.78)  
plt.show()


* **WE CAN CLEARLY SEE THAT AFTER HYPERPARAMETER TUNING,LOGISTIC REGRESSION HAS ACHIVED THE HIGHEST ACCURACY.**

* **THE ACCURACY OF XGBOOST CLASSIFIER CLOSE TO THAT OF LOGISTIC REGRESSION. MORE EXHAUSTIVE HYPERPARAMETER TUNING CAN BRING THAT RESULT OUT.**

* **THE ACCURACY OF RANDOM FOREST CLASSIFIER FALLS SHORT COMPARED TO LOGISTIC REGRESSION AND XGBOOST CLASSIFIER. MORE EXHAUSTIVE HYPERPARAMETER TUNING CAN BRING THAT RESULT OUT.**

* **THE ACCURACY OF KNN CLASSIFIER IS MUCH LESS THAN THE OTHERS. MORE EXHAUSTIVE HYPERPARAMETER TUNING CAN BRING UP THE ACCURACY TO A HIGHER VALUE.**

* **THE ACCURACY OF SUPPORT VECTOR CLASSIFIER IS MORE THAN KNN CLASSIFIER BUT LESS THAN THE REST THREE. THIS DEMANDS FOR MORE REFINED AND EXHAUSTIVE HYPER PARAMETER TUNING.**

* **OVERALL, THE INSIGHTS DERIVED FROM THIS SET OF HYPERPARAMETER TUNING AND MODEL BUILDING GIVES AND IDEA OF THE RELATIVE PERFORMANCE OF THESE THREE MODELS.**


**Now, are going to compare the accuracy of Logistic Regression, XGBoost Classifier, Random Forest Classifier,KNN Classifier, and Support Vector Classifier prior to Hyperparametr Tuning. We will also consider**

In [None]:

logregaccu1=0.77540
xgbaccu1=0.77518
rfcaccu1=0.7622
knnaccu1=0.76902
svcaccu1=0.77232
dumaccu=0.76066
model=['Logistic Regression','XGBoost Classifier','Random Forest Classifier', 'KNN Classifier', 'Support Vector Classifier', 'Dummy Classifier']
accu=[logregaccu1,xgbaccu1,rfcaccu1, knnaccu1, svcaccu1, dumaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'green', 'red','orange','black', 'pink'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of Accuracy Scores Prior to Hyperparameter Tuning')
plt.ylim(0.75, 0.78)  
plt.show()


* **WE CAN SEE THAT LOGISTIC REGRESSION HAS ACHIEVED THE HIGHEST VALUE FOLLOWED BY XGBOOST CLASSIFIER.**
* **RANDOM FOREST CLASSIFIER IS GIVING THE LOWEST VALUE APART FROM DUMMY CLASSIFIER.**
* **KNN CLASSIFIER HAS REACHED A VALUE MORE THAN RANDOM FOREST CLASSIFIER BUT IS LESS THAN REST OF THE MODELS.**
* **SUPPORT VECTOR CLASSIFIER HAS DONE A BETTER JOB COMPARED TO RFC, KNN, AND DUMMY, REACHING THE THIRD HIGHEST VALUE.**
* **DUMMY CLASSIFIER HAS ATTAINED THE LOWEST VALUE**



**NOW WE WILL BE COMPARING EACH MODEL TO ITSELF BEFORE AND AFTER HYPERPARAMETER TUNING**

**1. LOGISTIC REGRESSION**

In [None]:

model=['Logistic Regression','Logistic Regression HPT']
accu=[logregaccu1,logregaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'red'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of LOGISTIC REGRESSION')
plt.ylim(0.75, 0.78)  
plt.show()


**WE CAN SEE THAT THE VALUE FOR LOGISTIC REGRESSION BEFORE AND AFTER HPT IS THE SAME. THIS ACTS AS A CORROBORATIVE EVIDENCE TO THE FACT MORE REFINED AND EXHAUSTIVE HPT IS TO BE DONE. THERE IS SCOPE OF BETTERMENT**

**2. XGBOOST**

In [None]:

model=['XGB CLASSIFIER','XGB CLASSIFIER HPT']
accu=[xgbaccu1,xgbaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'red'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of XGB CLASSIFIER')
plt.ylim(0.75, 0.78)  
plt.show()


**XGB CLASSIFIER HAS ITS VALUE GOING DOWN POST HPT. THIS SUGGESTS THAT BETTER HPT METHODS ARE TO BE USED TO SEE AN INCREMENT IN THE VALUE. THERE IS A LOT OF SCOPE FOR IMPROVEMENT**

**3. RANDOM FOREST CLASSIFIER**

In [None]:

model=['RANDOM FOREST CLASSIFIER','RANDOM FOREST CLASSIFIER HPT']
accu=[rfcaccu1,rfcaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'red'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of RFC')
plt.ylim(0.75, 0.78)  
plt.show()


**RANDOM FOREST CLASSIFIER HAS SHOWN BETTERMENT, WITH A PERCENT CHANGE OF 1.26%. MORE EXHAUSTIVE HPT HAS A POSITIVE CHANCE TO ADD TO THE ACCURACY.**

**4. KNN CLASSIFIER**

In [None]:

model=['KNN CLASSIFIER','KNN CLASSIFIER HPT']
accu=[knnaccu1,knnaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'red'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of KNN CLASSIFIER')
plt.ylim(0.75, 0.78)  
plt.show()


**KNN CLASSIFIER SEES A DROP IN ACCURACY AFTER HPT SUGGESTING THAT BETTER AND MORE EXHAUSTIVE HPT METHOD IS TO BE USED TO SEE ANY POSSIBLE INCREMENT**

**5. SUPPORT VECTOR CLASSIFIER**

In [None]:

model=['SUPPORT VECTOR CLASSIFIER','SUPPPORT VECTOR CLASSIFIER HPT']
accu=[svcaccu1,svcaccu]

plt.figure(figsize=(8, 6))
plt.bar(model, accu, color=['blue', 'red'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of SVC')
plt.ylim(0.75, 0.78)  
plt.show()


**SUPPORT VECTOR CLASSIFIER ALSO SEES A DROP OF 0.43% IN THE ACCURACY SUGGESTING THAT MORE EXHAUSTIVE HPT METHOD IS TO BE USED FOR THERE IS A SCOPE OF BETTERMENT**

## ##############################################################################################

## **TRYING TO INCREASE ACCURACY**

In [16]:
X.head()

Unnamed: 0,Hour,Day,UserReputation,ReplyCount,ThumbsUpCount,ThumbsDownCount,BestScore,0,1,2,...,990,991,992,993,994,995,996,997,998,999
0,-0.306714,0.083117,1.768748,-0.105742,-0.263606,-0.166499,-0.376842,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.237746,0.0,0.0,0.0,0.0,0.0
1,-0.306714,0.083117,-0.115041,-0.105742,-0.263606,0.139729,-0.376842,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.306714,0.083117,-0.115041,-0.105742,-0.263606,-0.166499,-0.376842,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.109435,-1.940771,-0.115041,-0.105742,-0.263606,-0.166499,-0.376842,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.176516,0.083117,-0.115041,-0.105742,-0.263606,-0.166499,-0.376842,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251517,0.0,0.0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [21]:
from sklearn.feature_selection import RFE
logreg = LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=1000) 
rfe = RFE(logreg, n_features_to_select=10)
rfe.fit(X_train, y_train)
selected_features = X_train.columns[rfe.support_]

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]


logreg.fit(X_train_selected, y_train)

train_score = logreg.score(X_train_selected, y_train)
test_score = logreg.score(X_test_selected, y_test)

print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.7638430509717639
Test Score: 0.7576979472140762


In [25]:
stf=test_data.columns[rfe.support_]
test_data_1=test_data[stf]

In [26]:
from sklearn.metrics import accuracy_score

# Calculate accuracy on the training set
train_predictions = logreg.predict(X_train_selected)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate accuracy on the testing set
test_predictions = logreg.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, test_predictions)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Assuming you have a new dataset X_new and corresponding target y_new
# Replace X_new, y_new with your actual new dataset
# Use the trained model to make predictions on the new dataset
new_predictions = logreg.predict(test_data_1)

# Calculate accuracy on the new dataset
new_accuracy = accuracy_score(y_test, new_predictions)
print("Accuracy on New Dataset:", new_accuracy)


Training Accuracy: 0.7638430509717639
Testing Accuracy: 0.7576979472140762


ValueError: Found input variables with inconsistent numbers of samples: [2728, 4546]

## ##############################################################################################

## ##############################################################################################

In [None]:
train = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')
train.head()

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
from sklearn.dummy import DummyClassifier
dc = DummyClassifier(strategy="most_frequent")
y_train = train['Rating'].copy()
train.drop(columns=['Rating'],axis=1)
dc.fit(train,y_train)

pred=dc.predict(test)
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(pred)+1)]
#submission['Rating'] = pred
#submission.to_csv('submission.csv', index=False)

In [None]:
y_train = train['Rating'].copy()
y_train

In [None]:
train.drop(columns=['Rating'],axis=1)

In [None]:
dc.fit(train,y_train)

In [None]:
test=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv')

In [None]:
pred=dc.predict(test)

In [None]:
pred

In [None]:
sample=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/sample.csv')

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(pred)+1)]
#submission['Rating'] = pred
#submission.to_csv('submission.csv', index=False)

FOR MILESTONE1


In [None]:
train.isna().sum()

In [None]:
train.info()

In [None]:
train.shape

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, train['Rating'], test_size=0.1)

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()

In [None]:
train['CommentID'] = label_encoder.fit_transform(train['CommentID'])
train['RecipeName'] = label_encoder.fit_transform(train['RecipeName'])
train['UserID'] = label_encoder.fit_transform(train['UserID'])
train['UserName'] = label_encoder.fit_transform(train['UserName'])
train['Recipe_Review'] = label_encoder.fit_transform(train['Recipe_Review'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()

scaled_data=scaler.fit_transform(train)

scaled_df=pd.DataFrame(scaled_data,columns=train.columns)

scaled_df.head()

In [None]:
scaled_df.shape

In [None]:
pred

**TRYING LOGISTIC REGRESSION**

In [None]:
data=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')
testdata=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv')

In [None]:
datanum=data.select_dtypes(include=['number'])
X_train=datanum.drop('Rating',axis=1)
y_train=datanum['Rating']
X_test=testdata.select_dtypes(include=['number'])

In [None]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(X_train, y_train)


In [None]:
y_pred=logreg.predict(X_test)
y_pred

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(y_pred)+1)]
#submission['Rating'] = y_pred
#submission.to_csv('submission.csv', index=False)

**TRYING RANDOM FOREST CLASSIFIER AFTER USING TFIDF**

In [None]:
import pandas as pd
import numpy as np
data=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')

In [None]:
data.fillna(0, inplace=True)

In [None]:
data.drop(['ID', 'RecipeCode', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp', 'RecipeName'], axis=1, inplace=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000) 
data['Recipe_Review'] = data['Recipe_Review'].astype(str)
tfidf_features = tfidf.fit_transform(data['Recipe_Review'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf.get_feature_names_out())
data = pd.concat([data, tfidf_df], axis=1)
data.drop('Recipe_Review', axis=1, inplace=True)

In [None]:

X = data.drop('Rating', axis=1)
y = data['Rating']


from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:

y_pred = clf.predict(X_val)


from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)


In [None]:
import pickle
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:

test_data = pd.read_csv("/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv")
test_data.drop(['ID', 'RecipeCode', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp', 'RecipeName'], axis=1, inplace=True)

test_data.fillna(0, inplace=True)


test_data['Recipe_Review'] = test_data['Recipe_Review'].astype(str)

tfidf_test = tfidf.transform(test_data['Recipe_Review'])
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=tfidf.get_feature_names_out())
test_data = pd.concat([test_data, tfidf_test_df], axis=1)
test_data.drop('Recipe_Review', axis=1, inplace=True)


import pickle
with open('random_forest_model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)


test_predictions = clf.predict(test_data)

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(test_predictions)+1)]
#submission['Rating'] = test_predictions
#submission.to_csv('submission.csv', index=False)

**NOW TRYING XGBOOST**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
data=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')
test_data = pd.read_csv("/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv")

In [None]:
data.drop(['ID', 'RecipeCode', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp', 'RecipeName'], axis=1, inplace=True)

In [None]:
data.fillna(0, inplace=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
data['Recipe_Review'] = data['Recipe_Review'].astype(str)
tfidf = TfidfVectorizer(max_features=1000) 
tfidf_features = tfidf.fit_transform(test_data['Recipe_Review'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf.get_feature_names_out())
data = pd.concat([data.drop('Recipe_Review', axis=1), tfidf_df], axis=1)

In [None]:
X=data.drop('Rating', axis=1)
y=data['Rating']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_model = XGBClassifier(objective='multi:softmax', num_class=6, eval_metric='mlogloss', seed=42)
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_val)
accuracy=accuracy_score(y_val, y_pred)
print("Validation Accuracy", accuracy)

In [None]:
test_data.drop(['ID', 'RecipeCode', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp', 'RecipeName'], axis=1, inplace=True)

test_data.fillna(0, inplace=True)


test_data['Recipe_Review'] = test_data['Recipe_Review'].astype(str)

tfidf_test = tfidf.transform(test_data['Recipe_Review'])
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=tfidf.get_feature_names_out())
test_data = pd.concat([test_data, tfidf_test_df], axis=1)
test_data.drop('Recipe_Review', axis=1, inplace=True)

In [None]:
test_predictions = xgb_model.predict(test_data)


In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(test_predictions)+1)]
#submission['Rating'] = test_predictions
#submission.to_csv('submission.csv', index=False)

**TRYING ENSEMBLING RANDOM FOREST AND XGBC**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [None]:
data=pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')
test_data = pd.read_csv("/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv")

In [None]:
data.head(2)

In [None]:
test_data.head(2)

In [None]:
data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

In [None]:
data.drop(['ID', 'RecipeCode', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp', 'RecipeName'], axis=1, inplace=True)
test_data.drop(['ID', 'RecipeCode', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp', 'RecipeName'], axis=1, inplace=True)

In [None]:
tfidf = TfidfVectorizer(max_features=1000)
data['Recipe_Review'] = data['Recipe_Review'].astype(str)
test_data['Recipe_Review'] = data['Recipe_Review'].astype(str)

In [None]:
cattf = pd.concat([data['Recipe_Review'], test_data['Recipe_Review']], axis=0)
tfidf.fit(cattf)

In [None]:
tfidf_train = tfidf.transform(data['Recipe_Review'])
tfidf_test = tfidf.transform(test_data['Recipe_Review'])

In [None]:
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf.get_feature_names_out())
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=tfidf.get_feature_names_out())


In [None]:
X_train_tfidf = pd.concat([data.drop('Recipe_Review', axis=1), tfidf_train_df], axis=1)
X_test_tfidf = pd.concat([test_data.drop('Recipe_Review', axis=1), tfidf_test_df], axis=1)

In [None]:
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=6, eval_metric='mlogloss', seed=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
X_train_tfidf = X_train_tfidf.drop('Rating', axis=1)

In [None]:
voting_classifier = VotingClassifier(estimators=[('xgb', xgb_classifier),('rf', rf_classifier)], voting='hard')

In [None]:
y = data['Rating']
voting_classifier.fit(X_train_tfidf, y)

In [None]:
test_predictions_voting = voting_classifier.predict(X_test_tfidf)


In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(test_predictions_voting)+1)]
#submission['Rating'] = test_predictions_voting
#submission.to_csv('submission.csv', index=False)

# TRYING INCLUDING CREATIONTIMESTAMP WITH LOG REG

using the train set

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from datetime import datetime

In [None]:
data = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')

In [None]:
data['CreationTimestamp'] = pd.to_datetime(data['CreationTimestamp'], unit='s')
data['Hour'] = data['CreationTimestamp'].dt.hour
data['Day'] = data['CreationTimestamp'].dt.dayofweek

In [None]:
data.fillna('', inplace=True)

In [None]:
tfidf = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf.fit_transform(data['Recipe_Review'])

In [None]:
X = pd.concat([data[['Hour', 'Day']], pd.DataFrame(tfidf_features.toarray())], axis=1)
y = data['Rating']

In [None]:
X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
logregpred = logreg.predict(X_test)

In [None]:
logregaccuracy = accuracy_score(y_test, logregpred)
print("Logistic Regression Accuracy:", logregaccuracy)

FOR LOGISTIC REGRESSION, THE ACCURACY SCORE IS COMING: 0.7679618768328446

on test set

In [None]:
test_data = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv')

In [None]:
test_data['CreationTimestamp'] = pd.to_datetime(test_data['CreationTimestamp'], unit='s')
test_data['Hour'] = test_data['CreationTimestamp'].dt.hour
test_data['Day'] = test_data['CreationTimestamp'].dt.dayofweek

In [None]:
test_data.fillna('', inplace=True)

In [None]:
tfidf_test= tfidf.transform(test_data['Recipe_Review'])

In [None]:
test_data = pd.concat([test_data[['Hour', 'Day']], pd.DataFrame(tfidf_test.toarray())], axis=1)
test_data.columns=test_data.columns.astype(str)

In [None]:
logregpred_test = logreg.predict(test_data)
print("Predictions for the test set:", logregpred_test)

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(logregpred_test)+1)]
#submission['Rating'] = logregpred_test
#submission.to_csv('submission.csv', index=False)

HENCE, FOR LOGREG, THE ACCURACY SCORE FOR TEST SET IS 0.77452

# applying the same on rfc

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [None]:
rfcpred = rfc.predict(X_test)

In [None]:
rfcaccuracy = accuracy_score(y_test, rfcpred)
print("Random Forest Classifier Accuracy:", rfcaccuracy)

FOR RFC, THE ACCURACY SCORE OF TRAINING IS COMING:  0.7617302052785924

In [None]:
rfcpred_test = rfc.predict(test_data)

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(rfcpred_test)+1)]
#submission['Rating'] = rfcpred_test
#submission.to_csv('submission.csv', index=False)

FOR RFC, THE ACCURACY SCORE FOR THE TEST SET IS 0.77034

# trying the same on xgboost

In [None]:
from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)


In [None]:
xgbpred = xgbc.predict(X_test)

In [None]:
xgbaccuracy = accuracy_score(y_test, xgbpred)
print("XGBoost Classifier Accuracy:", xgbaccuracy)

FOR XGBOOST, THE ACCURACY SCORE OF TRAINING IS COMING: 0.7635630498533724

In [None]:
xgbpred_test= xgbc.predict(test_data)

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(xgbpred_test)+1)]
#submission['Rating'] = xgbpred_test
#submission.to_csv('submission.csv', index=False)

HENCE, FOR XGBOOST, THE ACCURACY SCORE FOR THE TEST SET IS 0.77188

## TRYING THE SAME ON KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knnc = KNeighborsClassifier()
knnc.fit(X_train, y_train)

In [None]:
knnpred = knnc.predict(X_test)

In [None]:
knnaccuracy = accuracy_score(y_test, knnpred)
print("KNN Classifier Accuracy:", knnaccuracy)

FOR KNN, THE ACCURACY SCORE OF TRAINING IS COMING: 0.5711143695014663



In [None]:
knnpred_test= knnc.predict(test_data)

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(knnpred_test)+1)]
#submission['Rating'] = knnpred_test
#submission.to_csv('submission.csv', index=False)

## TRYING THE SAME ON SVM

In [None]:
from sklearn.svm import SVC
svmc = SVC()
svmc.fit(X_train, y_train)

In [None]:
svmcpred = svmc.predict(X_test)

In [None]:
svmcaccuracy = accuracy_score(y_test, svmcpred)
print("SVM Classifier Accuracy:", svmcaccuracy)

FOR SVC, THE ACCURACY SCORE OF TRAINING IS COMING: 0.7554985337243402

In [None]:
svmcpred_test= svmc.predict(test_data)

In [None]:
#submission = pd.DataFrame(columns=['ID', 'Rating'])
#submission['ID'] = [__ for __ in range(1, len(svmcpred_test)+1)]
#submission['Rating'] = svmcpred_test
#submission.to_csv('submission.csv', index=False)