In [None]:
# import libraries
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as smb
from scipy import stats
from scipy.stats import binned_statistic
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [None]:
# Load the raw recipe table, then report missing values per column and the frame size.
df = pd.read_csv("data/recipes.csv")
missing_per_column = df.isna().sum()
print(missing_per_column)
print(df.shape)

In [None]:
# Flag rows where every nutrient column is exactly zero.
nutrient_columns = [
    "Calories",
    "CholesterolContent",
    "ProteinContent",
    "CarbohydrateContent",
    "SugarContent",
]
zero_observations = df[nutrient_columns].eq(0).all(axis=1)

# Count the flagged rows straight from the mask. DataFrame.value_counts() skips
# rows that contain NaN in any column, so summing it can under-report the total.
int(zero_observations.sum())

In [None]:
# Show the rows that have no RecipeCategory value.
df.loc[df["RecipeCategory"].isna()]

In [None]:
# Drop the Name and RecipeId columns. errors="ignore" keeps the cell re-runnable:
# without it a second run raises KeyError because the columns are already gone.
df.drop(columns=["Name", "RecipeId"], errors="ignore", inplace=True)

# Remove rows without a RecipeCategory. `subset` is passed as a list because
# older pandas releases do not accept a bare label there.
df.dropna(subset=["RecipeCategory"], inplace=True)

# Remaining missing values per column.
df.isnull().sum()

In [None]:
# zero_observations comes from an earlier cell and df has had rows dropped since,
# so the mask's index no longer matches df. Align it to the current index
# explicitly instead of relying on pandas' implicit boolean reindexing.
zero_mask_now = zero_observations.reindex(df.index, fill_value=False).to_numpy()
df.drop(index=df.index[zero_mask_now], inplace=True)

# Sanity check: number of flagged rows still present (expected to be 0).
print(int(zero_observations.reindex(df.index, fill_value=False).sum()))
df.shape

In [None]:
# Distinct recipe categories left in the frame.
df["RecipeCategory"].unique()

In [None]:
# Summary statistics for df (pandas' default describe() output).
df.describe()

In [None]:
# Copy of df without the category, target and servings columns.
numeric_data = df.drop(columns=["RecipeCategory", "HighScore", "RecipeServings"])
def box_hist_plots(data):
    """Draw a box plot (top) and a 100-bin histogram (bottom) of one column.

    Parameters
    ----------
    data : pandas.Series
        Values to plot. When the series has a name it is used in the figure
        title.

    Returns
    -------
    None
        The figure is left open on the current matplotlib backend.
    """
    fig, (ax_box, ax_hist) = plt.subplots(2, 1)

    series_name = getattr(data, "name", None)
    if series_name is not None:
        fig.suptitle("SPREAD OF DATA FOR " + str(series_name).upper())

    # Pass the values through `x` only. Supplying the same Series as `data` as
    # well is redundant, and newer seaborn releases reject a Series there
    # (they expect a DataFrame or mapping when `x` is given).
    sns.boxplot(x=data, ax=ax_box)
    sns.histplot(x=data, bins=100, ax=ax_hist)

# One box/histogram figure per numeric column, read from the current df.
numeric_columns = numeric_data.columns
for column_name in numeric_columns:
    box_hist_plots(df[column_name])

In [None]:
def qqplot(data):
    """Draw a Q-Q plot of one column and title it with the column name.

    Parameters
    ----------
    data : pandas.Series
        Values to plot; ``data.name`` is upper-cased for the title.
    """
    title_name = str(data.name).upper()
    # statsmodels.api is imported as `smb` in this notebook; the previous `sm`
    # alias was never defined and raised NameError.
    fig = smb.qqplot(data)
    fig.suptitle("QQPLOT FOR " + title_name)
    
# One Q-Q plot per numeric column, read from the current df.
numeric_columns = numeric_data.columns
for column_name in numeric_columns:
    qqplot(df[column_name])

In [None]:
def _shifted_boxcox(values):
    """Box-Cox transform of ``values + 1``; the fitted lambda is discarded."""
    transformed, _ = stats.boxcox(values + 1)
    return transformed


def _shifted_log(values):
    """Natural log of ``values + 1``."""
    return np.log(values + 1)


# Column -> transform, applied in place on df in the order listed.
# NOTE(review): re-running this cell transforms the already-transformed columns again.
column_transforms = {
    "Calories": _shifted_boxcox,
    "ProteinContent": _shifted_log,
    "SugarContent": _shifted_log,
    "CarbohydrateContent": _shifted_boxcox,
    "CholesterolContent": _shifted_boxcox,
    "RecipeServings": _shifted_boxcox,
}
for column_name, transform in column_transforms.items():
    df[column_name] = transform(df[column_name])

In [None]:
# Scatter plot with fitted regression line: Calories against ProteinContent.
sns.regplot(data=df, x="ProteinContent", y="Calories")
plt.show()

In [None]:
def outlier(data, lower=0.25, upper=0.75):
    """Return the rows of ``data`` that lie strictly between two quantiles.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to filter. For a DataFrame the quantiles are taken per column
        and a row is kept only if every column is inside its own bounds.
    lower, upper : float, optional
        Quantile levels of the exclusive lower and upper bounds. The defaults
        (0.25 and 0.75) are the bounds this function has always used.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The filtered rows of ``data`` with their original index labels.
    """
    q_low = data.quantile(lower)
    q_high = data.quantile(upper)

    within = (data > q_low) & (data < q_high)
    if isinstance(within, pd.DataFrame):
        # Collapse the per-column flags to one flag per row.
        within = within.all(axis=1)
    return data.loc[within]


# The function used to build its result in a local variable and return nothing,
# so df_filtered was never defined for the cells that plot it. Keep the result.
# NOTE(review): with the default quartile bounds applied to every numeric column
# at once, few rows may remain - confirm the bounds are what is wanted.
df_filtered = outlier(df[numeric_columns])
sns.boxplot(x=df_filtered['CarbohydrateContent'])

In [None]:
# Box/histogram figure for each numeric column of the filtered frame.
for column_name in numeric_columns:
    box_hist_plots(df_filtered[column_name])

In [None]:
# Q-Q plot for each numeric column of the filtered frame.
for column_name in numeric_columns:
    qqplot(df_filtered[column_name])

In [None]:
# 70/30 split, stratified on the target, with a fixed seed.
# NOTE(review): X and y are assigned in a cell further down this notebook, so on a
# fresh kernel this cell fails unless that cell has been run first.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

In [None]:
# Logistic regression with fixed per-class weights for labels 0 and 1.
# (A LinearSVC with the same weights and random_state=42 was left disabled here.)
class_weights = {0: 0.668, 1: 0.332}
model = LogisticRegression(class_weight=class_weights)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out split, reported as a percentage with two decimals.
score = model.score(X_test, y_test)
accuracy_pct = round(score * 100, 2)
print(f'Model accuracy of: % {accuracy_pct}')

In [None]:
# Predicted labels for the held-out split; used for the confusion matrix below.
predictions = model.predict(X_test)

In [None]:
# Confusion matrix of true labels against predicted labels on the held-out split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
print(conf_matrix)

In [None]:
# Unpack the four cells: the first index is the true label, the second the prediction.
TN, FP = conf_matrix[0, :2]
FN, TP = conf_matrix[1, :2]


In [None]:
# Specificity: share of true class-0 rows predicted as class 0, printed as a percentage.
sp = TN / (FP + TN)
print(100 * sp)

In [None]:
# Sensitivity: share of true class-1 rows predicted as class 1, printed as a percentage.
sn = TP / (FN + TP)
print(100 * sn)

In [None]:
# Print the class counts of the target and draw them as a histogram.
target_counts = df["HighScore"].value_counts()
print(f'Value count of target variable: \n{target_counts}')

sns.set_style(style='darkgrid')
sns.histplot(df['HighScore'])
plt.title('Value Count Of Target Variable')
plt.show()

In [None]:
# Distinct recipe categories in the current frame.
df["RecipeCategory"].unique()

In [None]:
# Fit a RobustScaler on numeric_data and keep the scaled values as a plain array.
scaler = RobustScaler()
scaled = scaler.fit_transform(numeric_data.to_numpy())

In [None]:
# Wrap the scaled array in a frame with numeric_data's row labels and column names.
df_scaled = pd.DataFrame(
    data=scaled,
    columns=numeric_data.columns,
    index=numeric_data.index,
)

In [None]:
# Histogram of the scaled Calories column.
sns.histplot(data=df_scaled, x="Calories")
plt.show()

In [None]:
# print(df['Calories'].value_counts(bins = [0,40,100,400], sort = False))
from scipy.stats import binned_statistic
x_data = np.arange(0, len(df))
y_data = df['Calories']
x_bins,bin_edges, misc = binned_statistic(y_data,x_data, statistic="median", bins=[0, 40, 100, 400, 800, 1600, 3200, 31000])
x_data2 = np.arange(0, len(df))
y_data2 = df['ProteinContent']
x_bins2,bin_edges2, misc2 = binned_statistic(y_data2,x_data2, statistic="median", bins=[0, 40, 100, 400, 800, 1600])

In [None]:
# Turn each run of consecutive bin edges into an IntervalIndex (one interval per bin).
bin_intervals = pd.IntervalIndex.from_arrays(left=bin_edges[:-1], right=bin_edges[1:])
bin_intervals2 = pd.IntervalIndex.from_arrays(left=bin_edges2[:-1], right=bin_edges2[1:])

In [None]:
def set_to_median(x, bin_intervals):
    """Return the midpoint of the first interval that contains ``x``.

    Despite the name, the value returned is ``interval.mid`` (the centre of the
    bin), not a median of the data. ``None`` is returned when ``x`` falls in
    none of the intervals.
    """
    return next((interval.mid for interval in bin_intervals if x in interval), None)

In [None]:
# Replace each value with the midpoint of the bin it falls into (None/NaN when
# it is outside every bin). Each column uses its own intervals: ProteinContent
# was previously mapped through the Calories bins by mistake.
df['Calories'] = df['Calories'].apply(set_to_median, args=(bin_intervals,))
df['ProteinContent'] = df['ProteinContent'].apply(set_to_median, args=(bin_intervals2,))

In [None]:
# Regression plot of ProteinContent against Calories. x and y are passed by
# keyword: seaborn 0.12+ no longer accepts them positionally.
sns.regplot(x='Calories', y='ProteinContent', data=df)
plt.show()

In [None]:
# Overlay the Calories column and a 'sampled_calories' column on one line plot.
# NOTE(review): no visible cell creates df['sampled_calories']; this raises
# KeyError unless that column is added elsewhere - confirm or remove the cell.
plt.plot(df['Calories'], label='original')
plt.plot(df['sampled_calories'], color='red', label='sampled')
plt.legend()
plt.show()

In [None]:
#Outlier Treatment
for i in range(len(numeric_columns)):
    box_plots(df[numeric_columns[i]])

In [None]:
Every feature has many outliers, but they cannot all be removed. Outliers will instead be removed based on their distance from other outliers: a value with no other outliers near it will be removed. The IQR rule cannot be used because it would remove many observations that may carry important information.

In [None]:
# Twenty largest Calories values, highest first.
# The same check can be repeated for CholesterolContent, CarbohydrateContent,
# SugarContent, ProteinContent and RecipeServings by swapping the column name.
top_calories = df['Calories'].sort_values(ascending=False).head(20)
print(top_calories)

In [None]:
# Show the first eight columns of the row at integer position 38735.
# NOTE(review): the position is hard-coded and shifts whenever earlier cells drop
# rows - confirm it still points at the intended observation.
df.iloc[38735, 0:8] 

In [None]:
# index_cals = df[df['Calories'] >= 25].index
# df.drop(index_cals, inplace=True) #Removed 1 observation (38) which is 7 points away 

# index_cholesterol = df[df['CholesterolContent'] >= .2].index
# df.drop(index_cholesterol, inplace=True) #Removed 1 observation (26) which is 5 points away

# index_carbs = df[df['CarbohydrateContent'] >= 27].index
# df.drop(index_carbs, inplace=True) #Removed 3 observations (60, 42, 40) 40 is 6 points away

# index_sugar = df[df['SugarContent'] >= 22].index
# df.drop(index_sugar, inplace=True) #Removed 2 observations (29, 29) which is 3 points away

# index_protein = df[df['ProteinContent'] >= 15].index
# df.drop(index_protein, inplace=True) # Removed 5 observations (33, 22, 20, 19, 19) 19 is 3 away

# index_rs = df[df['RecipeServings'] >= 13].index
# df.drop(index_rs, inplace=True) #Removed 4 observations(181, 59, 20, 20) 20 is 5 away

In [None]:
# Inclusive upper cut-offs, applied one column at a time in the order listed:
# every row whose value is at or above the cut-off is dropped from df in place.
# NOTE(review): the author's notes recorded how many rows each rule removed, but
# those counts could not be verified against the current thresholds.
upper_cutoffs = {
    'Calories': 27,
    'CholesterolContent': 0.2,
    'CarbohydrateContent': 27,
    'SugarContent': 21,
    'ProteinContent': 15,
    'RecipeServings': 13,
}
for column_name, cutoff in upper_cutoffs.items():
    rows_over_cutoff = df[df[column_name] >= cutoff].index
    df.drop(rows_over_cutoff, inplace=True)

In [None]:
#### Class imbalance

In [None]:
# Separate the features from the target variable.
# NOTE(review): X keeps every column except HighScore, including RecipeCategory -
# confirm the downstream estimator can handle that column.
X = df.drop(['HighScore'], axis=1)
y = df['HighScore']

print(f'Original target data count: \n{y.value_counts()}')
# Perform random over-sampling with replacement (fixed seed 200).
# NOTE(review): RandomOverSampler is not imported in any visible cell - presumably
# imblearn.over_sampling.RandomOverSampler; add it to the import cell once confirmed.
over_sample = RandomOverSampler(sampling_strategy='all', random_state=200)
X_over, y_over = over_sample.fit_resample(X, y)
print(f'Target data count after sampling: \n{Counter(y_over)}')

In [None]:
# Rebuild df from the resampled features plus the resampled target.
# assign() returns a new frame; the earlier `df = X_over` only aliased X_over,
# so adding HighScore also wrote the target into the feature matrix.
df = X_over.assign(HighScore=y_over)
print(f'Dataframe shape after balancing class: {df.shape}')

In [None]:
# Rule 1 (compound): single-serving recipes with 1500 or more calories.
single_serving_high_cal = (df['Calories'] >= 1500) & (df['RecipeServings'] == 1)
df.drop(df[single_serving_high_cal].index, inplace=True)

# Remaining rules: inclusive per-column upper cut-offs, applied in this order.
column_cutoffs = [
    ('Calories', 9900),
    ('CholesterolContent', 2),
    ('CarbohydrateContent', 500),
    ('SugarContent', 500),
    ('ProteinContent', 200),
    ('RecipeServings', 100),
]
for column_name, cutoff in column_cutoffs:
    rows_over_cutoff = df[df[column_name] >= cutoff].index
    df.drop(rows_over_cutoff, inplace=True)

In [None]:
# Check skew value of numeric features
print('Calories skew value: ', df['Calories'].skew())
print('Carbs skew value: ',df['CarbohydrateContent'].skew())
print('Proteins skew value: ',df['ProteinContent'].skew())
print('Cholesterols skew value: ',df['CholesterolContent'].skew())
print('Sugars skew value: ',df['SugarContent'].skew())
print('Recipe Servings skew value: ',df['RecipeServings'].skew())

In [None]:
# # Check skew values again after transformation
# print('Calories skew value: ', df['Calories'].skew()) 
# print('Carbs skew value: ',df['CarbohydrateContent'].skew()) 
# print('Proteins skew value: ',df['ProteinContent'].skew()) 
# print('Cholesterols skew value: ',df['CholesterolContent'].skew()) 
# print('Sugars skew value: ',df['SugarContent'].skew()) 
# print('Recipe Servings skew value: ',df['RecipeServings'].skew()) 