In [13]:
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.decomposition import PCA


In [14]:
file_path = Path("./data/recipes_sample.csv")
df = pd.read_csv(file_path)

In [15]:
# df=df.drop(df.index[100:])

In [16]:
#dataset cleanup
df.dropna(subset=["RecipeId", "AggregatedRating"],inplace=True)

In [17]:
ingredients = df["RecipeIngredientParts"]

#splitting ingredients to separate columns
ingredients = ingredients.str.replace(" ", "\"")
temp =ingredients.str.split("\"",expand=True)
dummies=pd.get_dummies(temp,dtype=int)
s=dummies.columns.str.lstrip('0123456789_')
s=s.to_frame().drop_duplicates()
# s.drop(s.index[0:4],inplace=True)
s.rename(columns={0:"ingredients list"},inplace=True)
s


Unnamed: 0,ingredients list
,
c(,c(
character(0),character(0)
Absolut,Absolut
Baileys,Baileys
...,...
baguettes,baguettes
pico,pico
gallo,gallo
gumbo,gumbo


In [18]:
keywords = df["Keywords"]

#splitting keywords to separate columns
keywords = keywords.str.replace(" ", "\"")
temp =keywords.str.split("\"",expand=True)
dummiesk=pd.get_dummies(temp,dtype=int)
sk=dummiesk.columns.str.lstrip('0123456789_')
sk=sk.to_frame().drop_duplicates()
# sk.drop(sk.index[0:4],inplace=True)
sk.rename(columns={0:"keywords list"},inplace=True)
sk

Unnamed: 0,keywords list
,
c(,c(
<,<
African,African
Apple,Apple
...,...
Oysters,Oysters
Szechuan,Szechuan
Dairy,Dairy
Ramadan,Ramadan


In [19]:
# creating dummy tables for all ingredients
#scanning every column for duplicate named columns
for j in range(0,dummies.shape[1]):
    # progress counter
    if j%100==0:
        print(j)
    col_idx_drop=[]

    #merging duplicate columns
    for k in range(j+1,dummies.shape[1]):
        #column duplicate name check
        if (dummies.columns[j].lstrip('0123456789_')==dummies.columns[k].lstrip('0123456789_')):
            #merging duplicate columns
            dummies.iloc[:,j] = dummies.iloc[:,j]+dummies.iloc[:,k]
            col_idx_drop.append(k)
    #dropping the second duplicate named column
    dummies.drop(columns=dummies.columns[col_idx_drop],inplace=True)

for j in range(0,dummies.shape[1]):
    dummies.rename(columns={dummies.columns[j]:dummies.columns[j].lstrip('0123456789_')},inplace=True)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200


In [20]:
#same rationale applied for keywords, as for ingredients
for j in range(0,dummiesk.shape[1]):
    if j%100==0:
        print(j)
    col_idx_drop=[]
    
    for k in range(j+1,dummiesk.shape[1]):
        if (dummiesk.columns[j].lstrip('0123456789_')==dummiesk.columns[k].lstrip('0123456789_')):
            dummiesk.iloc[:,j] = dummiesk.iloc[:,j]+dummiesk.iloc[:,k]
            col_idx_drop.append(k)
  
    dummiesk.drop(columns=dummiesk.columns[col_idx_drop],inplace=True)

for j in range(0,dummiesk.shape[1]):
    dummiesk.rename(columns={dummiesk.columns[j]:dummiesk.columns[j].lstrip('0123456789_')},inplace=True)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700


In [21]:
#change to 1 every number greater than 1 on dummy table
dummies[dummies.iloc[:,:]>1]=1
dummiesk[dummiesk.iloc[:,:]>1]=1

#remove columes that are all 1
dummies = dummies.iloc[:,list(dummies.sum()<0.5*len(dummies))]
dummiesk = dummiesk.iloc[:,list(dummiesk.sum()<0.5*len(dummiesk))]

In [22]:
#create combined table and change the Rating column to int for the ML algorithm
df_combined = pd.concat([df['AggregatedRating'], dummies],axis=1)
df_combined["AggregatedRating"] = df_combined["AggregatedRating"]*2
df_combined.astype(int)

Unnamed: 0,AggregatedRating,character(0),Absolut,Baileys,Belgian,Bisquick,Bourbon,Burgundy,Classic,Cool,...,strong,bouquet,crabmeat,garni,aubergine,baguettes,pico,gallo,gumbo,file
0,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4993,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4994,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4999,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#export df_combined file for review
df_combined.to_csv('df_combined.csv')

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X=df_combined.drop(columns='AggregatedRating')
y=df_combined['AggregatedRating']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
classifier = LogisticRegression(random_state=1)
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")


Training Data Score: 0.7739583333333333
Testing Data Score: 0.6037441497659907


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
