In [13]:
import pandas as pd

# Load the dataset
df_train = pd.read_excel("ML_Py_Learn.xlsx")

# Display the first few rows of the dataset
df_train.head()


Unnamed: 0,Grupa 1,Grupa 2,Tytul,Tytul Url,Review
0,Attractions & Activities,Cathedral,Milan: Cathedral and Rooftop Ticket,https://www.getyourguide.com/milan-l139/duomo-...,18029
1,Attractions & Activities,Leonardo da Vinci,Milan: Da Vinci's Last Supper Guided Tour,https://www.getyourguide.com/last-supper-l4955...,5175
2,Outside the City,Como,"From Milan: Lake Como, Bellagio, and Varenna o...",https://www.getyourguide.com/milan-l139/lake-c...,5108
3,Attractions & Activities,Leonardo da Vinci,Milan Half-Day Tour Including da Vinci's 'Last...,https://www.viator.com/tours/Milan/Milan-Half-...,3692
4,Attractions & Activities,Cathedral,Milan: Fast-Track Milan Cathedral and Terraces...,https://www.getyourguide.com/milan-cathedral-l...,3023


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
MAX_NUM_WORDS = 10000  # Max number of words in the vocabulary
# Fill missing values in the Tytul column with a placeholder
df_train['Tytul'].fillna("unknown", inplace=True)

# TF-IDF Vectorization for the Tytul column
vectorizer = TfidfVectorizer(max_features=MAX_NUM_WORDS)
X_tfidf = vectorizer.fit_transform(df_train['Tytul'])
# Reinitialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Binarize the Grupa 1 and Grupa 2 columns
y = mlb.fit_transform(df_train[['Grupa 1', 'Grupa 2']].values)

# Split the data into training and validation sets
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42)

X_train_tfidf.shape, X_val_tfidf.shape, y_train.shape, y_val.shape
# Constants


((375, 697), (94, 697), (375, 90), (94, 90))

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Initialize the MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Train the model
mlp.fit(X_train_tfidf, y_train)

# Predict on the validation set
y_pred = mlp.predict(X_val_tfidf)

# Calculate accuracy for each of the output (Grupa 1 and Grupa 2)
accuracy_group1 = accuracy_score(y_val[:, 0], y_pred[:, 0])
accuracy_group2 = accuracy_score(y_val[:, 1], y_pred[:, 1])

accuracy_group1, accuracy_group2




(1.0, 1.0)

In [16]:
# Load the test dataset
df_test = pd.read_excel("ML_Py_Test.xlsx")

# Display the first few rows of the test dataset
df_test.head()

Unnamed: 0,Grupa 1,Grupa 2,Tytul,Tytul Url,Review,City
0,Outside the City,Combined Tours,"Tuscany Day: Siena, San Gimignano & Monteriggi...",https://www.viator.com/tours/Florence/Full-Day...,725,Florance
1,Attractions & Activities,Accademia,Florence Uffizi Gallery Semi-Private Guided Tour,https://www.viator.com/tours/Florence/Skip-the...,712,Florance
2,Attractions & Activities,Uffizi,Skip the Line: Florence's Accademia Gallery Pr...,https://www.viator.com/tours/Florence/Florence...,712,Florance
3,Attractions & Activities,Food Tours,Tuscan Cooking Class and Dinner in Florence,https://www.viator.com/tours/Florence/Tuscan-C...,699,Florance
4,Attractions & Activities,Scooter,Small-Group Tuscany By Vespa,https://www.viator.com/tours/Florence/Small-Gr...,690,Florance


In [17]:
# Fill missing values in the Tytul column with a placeholder
df_test['Tytul'].fillna("unknown", inplace=True)

# Preprocess the Tytul column for the test dataset
X_test_tfidf = vectorizer.transform(df_test['Tytul'])

# Binarize the actual Grupa 1 and Grupa 2 values for comparison
y_test = mlb.transform(df_test[['Grupa 1', 'Grupa 2']].values)

# Make predictions using the trained model
y_test_pred = mlp.predict(X_test_tfidf)

# Calculate accuracy for each of the outputs (Grupa 1 and Grupa 2)
accuracy_test_group1 = accuracy_score(y_test[:, 0], y_test_pred[:, 0])
accuracy_test_group2 = accuracy_score(y_test[:, 1], y_test_pred[:, 1])

accuracy_test_group1, accuracy_test_group2




(0.9919354838709677, 0.9959677419354839)

In [18]:
# Load the new dataset
df_to_do = pd.read_excel("ML_Py_ToDo.xlsx")

# Check for missing values in the Tytul column and fill them
df_to_do['Tytul'].fillna("unknown", inplace=True)

# Preprocess the Tytul column using the trained TF-IDF vectorizer
X_to_do_tfidf = vectorizer.transform(df_to_do['Tytul'])

# Make predictions using the trained model
y_to_do_pred = mlp.predict(X_to_do_tfidf)

# Convert numerical predictions back to original category labels
predicted_labels = mlb.inverse_transform(y_to_do_pred)
# Extract predicted labels while handling potential missing predictions
df_to_do['Predicted Grupa 1'] = [labels[0] if len(labels) > 0 else "Unknown" for labels in predicted_labels]
df_to_do['Predicted Grupa 2'] = [labels[1] if len(labels) > 1 else "Unknown" for labels in predicted_labels]

df_to_do[['Tytul', 'Predicted Grupa 1', 'Predicted Grupa 2']].head()

Unnamed: 0,Tytul,Predicted Grupa 1,Predicted Grupa 2
0,1 Hour Private Veluba - Historic Tour of London,City Sightseeing,Walking City Tour
1,household cavalry museum entrance ticket in lo...,Unknown,Unknown
2,moulin rouge the musical entrance ticket in lo...,City Sightseeing,Unknown
3,Private Transfer Arrival or Departure Heathrow...,Transfers,Unknown
4,Private Tour of London's Landmarks in a Classi...,City Sightseeing,Unknown


In [20]:
df_to_do.to_excel('ML_processed.xlsx')

Unnamed: 0,Group,Group_2,Tytul,Tytul Url,City,Operators,Review,Predicted Grupa 1,Predicted Grupa 2
1,,,household cavalry museum entrance ticket in lo...,https://www.viator.com/tours/london/household-...,,,,Unknown,Unknown
7,,,the crown: a royal tour & boat ride,https://www.viator.com/tours/london/magic-of-f...,,,,Unknown,Unknown
11,,,London to Southampton Cruise Terminals Private...,https://www.viator.com/tours/London/Private-Ve...,,,,Unknown,Unknown
16,,,Private Arrival Transfer: Heathrow Airport LHR...,https://www.viator.com/tours/London/Private-Ar...,,,,Unknown,Unknown
17,,,"Stonehenge Inner Circle, Bath and Lacock Tour ...",https://www.viator.com/tours/London/Private-Vi...,,,,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...
3432,,,Executive Transfer Central London to Dover Port,https://www.getyourguide.com/london-l57/execut...,,,,Unknown,Unknown
3437,,,Richmond: Paddleboard Experience,https://www.getyourguide.com/richmond-uk-l1557...,,,,Unknown,Unknown
3441,,,Festive London Christmas Day River Thames Lunc...,https://www.getyourguide.com/london-l57/london...,,,,Unknown,Unknown
3442,,,Windsor: Beginner Stand-Up Paddle Board Lesson,https://www.getyourguide.com/egham-l151220/win...,,,,Unknown,Unknown


In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
# Reload the dataset
data = pd.read_excel("ML_Py_Learn.xlsx")

# Remove rows with missing values in the "Tytul" column
data_cleaned = data.dropna(subset=['Tytul'])

# Convert 'Tytul' column into TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_NUM_WORDS)
X_tfidf = tfidf_vectorizer.fit_transform(data_cleaned['Tytul'])

# Label encoding for 'Grupa 1' and 'Grupa 2'
label_encoder_1 = LabelEncoder()
encoded_group1 = label_encoder_1.fit_transform(data_cleaned['Grupa 1'])

label_encoder_2 = LabelEncoder()
encoded_group2 = label_encoder_2.fit_transform(data_cleaned['Grupa 2'])

Y = pd.concat([pd.Series(encoded_group1), pd.Series(encoded_group2)], axis=1)

# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_tfidf, Y, test_size=0.2, random_state=42)

X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

# Define the base classifier
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Wrap it within a MultiOutputClassifier
multi_target_classifier = MultiOutputClassifier(base_classifier, n_jobs=-1)

# Train the multi-output classifier
multi_target_classifier.fit(X_train, Y_train)

# Check the training accuracy
training_accuracy = multi_target_classifier.score(X_train, Y_train)
validation_accuracy = multi_target_classifier.score(X_val, Y_val)

training_accuracy, validation_accuracy


(0.9946380697050938, 0.46808510638297873)

In [33]:
# Load the new dataset
to_do_data = pd.read_excel("ML_Py_ToDo_Porto.xlsx")

# Ensure there are no missing values in the "Tytul" column
to_do_data_cleaned = to_do_data.dropna(subset=['Tytul'])

# Convert 'Tytul' column in the new dataset into TF-IDF vectors (using the trained vectorizer)
X_to_do = tfidf_vectorizer.transform(to_do_data_cleaned['Tytul'])

# Make predictions using the trained model
predicted_values_to_do = multi_target_classifier.predict(X_to_do)
predicted_group1_to_do = label_encoder_1.inverse_transform(predicted_values_to_do[:, 0])
predicted_group2_to_do = label_encoder_2.inverse_transform(predicted_values_to_do[:, 1])

# Add the predictions to the new dataset
to_do_data_cleaned['Predicted Grupa 1'] = predicted_group1_to_do
to_do_data_cleaned['Predicted Grupa 2'] = predicted_group2_to_do

to_do_data_cleaned[['Tytul', 'Predicted Grupa 1', 'Predicted Grupa 2']].head()


Unnamed: 0,Tytul,Predicted Grupa 1,Predicted Grupa 2
0,"From Porto: Douro Valley w/ Boat Tour, Wine Ta...",Outside the City,Food Tours
1,"Porto: Hop-On Hop-Off Bus, River Cruise, & Por...",City Sightseeing,Hop-On Hop-off
2,Douro Valley Small-Group Tour with Wine Tastin...,Attractions & Activities,Food Tours
3,"Porto Card with Transportation (1, 2, 3 or 4 D...",Attractions & Activities,Art&Performance
4,"Porto: City Train Tour, River Cruise & Wine Ce...",Attractions & Activities,Boat Tour


In [34]:
to_do_data_cleaned.to_excel('test.xlsx')