In [2]:
!pip install geopy pandas numpy scikit-learn



In [3]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.decomposition import NMF
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("dataset_TSMC2014_NYC.csv")

df = df.dropna(subset=["userId", "venueId", "latitude", "longitude"])
df["utcTimestamp"] = pd.to_datetime(df["utcTimestamp"], errors="coerce")

df = df[["userId", "venueId", "venueCategory", "latitude", "longitude", "utcTimestamp"]]

# Add implicit label (user visited = 1)
df["label"] = 1

user_enc = LabelEncoder()
venue_enc = LabelEncoder()

df["userId_enc"] = user_enc.fit_transform(df["userId"])
df["venueId_enc"] = venue_enc.fit_transform(df["venueId"])


interaction_matrix = (
    df.groupby(["userId_enc", "venueId_enc"])["label"]
      .sum().unstack(fill_value=0)
)

nmf = NMF(n_components=30, random_state=42)
user_features = nmf.fit_transform(interaction_matrix)
venue_features = nmf.components_.T

print("✅ Model training complete using NMF!")

def recommend_for_coordinates(user_original_id, coords_list, radius_m=500, top_n=5):
    """
    Given a user and list of coordinate points, recommend top-N venues nearby each.
    """
    if user_original_id not in user_enc.classes_:
        raise ValueError("User not found in training data.")

    user_id = user_enc.transform([user_original_id])[0]
    user_vector = user_features[user_id]

    candidates = df[["venueId", "venueId_enc", "venueCategory", "latitude", "longitude"]].drop_duplicates()

    results = []
    for lat, lon in coords_list:
        candidates["distance_m"] = candidates.apply(
            lambda r: geodesic((lat, lon), (r.latitude, r.longitude)).meters, axis=1
        )

        nearby = candidates[candidates["distance_m"] < radius_m].copy()
        if nearby.empty:
            continue


        nearby["score"] = nearby["venueId_enc"].apply(
            lambda v: np.dot(user_vector, venue_features[v])
        )


        top = nearby.sort_values("score", ascending=False).head(top_n)
        top["input_lat"] = lat
        top["input_lon"] = lon
        top["rank"] = np.arange(1, len(top) + 1)

        results.append(top[[
            "input_lat", "input_lon", "rank", "venueId", "venueCategory", "distance_m", "score"
        ]])

    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

test_user = df["userId"].iloc[0]
test_coords = [
    (40.7580, -73.9855),  # Times Square
    (40.7306, -73.9866),  # Greenwich Village
    (40.7060, -74.0086)   # Wall Street
]

recommendations_df = recommend_for_coordinates(test_user, test_coords, radius_m=800, top_n=5)

print("Top Recommended Points of Interest per Coordinate:")
print(recommendations_df)


  df["utcTimestamp"] = pd.to_datetime(df["utcTimestamp"], errors="coerce")


✅ Model training complete using NMF!
Top Recommended Points of Interest per Coordinate:
    input_lat  input_lon  rank                   venueId  \
0     40.7580   -73.9855     1  49b7ed6df964a52030531fe3   
1     40.7580   -73.9855     2  4f93f1c8e5e828f50a2b81d1   
2     40.7580   -73.9855     3  439c437bf964a520f02b1fe3   
3     40.7580   -73.9855     4  439c437bf964a520f02b1fe3   
4     40.7580   -73.9855     5  4840fe6bf964a52030501fe3   
5     40.7306   -73.9866     1  3fd66200f964a520def11ee3   
6     40.7306   -73.9866     2  4ad8add6f964a520a61321e3   
7     40.7306   -73.9866     3  4ad8add6f964a520a61321e3   
8     40.7306   -73.9866     4  4a8cc1d4f964a520130f20e3   
9     40.7306   -73.9866     5  4a8cc1d4f964a520130f20e3   
10    40.7060   -74.0086     1  3fd66200f964a520daf11ee3   
11    40.7060   -74.0086     2  4ad78220f964a520960b21e3   
12    40.7060   -74.0086     3  4c9f62d503133704f79471d5   
13    40.7060   -74.0086     4  4ddc09e71838306b81fd524f   
14    40.706

In [5]:
df = pd.read_csv("dataset_TSMC2014_NYC.csv")

df = df.dropna(subset=["userId", "venueId", "latitude", "longitude"])
df["utcTimestamp"] = pd.to_datetime(df["utcTimestamp"], errors = "coerce")
df = df.sort_values(["userId", "utcTimestamp"])

user_encode = LabelEncoder()
venue_encode= LabelEncoder()

df["userId_enc"] = user_encode.fit_transform(df["userId"])
df["venueId_enc"] = venue_encode.fit_transform(df["venueId"])

user_sequences = df.groupby("userId_enc")["venueId_enc"].apply(list)


  df["utcTimestamp"] = pd.to_datetime(df["utcTimestamp"], errors = "coerce")


In [6]:
X = []
y = []

for sequence in user_sequences:
  for i in range(1, len(sequence)):
    X.append(sequence[:i])
    y.append(sequence[i])


In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [8]:
max_len = max(len(s) for s in X)
X_pad = pad_sequences(X, maxlen=max_len, padding="pre")
y_cat = to_categorical(y, num_classes=len(venue_encode.classes_))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

num_values = len(venue_encode.classes_)
embeddings = 64

model = Sequential([
    Embedding(input_dim=num_values, output_dim=embeddings, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dense(num_values, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

model.fit(X_pad, y_cat, batch_size=64, epochs=10, validation_split=0.1)


