In [54]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from pathlib import Path
from collections import Counter
import json
import nltk
import requests
from datetime import datetime, timedelta


nltk.download("punkt")
nltk.download("stopwords")
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

tf_idf = TfidfVectorizer(stop_words="english")


def encode_topics(df):
    # topics = df["topics"].str.get_dummies(sep=",")
    # topics = df["topics"].apply( topicfor topic in topics  )
    one_hot_encoded = (
        pd.get_dummies(df["topics"].apply(pd.Series).stack()).groupby(level=0).sum()
    )
    df = pd.concat([df, one_hot_encoded], axis=1)
    # print(df)
    return df


def set_index(df, index_column="poll_ID"):
    df.set_index(index_column, inplace=True)
    return df


def reset_index(df):
    df.reset_index()
    return df


def check_column_type(df, column_name, check_type):
    column_index = df.columns.get_loc(column_name)
    for i in range(len(df)):
        if not isinstance(df.iloc[i, column_index], check_type):
            print(
                f"error: {df.iloc[i, 0], df.iloc[i, 1],df.iloc[i, 2], df.iloc[i, 3], df.iloc[i, 4]}"
            )


def preprocess_text(text):
    tokens = nltk.tokenize.word_tokenize(text)
    # tokens = [word.lower() for word in tokens if type(word) is str]
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word not in string.punctuation]
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    processed_text = " ".join(tokens)

    return processed_text


def preprocess_list(field_list):
    ret_list = []
    stop_words = set(nltk.corpus.stopwords.words("english"))
    for item in field_list:
        tokens = nltk.tokenize.word_tokenize(item)
        # tokens = [word.lower() for word in tokens if type(word) is str]
        tokens = [word.lower() for word in tokens]
        tokens = [word for word in tokens if word not in string.punctuation]
        tokens = [word for word in tokens if word not in stop_words]
        processed_text = " ".join(tokens)
        ret_list.append(processed_text)

    return ret_list


def create_tf_idf_matrix(df, column):
    # print(f"{df[column]} is {df[column].dtype} and {df[column].dtype is list} {list}: ")
    df[column] = df[column].apply(lambda x: " ".join(x))
    df[column] = df[column].apply(preprocess_text)

    return tf_idf.fit_transform(df[column])


def create_souped_tf_idf_matrix(df):
    df["topics"] = df["topics"].apply(preprocess_list)
    df["question"] = df["question"].apply(preprocess_text)

    # Create a new soup feature
    df["soup"] = df.apply(create_soup, axis=1)

    return tf_idf.fit_transform(df["soup"])


def create_soup(df):
    res = (
        df["question"]
        + " "
        + " ".join(df["options"])
        + " "
        + (4 * (" " + " ".join(df["topics"])))
    )
    # print(f"-----------------------------------\n* Processing: [{ }]")
    return res


def calc_cosine_similarity_matrix(tf_idf_matrix_1, tf_idf_matrix_2):
    # if tf_idf_matrix_1 is not None and tf_idf_matrix_2 is not None:
    return cosine_similarity(tf_idf_matrix_1, tf_idf_matrix_2)


def id_to_index(df, search_id):
    result = df[df["id"] == str(search_id)].index.values[0]
    print(result)

    if len(result) > 0:
        return result
    else:
        return None


def id_to_index2(df, id):
    try:
        if any(df["id"] == str(id)):
            # df.to_csv("df.csv", index=False)
            # print(
            #    f"---------------\nFound {id} at {df[df['id'] == str(id)].index.values[0]}"
            # )
            # print(f"\nWhich is equal to:\n{df[df['id'] == str(id)]}")
            return df[df["id"] == str(id)].index.values[0]

    except IndexError as e:
        print(f"erorrrrrrrrrrrrr:")
        print(f"{str(id)}")
        print(f"{df['id']==str(id)}")


def title_from_idx(df, idx):
    return df[df.index == idx]


def gen_recommendations(
    index,
    df,
    cosine_similarity_matrix,
    number_of_recommendations,
):
    # index = idx_from_title(df, original_title)
    similarity_scores = list(enumerate(cosine_similarity_matrix[index]))
    similarity_scores_sorted = sorted(
        similarity_scores, key=lambda x: x[1], reverse=True
    )

    recommendations_indices = [
        t[0] for t in similarity_scores_sorted[1 : (number_of_recommendations + 1)]
    ]
    recommendations = list(df["title"].iloc[recommendations_indices])
    # print(recommendations)
    # print(similarity_scores_sorted, type(similarity_scores_sorted))
    # recommendations_indices = [
    #    t[0] for t in similarity_scores_sorted[1 : (number_of_recommendations + 1)]
    # ]
    # recommendations_scores = [
    #    t[1] for t in similarity_scores_sorted[1 : (number_of_recommendations + 1)]
    # ]
    # return (df["title"].iloc[recommendations_indices], recommendations_scores)

    return recommendations


def gen_rec_from_list_of_polls(
    interacted_polls,
    filtered_polls_df,
    cosine_similarity_matrix,
    number_of_recommendations,
):
    recommendations = []
    for poll_id in interacted_polls:
        index = id_to_index2(filtered_polls_df, poll_id)
        if index is not None:
            similarity_scores = list(enumerate(cosine_similarity_matrix[index]))
            similarity_scores_sorted = sorted(
                similarity_scores, key=lambda x: x[1], reverse=True
            )

            recommendations_indices = [
                t[0]
                for t in similarity_scores_sorted[1 : (number_of_recommendations + 1)]
            ]
            recs = list(filtered_polls_df["id"].iloc[recommendations_indices])

            # Filter out polls that have already been interacted with
            filtered_recs = [poll for poll in recs if poll not in interacted_polls]

            recommendations.append(filtered_recs)

        else:
            pass

        # index = id_to_index(polls, poll_id)
        # print(f"cosine_similarity_matrix:{len(cosine_similarity_matrix)}")
        # print(f"index:{index} | id:{poll_id}")

    flattened_recommendations = [
        item for sublist in recommendations for item in sublist
    ]
    flattened_recommendations = Counter(flattened_recommendations)
    n_most_recommended = flattened_recommendations.most_common(
        number_of_recommendations
    )
    n_most_recommended = [t[0] for t in n_most_recommended]
    # print(n_most_recommended)

    return n_most_recommended


def gen_rec_from_list_of_polls_df(
    interacted_polls,
    filtered_polls_df,
    cosine_similarity_matrix,
    number_of_recommendations,
):
    recommendations = []
    for poll_id in interacted_polls:
        index = id_to_index2(filtered_polls_df, poll_id)
        if index is not None:
            similarity_scores = list(enumerate(cosine_similarity_matrix[index]))
            similarity_scores_sorted = sorted(
                similarity_scores, key=lambda x: x[1], reverse=True
            )

            recommendations_indices = [
                t[0]
                for t in similarity_scores_sorted[1 : (number_of_recommendations + 1)]
            ]
            recs = list(filtered_polls_df["id"].iloc[recommendations_indices])

            # Filter out polls that have already been interacted with
            filtered_recs = [poll for poll in recs if poll not in interacted_polls]

            recommendations.append(filtered_recs)

        else:
            pass

        # index = id_to_index(polls, poll_id)
        # print(f"cosine_similarity_matrix:{len(cosine_similarity_matrix)}")
        # print(f"index:{index} | id:{poll_id}")

    flattened_recommendations = [
        item for sublist in recommendations for item in sublist
    ]
    flattened_recommendations = Counter(flattened_recommendations)
    n_most_recommended = flattened_recommendations.most_common(
        number_of_recommendations
    )
    n_most_recommended = [t[0] for t in n_most_recommended]

    filtered_df = filtered_polls_df[filtered_polls_df["id"].isin(n_most_recommended)]

    order_dict = {id: idx for idx, id in enumerate(n_most_recommended)}

    # Sort the filtered DataFrame based on the order
    filtered_df["order"] = filtered_df["id"].map(order_dict)
    filtered_df = filtered_df.sort_values("order")

    # Drop the 'order' column if not needed
    filtered_df = filtered_df.drop(columns=["order"])

    # Reset the index if needed
    filtered_df = filtered_df.reset_index(drop=True)

    return filtered_df


def is_valid_limitations(limitations):
    if isinstance(limitations, dict):
        return (
            "allowedLocations" in limitations
            and "allowedGender" in limitations
            and "allowedAgeRange" in limitations
        )
    return False


def is_within_10_days_liifetime(timestamp):
    try:
        # Convert the timestamp to a datetime object
        time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # Calculate the time difference
        time_difference = datetime.now() - time

        return True if time_difference <= timedelta(days=10) else False
    except ValueError:
        # If the timestamp doesn't match the expected format, return False
        return False


def order(polls_df):
    polls_df["createdAt"] = pd.to_datetime(polls_df["createdAt"])

    # Sort the DataFrame based on the 'createdAt' column in ascending order
    polls_df = polls_df.sort_values(by="createdAt", ascending=True)

    # If you want to sort in descending order, use the following line instead
    # df = df.sort_values(by='createdAt', ascending=False)

    # Reset the index to maintain the order of the sorted rows
    polls_df = polls_df.reset_index(drop=True)
    return polls_df


def filter_timestamp(timestamp):
    try:
        # Convert the timestamp to a datetime object
        time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # Calculate the time difference
        time_difference = datetime.now() - time

        return time_difference
    except ValueError:
        # If the timestamp doesn't match the expected format, return None
        return None


def split_df_by_lifetime(polls_df):
    filtered_df = polls_df[polls_df["createdAt"].apply(filter_timestamp).notna()]

    older_than_10_days = filtered_df[
        filtered_df["createdAt"].apply(filter_timestamp) >= timedelta(days=10)
    ]
    newer_than_10_days = filtered_df[
        filtered_df["createdAt"].apply(filter_timestamp) < timedelta(days=10)
    ]

    # Reset the index if needed
    older_than_10_days = older_than_10_days.reset_index(drop=True)
    newer_than_10_days = newer_than_10_days.reset_index(drop=True)

    return older_than_10_days, newer_than_10_days


def list_to_df(polls_list, polls_df):
    # Filter the DataFrame based on the id_list
    filtered_df = polls_df[polls_df["id"].isin(polls_list)]

    # Create a dictionary to preserve the order
    order_dict = {id: idx for idx, id in enumerate(polls_list)}

    # Sort the filtered DataFrame based on the order
    filtered_df["order"] = filtered_df["id"].map(order_dict)
    filtered_df = filtered_df.sort_values("order")

    # Drop the 'order' column if not needed
    filtered_df = filtered_df.drop(columns=["order"])

    # Reset the index if needed
    filtered_df = filtered_df.reset_index(drop=True)
    return filtered_df


def filter_polls(row, user_limitations):
    if (
        row["pollType"] == "Public"
        and isinstance(row.get("pollLimitations"), dict)
        and all(k in user_limitations for k in ["Location", "Gender", "Age"])
        # and is_within_10_days_liifetime(row["createdAt"])
    ):
        user_location = user_limitations.get("Location")

        allowed_locations = row.get("pollLimitations").get("allowedLocations")
        if len(allowed_locations) == 0 or any(
            user_location == loc for loc in allowed_locations
        ):
            allowed_gender = row["pollLimitations"]["allowedGender"]
            user_gender = user_limitations["Gender"]
            if allowed_gender == "All" or allowed_gender == user_gender:
                allowed_age_range = row["pollLimitations"]["allowedAgeRange"]
                user_age = user_limitations["Age"]
                if (
                    allowed_age_range["minimumAge"]
                    <= user_age
                    <= allowed_age_range["maximumAge"]
                ):
                    return True

    return False


def get_allowed_private_polls(
    params,
    url="https://dev.pollett.io/api/Recommend/Polls/GetPrivatePollThatUserCanSee",
):
    # API URL
    # url = "https://dev.pollett.io/api/Recommend/Polls/GetPrivatePollThatUserCanSee"

    # Parameters
    # params = {"userId": "bbe64b34-ba34-4fbd-a62f-e6c84c0423b4"}

    # Send a GET request to the API
    response = requests.get(url, params=params)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the JSON response
        # allowed_polls_list = response.json().get("data")
        return response.json().get("data")
        # Process the data as needed

    else:
        # Handle the error
        print(f"Request failed with status code {response.status_code}")
        print(response.text)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


In [55]:
from elasticsearch import Elasticsearch
import json


class ElasticsearchHandel:
    def __init__(self, elasticsearch_url, username, password, fingerprint):
        self.elasticsearch_url = elasticsearch_url
        self.username = username
        self.password = password
        self.fingerprint = fingerprint
        self.client = Elasticsearch(
            hosts=self.elasticsearch_url,
            basic_auth=(self.username, self.password),
            ssl_assert_fingerprint=self.fingerprint,
        )

    def get_index(self, index_name, batch_size=100):
        setattr(self, index_name, [])
        index_list = getattr(self, index_name)
        from_index = 0
        all_instances = []

        while True:
            # query = {"query": {"match_all": {}}, "size": batch_size, "from": from_index}
            results = self.client.search(
                index=index_name,
                query={"match_all": {}},
                size=batch_size,
                from_=from_index,
            )
            instances = results["hits"]["hits"]

            all_instances.extend(instances)
            from_index += batch_size
            if len(instances) < 100:
                break

        setattr(self, index_name, [instance["_source"] for instance in all_instances])
        return getattr(self, index_name)

    def get_interactions(self, index_name, user_id, batch_size=100):
        # setattr(self, index_name, [])
        # index_list = getattr(self, index_name)
        from_index = 0
        all_instances = []

        query = {
            "match_phrase": {"userId": user_id},
        }

        results = self.client.search(
            index=index_name,
            query=query,
            size=batch_size,
            from_=from_index,
            timeout="1s",
        )
        # instances = results["hits"]["hits"][0]
        hits = results["hits"].get("hits")

        if not hits:
            # raise ValueError("User doesn't have any interactions.")
            raise InteractionNotFound()

        return hits[0].get("_source")

    def get_trend_polls(self, polls):
        # polls = getattr(self, "polls")
        # trend_polls = sorted(polls, key=lambda x: (-x["numberOfPollups"], -x["numberOfVotes"], -x["numberOfLike"]))
        trend_polls = sorted(
            polls,
            key=lambda x: (
                -x["numberOfVotes"],
                -x["numberOfLike"],
                # -x["numberOfPollUp"],
            ),
        )

        # recs = trend_polls["id"]

        # print("\n", filtered_trend_polls, "\n")
        # setattr(self, "trend_polls", trend_polls)
        return trend_polls

    def export_index_to_file(self, index, index_file_path):
        try:
            with open(index_file_path, "w") as output:
                # for instance in self.instances:
                #        json.dump(instance["_source"], output, indent=4)
                json.dump(index, output, indent=4)
        except Exception as exp:
            print("Export Error", exp)

In [56]:
import pandas as pd

elasticsearch_url = "https://159.203.183.251:9200"
username = "pollett"
password = "9r0&rJP@19GY"
fingerprint = "CE:AA:F7:FF:04:C7:31:14:78:9C:62:D4:CE:98:F9:EF:56:DA:70:45:37:14:E3:F8:66:0A:25:ED:05:04:83:ec"


elastic_handle = ElasticsearchHandel(elasticsearch_url, username, password, fingerprint)

polls = elastic_handle.get_index("polls")
trend_polls = elastic_handle.get_trend_polls(polls)

polls_df = pd.DataFrame.from_records(polls)

polls_df

Unnamed: 0,id,question,options,topics,pollType,ownerId,createdAt,numberOfLike,numberOfViews,numberOfVotes,numberOfComments,numberOfPollUp,isDeleted,pollLimitations,enedAt
0,2db5c3de-f765-4116-a8a7-72f18ebe76dd,What is your go-to cocktail for a night out: a classic Old Fashioned or a trendy Aperol Spritz?,"[Aperol Spritz, Old Fashioned]","[Fashion, Food&Drinks]",Public,66e9f472-07e6-4292-9452-4975fba3b292,2023-09-28T09:52:15.045993Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
1,2e81f16e-228c-446e-932f-c4ff275fc224,Which scientific discovery or invention has had the biggest impact on the way you stay fit?,"[Understanding of DNA & genetics, Discoveries in nutrition science, Development of fitness equipment]","[Science, Activity]",Public,7f3eee35-b8ea-4b83-ad9e-497a0961b9ad,2023-09-28T09:52:15.04518Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
2,2ebfc976-05d9-4780-a6b3-1402a6013bf6,Who is your favorite actor or actress who has played a famous musician in a movie?,"[Jennifer Lopez (Selena), JoaJett (The Runaways), Rami Malek (Freddie Mercury), Jamie Foxx (Ray Charles), Chadwick Boseman (James Brown)]","[Movies & TV shows, Music]",Public,3819a9c7-f762-4c3d-bf48-fedf4a797647,2023-09-28T09:52:15.049209Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
3,2ed7adb7-d547-4964-9405-6286f8cf6762,What's your preferred music genre for improving creativity and inspiration?,"[Jazz, World music, Experimental, Classical]","[Health, Music]",Public,ec50681b-7f44-4a87-9070-3730171bd2c1,2023-09-28T09:52:15.044554Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
4,2eda1de0-1f95-4998-8c81-f77b5d2c6aa7,Which movie accurately portrays the intelligence and problem-solving skills of pets?,"[Beverly Hills Chihuahua, Babe, The Secret Life of Pets]","[Pets, Movies & TV shows]",Public,0d734484-865c-4cb2-b326-06e0f3f0e6cd,2023-09-28T09:52:15.043768Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979,b50432dd-9230-4c38-8c7e-c233d14df785,b,"[1, 2]",[General],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-10-31T16:02:51.4058852Z,1,0,1,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",
1980,2e64cf90-119f-4b0f-86b8-c84d675f18b2,What is the optimal angle for blinds to control natural light while minimizing heat gain or loss?,"[0 degrees, 45 degrees, 90 degrees]","[Home Décor, Science]",Public,0724901b-5f89-44ff-a249-29c51a8e6985,2023-09-28T09:52:15.043453Z,1,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
1981,31293e36-8b74-41f3-a65e-2f90fe8e1107,Which feature is most important to you in a smart thermostat?,"[Energy-saving algorithms, Remote temperature control, Integration with other devices]","[Home Décor, Tech]",Public,8d2908fd-e0cf-4380-a881-1917c7022ed9,2023-09-28T09:52:15.04635Z,1,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
1982,4b656463-7cee-49cd-a74c-2593264efb64,Which type of curtain fabric provides the best noise reduction for a peaceful environment?,"[Velvet, Silk, Thick cotton]","[Science, Home Décor]",Public,0724901b-5f89-44ff-a249-29c51a8e6985,2023-09-28T09:52:15.043461Z,1,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z


In [57]:
test_limitations = {
    "allowedLocations": [
        {"country": "United States", "city": "United States", "state": "NJ"},
        {"country": "United States", "city": "United States", "state": "NJ"},
        {"country": "United States", "city": "United States", "state": "NJ"},
    ],
    "allowedGender": "All",
    "allowedAgeRange": {"minimumAge": 0, "maximumAge": 120},
    "isDeleted": False,
}

constraint_parameters = {
    "Location": {"country": "United States", "city": "United States", "state": "NJ"},
    "Gender": "Men",
    "Age": 30,
}

In [58]:
from datetime import datetime, timedelta


# Function to check if the values are in the expected format
def is_valid_limitations(limitations):
    return (
        "allowedLocations" in limitations
        and "allowedGender" in limitations
        and "allowedAgeRange" in limitations
    )


def oldis_within_10_days_liifetime(timestamp):
    try:
        # Convert the timestamp to a datetime object
        time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # Calculate the time difference
        time_difference = datetime.now() - time

        return True if time_difference <= timedelta(days=10) else False
    except ValueError:
        # If the timestamp doesn't match the expected format, return False
        return False


def oldorder(timestamp):
    try:
        # Convert the timestamp to a datetime object
        time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # Calculate the time difference
        time_difference = datetime.now() - time

        return True if time_difference <= timedelta(days=10) else False
    except ValueError:
        # If the timestamp doesn't match the expected format, return False
        return False


# ----------------------------


def is_within_10_days_liifetime(timestamp):
    try:
        # Convert the timestamp to a datetime object
        time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # Calculate the time difference
        time_difference = datetime.now() - time

        return True if time_difference <= timedelta(days=10) else False
    except ValueError:
        # If the timestamp doesn't match the expected format, return False
        return False


def order(polls_df):
    polls_df["createdAt"] = pd.to_datetime(polls_df["createdAt"])

    # Sort the DataFrame based on the 'createdAt' column in ascending order
    polls_df = polls_df.sort_values(by="createdAt", ascending=True)

    # If you want to sort in descending order, use the following line instead
    # df = df.sort_values(by='createdAt', ascending=False)

    # Reset the index to maintain the order of the sorted rows
    polls_df = polls_df.reset_index(drop=True)
    return polls_df


def filter_timestamp(timestamp):
    try:
        # Convert the timestamp to a datetime object
        time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # Calculate the time difference
        time_difference = datetime.now() - time

        return time_difference
    except ValueError:
        # If the timestamp doesn't match the expected format, return None
        return None


def split_df_by_lifetime(polls_df):
    filtered_df = polls_df[polls_df["createdAt"].apply(filter_timestamp).notna()]

    older_than_10_days = filtered_df[
        filtered_df["createdAt"].apply(filter_timestamp) >= timedelta(days=10)
    ]
    newer_than_10_days = filtered_df[
        filtered_df["createdAt"].apply(filter_timestamp) < timedelta(days=10)
    ]

    # Reset the index if needed
    older_than_10_days = older_than_10_days.reset_index(drop=True)
    newer_than_10_days = newer_than_10_days.reset_index(drop=True)

    return older_than_10_days, newer_than_10_days


def split_df_by_lifetime_v2(polls_df):
    filtered_df = polls_df[polls_df["createdAt"].apply(filter_timestamp).notna()]

    older_than_10_days = filtered_df[
        filtered_df["createdAt"].apply(filter_timestamp) >= timedelta(days=10)
    ]
    newer_than_10_days = filtered_df[
        filtered_df["createdAt"].apply(filter_timestamp) < timedelta(days=10)
    ]

    # Reset the index if needed
    older_than_10_days = older_than_10_days.reset_index(drop=True)
    newer_than_10_days = newer_than_10_days.reset_index(drop=True)

    return older_than_10_days, newer_than_10_days


# ----------------------------


def filter_polls(row, user_limitations):
    if (
        row["pollType"] == "Public"
        and isinstance(row.get("pollLimitations"), dict)
        and all(k in user_limitations for k in ["Location", "Gender", "Age"])
        # and is_within_10_days_liifetime(row["createdAt"])
    ):
        user_location = user_limitations.get("Location")

        allowed_locations = row.get("pollLimitations").get("allowedLocations")
        if len(allowed_locations) == 0 or any(
            user_location == loc for loc in allowed_locations
        ):
            allowed_gender = row["pollLimitations"]["allowedGender"]
            user_gender = user_limitations["Gender"]
            if allowed_gender == "All" or allowed_gender == user_gender:
                allowed_age_range = row["pollLimitations"]["allowedAgeRange"]
                user_age = user_limitations["Age"]
                if (
                    allowed_age_range["minimumAge"]
                    <= user_age
                    <= allowed_age_range["maximumAge"]
                ):
                    return True

    return False

In [59]:
# Filter the DataFrame based on the user's limitations
# filtered_df = polls_df[polls_df['pollLimitations'].apply(lambda x: meets_limitations(x, test_user))]
# len(polls_df[polls_df["createdAt"] == "0001-01-01T00:00:00"])

# filtered_df = polls_df[polls_df.apply(filter_polls, args=(constraint_parameters,), axis=1)]
# filtered_df

In [94]:
filtered_polls_df = polls_df[
    polls_df.apply(filter_polls, args=(constraint_parameters,), axis=1)
]
user_id = "e00b366a-37a8-407d-9a15-e585d1ad539a"
user_id = "61400ff7-531a-425e-a506-e2a900eec613"
# filtered_polls_df = filtered_polls_df.reset_index(drop=True)
allowed_polls_list = get_allowed_private_polls(
    # params={"userId": "bbe64b34-ba34-4fbd-a62f-e6c84c0423b4"}
    params={"userId": user_id}
)
allowed_private_polls = polls_df[polls_df["id"].isin(allowed_polls_list)]
allowed_private_polls = allowed_private_polls[
    allowed_private_polls.apply(filter_polls, args=(constraint_parameters,), axis=1)
]

concatenated_df = pd.concat([filtered_polls_df, allowed_private_polls], axis=0)
# Reset the index if needed
concatenated_df.reset_index(drop=True, inplace=True)

In [101]:
concatenated_df

Unnamed: 0,id,question,options,topics,pollType,ownerId,createdAt,numberOfLike,numberOfViews,numberOfVotes,numberOfComments,numberOfPollUp,isDeleted,pollLimitations,enedAt
0,2db5c3de-f765-4116-a8a7-72f18ebe76dd,What is your go-to cocktail for a night out: a classic Old Fashioned or a trendy Aperol Spritz?,"[Aperol Spritz, Old Fashioned]","[Fashion, Food&Drinks]",Public,66e9f472-07e6-4292-9452-4975fba3b292,2023-09-28T09:52:15.045993Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
1,2e81f16e-228c-446e-932f-c4ff275fc224,Which scientific discovery or invention has had the biggest impact on the way you stay fit?,"[Understanding of DNA & genetics, Discoveries in nutrition science, Development of fitness equipment]","[Science, Activity]",Public,7f3eee35-b8ea-4b83-ad9e-497a0961b9ad,2023-09-28T09:52:15.04518Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
2,2ebfc976-05d9-4780-a6b3-1402a6013bf6,Who is your favorite actor or actress who has played a famous musician in a movie?,"[Jennifer Lopez (Selena), JoaJett (The Runaways), Rami Malek (Freddie Mercury), Jamie Foxx (Ray Charles), Chadwick Boseman (James Brown)]","[Movies & TV shows, Music]",Public,3819a9c7-f762-4c3d-bf48-fedf4a797647,2023-09-28T09:52:15.049209Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
3,2ed7adb7-d547-4964-9405-6286f8cf6762,What's your preferred music genre for improving creativity and inspiration?,"[Jazz, World music, Experimental, Classical]","[Health, Music]",Public,ec50681b-7f44-4a87-9070-3730171bd2c1,2023-09-28T09:52:15.044554Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
4,2eda1de0-1f95-4998-8c81-f77b5d2c6aa7,Which movie accurately portrays the intelligence and problem-solving skills of pets?,"[Beverly Hills Chihuahua, Babe, The Secret Life of Pets]","[Pets, Movies & TV shows]",Public,0d734484-865c-4cb2-b326-06e0f3f0e6cd,2023-09-28T09:52:15.043768Z,0,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1954,b50432dd-9230-4c38-8c7e-c233d14df785,b,"[1, 2]",[General],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-10-31T16:02:51.4058852Z,1,0,1,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",
1955,2e64cf90-119f-4b0f-86b8-c84d675f18b2,What is the optimal angle for blinds to control natural light while minimizing heat gain or loss?,"[0 degrees, 45 degrees, 90 degrees]","[Home Décor, Science]",Public,0724901b-5f89-44ff-a249-29c51a8e6985,2023-09-28T09:52:15.043453Z,1,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
1956,31293e36-8b74-41f3-a65e-2f90fe8e1107,Which feature is most important to you in a smart thermostat?,"[Energy-saving algorithms, Remote temperature control, Integration with other devices]","[Home Décor, Tech]",Public,8d2908fd-e0cf-4380-a881-1917c7022ed9,2023-09-28T09:52:15.04635Z,1,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z
1957,4b656463-7cee-49cd-a74c-2593264efb64,Which type of curtain fabric provides the best noise reduction for a peaceful environment?,"[Velvet, Silk, Thick cotton]","[Science, Home Décor]",Public,0724901b-5f89-44ff-a249-29c51a8e6985,2023-09-28T09:52:15.043461Z,1,0,0,0,0,False,"{'allowedLocations': [], 'allowedGender': 'All', 'allowedAgeRange': {'minimumAge': 0, 'maximumAge': 120}}",2023-10-30T09:52:15.048545Z


In [102]:
polls_tf_idf_matrix = create_souped_tf_idf_matrix(concatenated_df)

In [103]:
trend_polls = elastic_handle.get_trend_polls(polls)
trend_polls_df = pd.DataFrame.from_records(trend_polls)
filtered_trend_polls_df = trend_polls_df[
    trend_polls_df.apply(filter_polls, args=(constraint_parameters,), axis=1)
]
filtered_trend_polls_df = filtered_trend_polls_df.reset_index(drop=True)
filtered_trend_polls_list = filtered_trend_polls_df["id"].tolist()

In [104]:
print(f"filtered_trend_polls_list: {type(filtered_trend_polls_list)}")
print(f"filtered_trend_polls_list: {len(filtered_trend_polls_list)}")

filtered_trend_polls_list: <class 'list'>
filtered_trend_polls_list: 1959


In [105]:
filtered_polls_df = concatenated_df[["id", "createdAt", "enedAt"]]

In [106]:
cosine_similarity_matrix = calc_cosine_similarity_matrix(
    polls_tf_idf_matrix, polls_tf_idf_matrix
)

In [107]:
userInteractions = elastic_handle.get_interactions("userpollinteractions", user_id)

userInteractions = [
    interaction["pollId"] for interaction in userInteractions["userPollActions"][:20]
]

In [108]:
recommended_polls_df = gen_rec_from_list_of_polls_df(
    interacted_polls=userInteractions,
    filtered_polls_df=filtered_polls_df,
    cosine_similarity_matrix=cosine_similarity_matrix,
    number_of_recommendations=100,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["order"] = filtered_df["id"].map(order_dict)


In [109]:
recommended_polls_df

Unnamed: 0,id,createdAt,enedAt
0,23dd304a-e65f-4efd-870d-56b6319d0153,2023-09-28T09:52:15.051215Z,2023-10-30T09:52:15.048545Z
1,0f333d25-7973-4443-be82-8e0edfbfb33d,2023-09-28T09:52:15.051175Z,2023-10-30T09:52:15.048545Z
2,5d19212d-7644-49a5-b390-dfda271b2713,2023-09-28T09:52:15.051203Z,2023-10-30T09:52:15.048545Z
3,791b78a4-97f0-41f5-a77b-db8d49bafd50,2023-09-28T09:52:15.051251Z,2023-10-30T09:52:15.048545Z
4,da5f852f-652e-426f-87f6-e096dcee1998,2023-09-28T09:52:15.051238Z,2023-10-30T09:52:15.048545Z
...,...,...,...
95,da60c663-c57b-495a-817d-bba783eaa08c,2023-09-28T09:52:15.05089Z,2023-10-30T09:52:15.048545Z
96,9add8f97-72e8-4270-9380-96882d2bb288,2023-09-28T09:52:15.050897Z,2023-10-30T09:52:15.048545Z
97,36ec0735-e7f8-446c-b6a1-1f336ec061e7,2023-09-28T09:52:15.050958Z,2023-10-30T09:52:15.048545Z
98,d3c2060e-dc29-4371-8aca-00466751e488,2023-09-28T09:52:15.050908Z,2023-10-30T09:52:15.048545Z


In [110]:
older_recommended_polls, newer_recommended_polls = split_df_by_lifetime(
    recommended_polls_df
)

In [111]:
trend_polls_df = list_to_df(filtered_trend_polls_list, filtered_polls_df)

In [112]:
older_trend_polls, newer_trend_polls = split_df_by_lifetime(trend_polls_df)
newer_trend_polls

Unnamed: 0,id,createdAt,enedAt
0,e3303719-bcae-455a-a7c2-fd29c17e6216,2023-10-30T07:56:05.744197Z,2023-11-09T07:56:06.318Z
1,6c9877b7-6607-456e-8bd5-d1aaf6d5b784,2023-10-31T05:34:34.716433Z,2023-11-10T05:34:34.478Z
2,a5d4f74a-7bc1-4dad-940f-1f471d377e96,2023-10-31T06:41:43.748954Z,2023-11-10T06:41:43.626Z
3,c75ba776-49f5-4862-adf7-a879ff4f3df2,2023-10-30T07:56:59.96682Z,2023-10-31T05:29:29.083378Z
4,7a3670b9-4824-4a55-ad12-ba07cb7b588d,2023-10-30T07:56:48.941826Z,2023-11-09T07:56:49.519Z
5,89846b5c-dfb7-4e5a-a5d8-dd0be3d0f95e,2023-10-30T07:56:35.518561Z,2023-11-09T07:56:36.093Z
6,e98fc321-a1b0-4787-8ba1-9b78e71c6789,2023-10-30T07:56:22.07728Z,2023-11-09T07:56:22.653Z


In [113]:
recommended_polls_list = pd.concat(
    [
        newer_recommended_polls,
        newer_trend_polls,
        older_recommended_polls,
        older_trend_polls,
    ],
    ignore_index=False,
)

In [114]:
recommended_polls_list

Unnamed: 0,id,createdAt,enedAt
0,e3303719-bcae-455a-a7c2-fd29c17e6216,2023-10-30T07:56:05.744197Z,2023-11-09T07:56:06.318Z
1,6c9877b7-6607-456e-8bd5-d1aaf6d5b784,2023-10-31T05:34:34.716433Z,2023-11-10T05:34:34.478Z
2,a5d4f74a-7bc1-4dad-940f-1f471d377e96,2023-10-31T06:41:43.748954Z,2023-11-10T06:41:43.626Z
3,c75ba776-49f5-4862-adf7-a879ff4f3df2,2023-10-30T07:56:59.96682Z,2023-10-31T05:29:29.083378Z
4,7a3670b9-4824-4a55-ad12-ba07cb7b588d,2023-10-30T07:56:48.941826Z,2023-11-09T07:56:49.519Z
...,...,...,...
1944,f347c855-50c4-4f70-864b-bd994039afb3,2023-09-28T09:52:15.045344Z,2023-10-30T09:52:15.048545Z
1945,f34c6ec9-59ce-4180-aa84-0eb6956a27cc,2023-09-28T09:52:15.044215Z,2023-10-30T09:52:15.048545Z
1946,f39bdec3-8ba4-450f-b0fa-f235b87b3820,2023-09-28T09:52:15.049067Z,2023-10-30T09:52:15.048545Z
1947,f3c0981f-9906-4b19-a093-69f4a7eebc74,2023-09-28T09:52:15.043282Z,2023-10-30T09:52:15.048545Z


In [115]:
print(f"Length newer_recommended_polls:{len(newer_recommended_polls)}")
print(f"Length newer_trend_polls:{len(newer_trend_polls)}")
print(f"Length older_recommended_polls:{len(older_recommended_polls)}")
print(f"Length older_trend_polls:{len(older_trend_polls)}")

Length newer_recommended_polls:0
Length newer_trend_polls:7
Length older_recommended_polls:100
Length older_trend_polls:1949


In [91]:
def remove_duplicates(input_list):
    seen = set()
    result = []

    for item in input_list:
        if item not in seen:
            seen.add(item)
            result.append(item)

    return result


recommended_polls_list = recommended_polls_list.reset_index(drop=True)
recommended_polls_list = recommended_polls_list["id"].tolist()
recommended_polls_list = remove_duplicates(recommended_polls_list)

AttributeError: 'list' object has no attribute 'reset_index'

In [93]:
len(recommended_polls_list)

1956