In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from pathlib import Path
from collections import Counter
import json
import nltk

# Fetch the NLTK data packages that the text helpers below rely on
# (tokenizer models and the stop-word lists).
nltk.download("punkt")
nltk.download("stopwords")

# One module-level vectorizer shared by create_tf_idf_matrix and
# create_souped_tf_idf_matrix. Every fit_transform call re-fits it, so the
# vocabulary learned by an earlier call is replaced by the latest one.
tf_idf = TfidfVectorizer(stop_words="english")


def encode_topics(df):
    """Return ``df`` with one extra indicator column per distinct topic.

    Args:
        df: DataFrame whose ``"topics"`` column holds a list of topic names
            per row.

    Returns:
        A new DataFrame: the original columns followed by one column per
        topic, holding how many times that topic occurs in the row's list.
        The input frame is not modified.
    """
    # Spread every topic list over columns, then stack into one long Series
    # indexed by (original row label, position inside the list).
    stacked_topics = df["topics"].apply(pd.Series).stack()
    # Dummy-encode each individual topic, then add the dummies back together
    # per original row (level 0 of the stacked index).
    topic_dummies = pd.get_dummies(stacked_topics)
    topic_counts = topic_dummies.groupby(level=0).sum()
    return pd.concat([df, topic_counts], axis=1)


def set_index(df, index_column="poll_ID"):
    """Make ``index_column`` the index of ``df`` in place and return ``df``.

    Args:
        df: DataFrame to re-index; it is modified in place.
        index_column: Name of the column that becomes the index.

    Returns:
        The same DataFrame object that was passed in.
    """
    df.set_index(keys=index_column, inplace=True)
    return df


def reset_index(df):
    """Reset the index of ``df`` in place and return ``df``.

    The previous implementation called ``df.reset_index()`` and discarded the
    result, so the frame came back unchanged. The reset is now applied in
    place, mirroring ``set_index``, so it works both for callers that use the
    return value and for callers that ignore it.

    Args:
        df: DataFrame whose index is moved back into a regular column; it is
            modified in place.

    Returns:
        The same DataFrame object, now with a default integer index.
    """
    df.reset_index(inplace=True)
    return df


def check_column_type(df, column_name, check_type):
    """Print every row whose value in ``column_name`` is not a ``check_type``.

    For each offending row one line ``error: (<values>)`` is printed, where
    the tuple holds the row's leading cells (at most five). The previous
    version always read columns 0..4 and therefore raised ``IndexError`` on
    frames with fewer than five columns; the output for wider frames is
    unchanged.

    Args:
        df: DataFrame to inspect.
        column_name: Label of the column whose values are type-checked.
        check_type: Type (or tuple of types) accepted by ``isinstance``.

    Returns:
        None. Findings are reported on standard output only.
    """
    column_index = df.columns.get_loc(column_name)
    # Never ask for more preview cells than the frame has columns.
    preview_width = min(5, df.shape[1])
    for i in range(len(df)):
        if not isinstance(df.iloc[i, column_index], check_type):
            preview = tuple(df.iloc[i, j] for j in range(preview_width))
            print(f"error: {preview}")


def preprocess_text(text):
    """Normalize a piece of text for vectorization.

    The text is tokenized with NLTK, lower-cased, and stripped of tokens that
    occur inside ``string.punctuation`` (a substring test against that
    string) or that are English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered_tokens = [
        token.lower() for token in nltk.tokenize.word_tokenize(text)
    ]
    english_stop_words = set(nltk.corpus.stopwords.words("english"))
    kept_tokens = [
        token
        for token in lowered_tokens
        if token not in string.punctuation and token not in english_stop_words
    ]
    return " ".join(kept_tokens)


def preprocess_list(field_list):
    """Clean every string in ``field_list`` the same way ``preprocess_text`` does.

    Each item is tokenized with NLTK, lower-cased, and stripped of tokens
    that occur inside ``string.punctuation`` or that are English stop words.
    The stop-word set is loaded once for the whole list.

    Args:
        field_list: Iterable of raw strings (for example a poll's topics).

    Returns:
        A list with one cleaned, space-joined string per input item, in the
        same order.
    """
    english_stop_words = set(nltk.corpus.stopwords.words("english"))

    def _clean(item):
        # Lower-case lazily, then keep only tokens that pass both filters.
        lowered = (token.lower() for token in nltk.tokenize.word_tokenize(item))
        return " ".join(
            token
            for token in lowered
            if token not in string.punctuation
            and token not in english_stop_words
        )

    return [_clean(item) for item in field_list]


def create_tf_idf_matrix(df, column):
    """Fit the shared vectorizer on one list-valued column and return its matrix.

    Side effect: ``df[column]`` is overwritten in place, first with the
    space-joined text of each list and then with its preprocessed form.

    Args:
        df: DataFrame holding the column; modified in place.
        column: Name of a column whose cells are iterables of strings.

    Returns:
        The result of ``tf_idf.fit_transform`` on the cleaned column.
    """
    joined_text = df[column].apply(" ".join)
    df[column] = joined_text
    df[column] = df[column].apply(preprocess_text)

    cleaned_column = df[column]
    return tf_idf.fit_transform(cleaned_column)


def create_souped_tf_idf_matrix(df):
    """Build the per-poll "soup" text and return its TF-IDF matrix.

    Side effects on ``df`` (all in place): ``"topics"`` is replaced by its
    ``preprocess_list`` output, ``"question"`` by its ``preprocess_text``
    output, and a new ``"soup"`` column is added. Calling this twice on the
    same frame therefore preprocesses already-preprocessed text.

    Args:
        df: DataFrame with ``"question"``, ``"options"`` and ``"topics"``
            columns.

    Returns:
        The result of ``tf_idf.fit_transform`` on the ``"soup"`` column.
    """
    column_cleaners = (
        ("topics", preprocess_list),
        ("question", preprocess_text),
    )
    for column, cleaner in column_cleaners:
        df[column] = df[column].apply(cleaner)

    # One combined text per row; see create_soup for how it is assembled.
    df["soup"] = df.apply(create_soup, axis=1)

    soup_column = df["soup"]
    return tf_idf.fit_transform(soup_column)


def create_soup(df, topic_weight=4):
    """Combine one poll's question, options and topics into a single string.

    The topics are repeated ``topic_weight`` times so that they appear more
    often in the text than the question and option words. The default of 4
    reproduces the previously hard-coded factor, so existing calls such as
    ``df.apply(create_soup, axis=1)`` behave exactly as before.

    Args:
        df: A single row (any mapping-like object, e.g. a ``pandas.Series``)
            with a string ``"question"`` and iterables of strings under
            ``"options"`` and ``"topics"``.
        topic_weight: How many times the space-joined topics are repeated.

    Returns:
        ``question + " " + options + " "`` followed by ``topic_weight``
        copies of ``" " + topics``.
    """
    question_and_options = df["question"] + " " + " ".join(df["options"])
    # Each repetition carries its own leading space, exactly as before.
    weighted_topics = topic_weight * (" " + " ".join(df["topics"]))
    return question_and_options + " " + weighted_topics


def calc_cosine_similarity_matrix(tf_idf_matrix_1, tf_idf_matrix_2):
    """Return the cosine similarity between the rows of two matrices.

    Thin wrapper around scikit-learn's ``cosine_similarity``.

    Args:
        tf_idf_matrix_1: First matrix; its rows index the result's rows.
        tf_idf_matrix_2: Second matrix; its rows index the result's columns.

    Returns:
        Whatever ``cosine_similarity`` returns for the two inputs.
    """
    pairwise_similarity = cosine_similarity(tf_idf_matrix_1, tf_idf_matrix_2)
    return pairwise_similarity


def id_to_index(df, id):
    """Return the index label of the first row whose ``"id"`` equals ``id``.

    Args:
        df: DataFrame with an ``"id"`` column.
        id: Identifier to look up.

    Returns:
        The index label (not necessarily the row position) of the first match.

    Raises:
        IndexError: If no row has the requested id.
    """
    matching_rows = df.loc[df["id"] == id]
    return matching_rows.index.values[0]


def title_from_idx(df, idx):
    """Return the rows of ``df`` whose index label equals ``idx``.

    Despite the name, the result is a DataFrame with all columns, not just
    a title; it is empty when no label matches.
    """
    label_matches = df.index == idx
    return df[label_matches]


def gen_recommendations(
    index,
    df,
    cosine_similarity_matrix,
    number_of_recommendations,
):
    """Return the titles of the items most similar to the item at ``index``.

    The query item is excluded by its position rather than by assuming it
    sorts first. The earlier ``[1:]`` slice went wrong whenever another item
    tied with the query item's self-similarity (e.g. duplicated text): the
    tied neighbour was dropped and the query item itself could be returned.

    Args:
        index: Row position of the query item in ``cosine_similarity_matrix``
            and in ``df``.
        df: DataFrame with a ``"title"`` column, row-aligned with the matrix.
        cosine_similarity_matrix: Square matrix of pairwise similarities.
        number_of_recommendations: Maximum number of titles to return.

    Returns:
        A list of titles ordered from most to least similar.
    """
    ranked = sorted(
        enumerate(cosine_similarity_matrix[index]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # A negative request is treated as "no recommendations".
    limit = max(number_of_recommendations, 0)
    recommendations_indices = [
        position for position, _ in ranked if position != index
    ][:limit]
    return list(df["title"].iloc[recommendations_indices])


def gen_rec_from_list_of_polls(
    interacted_polls, polls, cosine_similarity_matrix, number_of_recommendations
):
    """Recommend polls similar to the ones a user has already interacted with.

    For every interacted poll, its ``number_of_recommendations`` nearest
    neighbours are collected (skipping polls the user already interacted
    with); the polls that were collected most often are returned.

    Fixes over the previous version:
      * Rows of the similarity matrix are addressed by row *position* in
        ``polls`` rather than by index label, so a non-default index no
        longer selects the wrong row.
      * The seed poll is excluded by position instead of dropping whatever
        sorted first, which lost a real neighbour when similarities tied
        (e.g. duplicated polls).
      * Interacted poll ids that are missing from ``polls`` are skipped
        instead of raising ``IndexError``.
      * Membership tests use a set instead of rescanning the list.

    Args:
        interacted_polls: Iterable of poll ids the user interacted with.
        polls: DataFrame with an ``"id"`` column, row-aligned with the matrix.
        cosine_similarity_matrix: Square matrix of pairwise poll similarities.
        number_of_recommendations: Neighbours taken per seed poll and also
            the maximum length of the result.

    Returns:
        A list of poll ids, most frequently suggested first; ties keep the
        order in which the ids were first suggested.
    """
    poll_ids = list(polls["id"])
    # First occurrence wins, matching the old lookup for duplicated ids.
    position_of = {}
    for position, poll_id in enumerate(poll_ids):
        position_of.setdefault(poll_id, position)

    already_seen = set(interacted_polls)
    # A negative request is treated as "no recommendations".
    limit = max(number_of_recommendations, 0)
    votes = Counter()

    for poll_id in interacted_polls:
        seed_position = position_of.get(poll_id)
        if seed_position is None:
            # The interaction refers to a poll that is not in `polls`.
            continue

        ranked = sorted(
            enumerate(cosine_similarity_matrix[seed_position]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        neighbours = [
            position for position, _ in ranked if position != seed_position
        ][:limit]

        votes.update(
            poll_ids[position]
            for position in neighbours
            if poll_ids[position] not in already_seen
        )

    return [poll_id for poll_id, _ in votes.most_common(limit)]


[nltk_data] Downloading package punkt to /home/erfan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/erfan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from elasticsearch import Elasticsearch
import json
class InteractionNotFound(Exception):
    """Raised when a user has no document in the interactions index."""


class ElasticsearchHandel:
    """Small convenience wrapper around an ``Elasticsearch`` client.

    Attributes set in ``__init__``: ``elasticsearch_url``, ``username``,
    ``password``, ``fingerprint`` and the connected ``client``. In addition,
    ``get_index`` stores each fetched index on the instance under the
    index's own name (e.g. ``handle.polls``).
    """

    def __init__(self, elasticsearch_url, username, password, fingerprint):
        """Create the client using basic auth and a pinned TLS fingerprint."""
        self.elasticsearch_url = elasticsearch_url
        self.username = username
        self.password = password
        self.fingerprint = fingerprint
        self.client = Elasticsearch(
            hosts=self.elasticsearch_url,
            basic_auth=(self.username, self.password),
            ssl_assert_fingerprint=self.fingerprint,
        )

    def get_index(self, index_name, batch_size=100):
        """Fetch every document of ``index_name`` page by page.

        Args:
            index_name: Index to read. The result is also stored on the
                instance as an attribute with this name.
            batch_size: Number of hits requested per page.

        Returns:
            A list with the ``_source`` of every hit.
        """
        # Reset the cached copy first, so the attribute is an empty list
        # while the fetch is in progress.
        setattr(self, index_name, [])
        from_index = 0
        all_instances = []

        # NOTE(review): from/size paging is limited by Elasticsearch's
        # index.max_result_window setting -- confirm the indices read here
        # stay below it, or switch to search_after.
        while True:
            results = self.client.search(
                index=index_name,
                query={"match_all": {}},
                size=batch_size,
                from_=from_index,
            )
            instances = results["hits"]["hits"]

            all_instances.extend(instances)
            from_index += batch_size
            # A short (or empty) page means the index is exhausted. This used
            # to compare against a literal 100, which stopped after the first
            # page for any batch_size below 100.
            if not instances or len(instances) < batch_size:
                break

        setattr(self, index_name, [instance["_source"] for instance in all_instances])
        return getattr(self, index_name)

    def get_interactions(self, index_name, user_id, batch_size=100):
        """Return the interaction document stored for ``user_id``.

        Args:
            index_name: Index holding the interaction documents.
            user_id: Value matched (``match_phrase``) against ``userId``.
            batch_size: Passed to the search as ``size``; only the first hit
                is returned regardless.

        Returns:
            The ``_source`` of the first hit.

        Raises:
            InteractionNotFound: If the search returns no hits.
        """
        query = {
            "match_phrase": {"userId": user_id},
        }

        results = self.client.search(
            index=index_name,
            query=query,
            size=batch_size,
            from_=0,
            timeout="1s",
        )
        hits = results["hits"].get("hits")

        if not hits:
            raise InteractionNotFound()

        return hits[0].get("_source")

    def get_trend_polls(self, polls):
        """Return ``polls`` sorted by votes, then likes, both descending.

        Args:
            polls: Iterable of poll dicts with numeric ``"numberOfVotes"``
                and ``"numberOfLike"`` entries.

        Returns:
            A new list; the input is left untouched.
        """
        trend_polls = sorted(
            polls,
            key=lambda x: (
                -x["numberOfVotes"],
                -x["numberOfLike"],
            ),
        )
        return trend_polls

    def export_index_to_file(self, index, index_file_path):
        """Write ``index`` to ``index_file_path`` as indented JSON.

        This is best-effort: any exception is caught and reported with
        ``print`` instead of being raised.
        """
        try:
            with open(index_file_path, "w") as output:
                json.dump(index, output, indent=4)
        except Exception as exp:
            print("Export Error", exp)


In [3]:
import getpass
import os

import pandas as pd

# Connection settings are read from the environment so that no secret lives
# in the notebook. The non-secret values fall back to the previous defaults;
# the password has no default and is prompted for when the variable is unset.
# The password that used to be hard-coded in this cell should be treated as
# compromised and rotated.
elasticsearch_url = os.environ.get(
    "ELASTICSEARCH_URL", "https://159.203.183.251:9200"
)
username = os.environ.get("ELASTICSEARCH_USERNAME", "pollett")
password = os.environ.get("ELASTICSEARCH_PASSWORD") or getpass.getpass(
    "Elasticsearch password: "
)
fingerprint = os.environ.get(
    "ELASTICSEARCH_FINGERPRINT",
    "CE:AA:F7:FF:04:C7:31:14:78:9C:62:D4:CE:98:F9:EF:56:DA:70:45:37:14:E3:F8:66:0A:25:ED:05:04:83:ec",
)


elastic_handle = ElasticsearchHandel(
    elasticsearch_url, username, password, fingerprint
)

# Load every poll, plus a copy ranked by votes and likes.
polls = elastic_handle.get_index("polls")
trend_polls = elastic_handle.get_trend_polls(polls)

polls_df = pd.DataFrame.from_records(polls)

polls_df

Unnamed: 0,id,question,options,topics,pollType,ownerId,createdAt,numberOfLike,numberOfViews,numberOfVotes,numberOfComments,numberOfPollUp
0,28fbd6d1-cbfd-4e68-95db-6b23ade84d92,which one better,"[Google Docs, Microsoft Office]",[Tech],Public,81105fcc-76de-4359-bb93-9a64fec66846,0001-01-01T00:00:00,0,2,2,0,
1,7e71f883-975a-45f8-9534-99d822c79344,which one?,"[2, 1]",[Tech],Public,81105fcc-76de-4359-bb93-9a64fec66846,0001-01-01T00:00:00,0,11,0,0,
2,7e58bb5b-393f-4a45-9286-65837e93925f,which series did you like more?,"[Friends, How I Met Your Mother]","[Fun, Movies & TV shows]",Public,fce979ae-b759-4c2c-bb93-25f036a581d1,0001-01-01T00:00:00,3,0,4,1,0.0
3,775798ae-78fe-4902-81e3-b092bf19c651,Which one?,"[Lenovo, Asus]",[Tech],Public,81105fcc-76de-4359-bb93-9a64fec66846,0001-01-01T00:00:00,2,5,9,1,0.0
4,3e85fad9-7095-40ca-afda-40efb9be14d8,which one ?,"[Samsung Galaxy S21 FE, Samsung Galaxy A54]",[Tech],Public,67eb27ca-ba0b-4d29-8627-9ec78327b512,0001-01-01T00:00:00,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
145,f90006d5-3a9a-45bc-b6ff-c68c736b3b1d,Would you rather live for the rest of your lif...,"[The Arctic, The Sahara desert]","[Sport, Tech]",Public,5c0bc399-5319-4072-911e-5ab48bfb4726,2023-05-28T09:06:48.570672Z,2,4,1,0,0.0
146,f94c3f13-6b19-40cd-9387-fc2a429ba62f,b,"[2, 1]",[General],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-09-02T20:28:37.404203Z,0,1,0,0,0.0
147,fa88439c-f78f-480d-a9bd-a40c96ef946e,c,"[2, 1]",[General],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-08-24T12:11:53.949164Z,0,1,0,0,0.0
148,fb954a1d-1116-4d3e-81f1-70e072311170,"If you had to, which one whould you lose?","[Food, Job]",[Science],Public,e00b366a-37a8-407d-9a15-e585d1ad539a,2023-08-15T08:49:52.077497Z,0,1,0,0,0.0


In [9]:
# Build the TF-IDF matrix over the per-poll "soup" text. Note that
# create_souped_tf_idf_matrix rewrites polls_df["topics"] and
# polls_df["question"] and adds polls_df["soup"] in place.
polls_tf_idf_matrix = create_souped_tf_idf_matrix(polls_df)
first_poll_vector = polls_tf_idf_matrix[0]
print(first_poll_vector)

  (0, 238)	0.6332368270082208
  (0, 175)	0.34514310400217957
  (0, 163)	0.3735946073020432
  (0, 71)	0.3735946073020432
  (0, 112)	0.34514310400217957
  (0, 36)	0.285688178316335


The `polls_tf_idf_matrix` is a sparse matrix used to represent textual data in a numerical format. Let's break down its characteristics:

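- **Dimensions**: The matrix has one row per poll and one column per vocabulary term. In the run described here that was 142 rows and 274 columns; these counts change whenever the polls index changes (the cosine similarity matrix shown further down, for example, comes from a run with 150 polls), so check `polls_tf_idf_matrix.shape` for the current values.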

- **Sparse Matrix**: It's classified as a sparse matrix, meaning that the majority of its elements are zero. This is typical of TF-IDF matrices: each document contains only a small fraction of the vocabulary, so most term weights in any given row are zero.

- **Data Type**: The elements of the matrix are of type `numpy.float64`, representing 64-bit floating-point numbers. This is the standard data type for TF-IDF values.

- **Stored Elements**: There are 586 non-zero elements (entries) in the matrix. Sparse matrices are memory-efficient because they only store these non-zero values.

- **Compressed Sparse Row Format (CSR)**: The matrix is stored in the Compressed Sparse Row (CSR) format, a widely used format for sparse matrices. It allows for efficient row-wise access and arithmetic operations.

In summary, the `polls_tf_idf_matrix` efficiently represents TF-IDF values of text data with 142 rows and 274 columns. Its sparse nature optimizes memory usage by storing only non-zero values, making it suitable for text analysis tasks.


In [5]:
# Compare every poll's TF-IDF vector with every other poll's vector.
cosine_similarity_matrix = calc_cosine_similarity_matrix(
    polls_tf_idf_matrix, polls_tf_idf_matrix
)
cosine_similarity_matrix.shape


(150, 150)

**Cosine Similarity Matrix Explanation:**

- **Definition:** The `cosine_similarity_matrix` is a matrix designed to represent the similarity between pairs of polls within a dataset.

- **Calculation:** It is calculated by the `calc_cosine_similarity_matrix` function, which calls scikit-learn's `cosine_similarity`. Cosine similarity is a frequently used measure in natural language processing and information retrieval. It measures the cosine of the angle between two vectors — in this context, the TF-IDF vectors representing the polls.

- **Interpretation:** In the `cosine_similarity_matrix`, each element `(i, j)` denotes the cosine similarity between two polls: poll `i` and poll `j`. In general cosine similarity ranges from -1 to 1, but TF-IDF weights are never negative, so the values in this matrix fall between 0 and 1. Their meanings are as follows:

    - `1`: Indicates that the polls' TF-IDF vectors point in the same direction, i.e. the polls have the highest possible similarity.
    - `0`: Denotes that the polls are orthogonal (they share no weighted terms), implying no similarity between them.
    - `-1`: Would indicate diametrically opposite vectors, i.e. the highest possible dissimilarity; this value cannot occur here because TF-IDF vectors have no negative components.

This matrix is critical for generating recommendations as it quantifies the textual content's similarity or dissimilarity between different polls. By leveraging this similarity matrix, the recommendation system can identify polls with content similar to those the user has interacted with, resulting in more personalized and relevant recommendations.


In [19]:
# Hard-coded user for this walkthrough; presumably the serving code takes it
# from the request instead (request.args.get("userId")).
user_id = "67eb27ca-ba0b-4d29-8627-9ec78327b512"

interaction_document = elastic_handle.get_interactions(
    "userpollinteractions", user_id
)

# Only the first 20 recorded actions are used as seeds.
userInteractions = [
    action["pollId"] for action in interaction_document["userPollActions"][:20]
]

recommended_list = gen_rec_from_list_of_polls(
    userInteractions,
    polls_df,
    cosine_similarity_matrix,
    100,
)

# isin() keeps polls_df's own row order, so the table below is not sorted by
# recommendation rank.
is_recommended = polls_df["id"].isin(recommended_list)
recommended_polls = polls_df[is_recommended]
recommended_polls

Unnamed: 0,id,question,options,topics,pollType,ownerId,createdAt,numberOfLike,numberOfViews,numberOfVotes,numberOfComments,numberOfPollUp,soup
0,016e4c36-84bf-48d1-9125-53fd65d3cec9,bbbb,"[2, 1]",[general],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-08-24T11:55:48.108728Z,0,1,0,0,0.0,bbbb 2 1 general general general general
1,029cc519-fdbf-4f0a-8b38-803f2c1a2a4b,favorite movie genre,"[Romance, Science Fiction, Mystery/Thriller, H...",[movies tv shows],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-08-17T06:42:55.062364Z,1,6,4,0,0.0,favorite movie genre Romance Science Fiction M...
2,03ec66cd-42fd-4de5-88e5-a97765238189,test,"[1, 2]",[general],Public,e00b366a-37a8-407d-9a15-e585d1ad539a,2023-09-10T13:18:16.603142Z,0,1,0,0,0.0,test 1 2 general general general general
3,058d2d5b-dc16-45de-a7c5-17cacfedd88d,aa,"[2, 1]",[general],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-08-24T12:42:34.335171Z,0,1,1,0,0.0,aa 2 1 general general general general
5,0c9f7ece-bf55-4193-b46d-ad8ab871246d,vote best tv show time,"[Friends, The Office (US), Game of Thrones, Br...",[movies tv shows],Public,61400ff7-531a-425e-a506-e2a900eec613,2023-08-17T06:45:42.725365Z,1,8,6,1,0.0,vote best tv show time Friends The Office (US)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,c9e86380-41a2-4c66-b485-e2d978ff7711,studying english,"[I’m thinking of studying in England., Because...","[politics, tech, sport]",Public,66271a97-73ba-41c8-b460-23d166e4c020,2023-06-14T07:44:05.158609Z,2,4,1,18,0.0,studying english I’m thinking of studying in E...
125,e749a6a3-592b-42be-a041-2eb155a9e95c,social media platoform prefer conduct poll,"[Facebook, Instagram , LinkedIn, Pollett , Twi...","[tech, politics, science]",Private,08f0071c-397c-420d-a1fb-32f613a73398,2023-05-14T15:44:22.843829Z,4,11,8,1,0.0,social media platoform prefer conduct poll Fac...
128,a4e77a01-7e82-4bd7-8910-4333d01a96c4,string,"[string, string]",[tech],Public,67eb27ca-ba0b-4d29-8627-9ec78327b512,0001-01-01T00:00:00,0,0,0,0,0.0,string string string tech tech tech tech
129,188c9313-a751-4211-85ab-14290e6c853d,string,"[string, string]",[tech],Public,67eb27ca-ba0b-4d29-8627-9ec78327b512,0001-01-01T00:00:00,0,0,0,0,0.0,string string string tech tech tech tech
