# Importing necessary libraries

In [3]:
import json
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Function to fetch survey responses and save them as JSON files

In [25]:
def get_survey_responses(api_token=None, output_dir="responses"):
    """Download every page of G2 survey responses and save each page as JSON.

    Pages are requested one after another, starting at page 1, until the API
    returns a payload whose ``data`` entry is empty or missing. Every
    non-empty page is written to ``<output_dir>/survey_responses_<page>.json``.

    Args:
        api_token: API token sent in the ``Authorization`` header. When
            omitted it is read from the ``G2_API_TOKEN`` environment variable
            so the secret does not have to live in the notebook.
        output_dir: Directory that receives the JSON files; created if it
            does not exist yet.

    Raises:
        RuntimeError: If no token is passed and ``G2_API_TOKEN`` is not set.
        requests.HTTPError: If the API answers with an error status code.
    """
    if api_token is None:
        api_token = os.environ.get("G2_API_TOKEN")
    if not api_token:
        raise RuntimeError(
            "No API token available: pass api_token or set the "
            "G2_API_TOKEN environment variable."
        )

    url = "https://data.g2.com/api/v1/survey-responses"
    headers = {
        "Authorization": f"Token token={api_token}",
        "Content-Type": "application/vnd.api+json",
    }

    os.makedirs(output_dir, exist_ok=True)

    page_number = 1
    while True:
        response = requests.get(
            url,
            headers=headers,
            params={"page[number]": page_number},
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # Surface auth/rate-limit failures instead of saving an error payload.
        response.raise_for_status()
        data = response.json()

        # An empty (or absent) "data" list marks the end of the pagination.
        if not data.get("data"):
            break

        file_name = f"survey_responses_{page_number}.json"
        with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, indent=4)
            json_file.write('\n')

        print(f"Data written to {file_name}")
        page_number += 1




# Run the download: requests page after page from the API and writes one
# JSON file per non-empty page under responses/.
get_survey_responses()

Data written to survey_responses_1.json
Data written to survey_responses_2.json
Data written to survey_responses_3.json
Data written to survey_responses_4.json
Data written to survey_responses_5.json
Data written to survey_responses_6.json
Data written to survey_responses_7.json
Data written to survey_responses_8.json
Data written to survey_responses_9.json
Data written to survey_responses_10.json
Data written to survey_responses_11.json
Data written to survey_responses_12.json
Data written to survey_responses_13.json
Data written to survey_responses_14.json
Data written to survey_responses_15.json
Data written to survey_responses_16.json
Data written to survey_responses_17.json
Data written to survey_responses_18.json
Data written to survey_responses_19.json
Data written to survey_responses_20.json
Data written to survey_responses_21.json
Data written to survey_responses_22.json
Data written to survey_responses_23.json
Data written to survey_responses_24.json
Data written to survey_re

In [10]:
def load_survey_responses(directory="responses"):
    """Load the saved survey-response pages into a single DataFrame.

    Files named ``survey_responses_<n>.json`` are read from ``directory`` for
    n = 1, 2, 3, ... until the first missing file, so the number of pages is
    discovered rather than hard-coded.

    Args:
        directory: Folder containing the page files.

    Returns:
        A DataFrame with one row per survey response and three object
        columns: ``attributes`` (the full attributes dict),
        ``comment_answers`` and ``secondary_answers`` (the two nested dicts
        pulled out of ``attributes``). The frame is empty, with the same
        columns, when no page file exists.
    """
    columns = ['attributes', 'comment_answers', 'secondary_answers']
    frames = []
    page = 1
    while True:
        path = os.path.join(directory, f'survey_responses_{page}.json')
        try:
            with open(path, encoding='utf-8') as file:
                file_json = json.load(file)
            records = [
                {
                    'attributes': item['attributes'],
                    'comment_answers': item['attributes']['comment_answers'],
                    'secondary_answers': item['attributes']['secondary_answers'],
                }
                for item in file_json['data']
            ]
            if records:
                frames.append(pd.DataFrame(records, columns=columns))
        except FileNotFoundError:
            # First gap in the numbering marks the end of the saved pages.
            break
        except Exception as e:
            # Best effort: report a malformed page and keep going.
            print(f"Error processing file {page}: {e}")
        page += 1

    if not frames:
        print(f"No survey response files found in {directory!r}; "
              "run get_survey_responses() first.")
        return pd.DataFrame(columns=columns)

    # Concatenate all pages into one frame with a fresh 0..n-1 index.
    return pd.concat(frames, ignore_index=True)


data = load_survey_responses()
data.head()

Unnamed: 0,attributes,comment_answers,secondary_answers
0,"{'default_sort': 1.2710259698084911, 'product_...",{'love': {'text': 'What do you like best about...,{'ease_of_doing_business_with': {'text': 'Has ...
1,"{'default_sort': 2.5346166321766437, 'product_...",{'love': {'text': 'What do you like best about...,{'ease_of_doing_business_with': {'text': 'Has ...
2,"{'default_sort': 2.8232746888863036, 'product_...",{'love': {'text': 'What do you like best about...,{'ease_of_doing_business_with': {'text': 'Has ...
3,"{'default_sort': 3.098995914434322, 'product_n...",{'love': {'text': 'What do you like best about...,{'meets_requirements': {'text': 'Meets Require...
4,"{'default_sort': 3.247624428366846, 'product_n...",{'love': {'text': 'What do you like best about...,{'meets_requirements': {'text': 'Meets Require...


# Feature Engineering and Data Preprocessing

In [11]:
# Flatten the nested 'comment_answers' / 'secondary_answers' dicts into one
# column per question key. Values are gathered per column in plain lists and
# assigned in one step; element-wise chained assignment (data[key][i] = ...)
# is avoided because pandas warns about it and it no longer writes through
# once Copy-on-Write is enabled.
answer_columns = {}
row_count = len(data)
for row, (comment_answers, secondary_answers) in enumerate(
        zip(data['comment_answers'], data['secondary_answers'])):
    # Secondary answers are processed last, so they win on a key collision.
    for answers in (comment_answers, secondary_answers):
        for key, answer in answers.items():
            # Rows that lack a question keep None; answers without a
            # 'value' entry are recorded as 'na'.
            answer_columns.setdefault(key, [None] * row_count)[row] = answer.get('value', 'na')

for key, values in answer_columns.items():
    data[key] = pd.Series(values, index=data.index, dtype=object)

# Drop unnecessary columns
data = data.drop(columns=['comment_answers', 'secondary_answers'])

# Extract additional features
data['review_source'] = data['attributes'].apply(lambda attrs: attrs['review_source'])
# A review counts as incentivised when its source text mentions an offer;
# non-string sources (e.g. None) are treated as not incentivised.
data['is_incentivised'] = data['review_source'].apply(
    lambda source: isinstance(source, str) and 'was offered' in source)

# Preprocess text columns: replace missing/empty answers with '' so the
# downstream text vectorizer only ever sees strings.
text_columns = ['love', 'hate', 'recommendations', 'benefits']
for col in text_columns:
    if col not in data.columns:
        # Question never answered in this data set: keep the column present.
        data[col] = ''
    else:
        data[col] = data[col].apply(lambda text: text if text else '')
data.to_csv('survey_responses.csv', index=False, header=True)
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   attributes                   762 non-null    object
 1   love                         762 non-null    object
 2   hate                         762 non-null    object
 3   ease_of_doing_business_with  386 non-null    object
 4   recommendations              762 non-null    object
 5   benefits                     762 non-null    object
 6   meets_requirements           534 non-null    object
 7   ease_of_use                  541 non-null    object
 8   quality_of_support           535 non-null    object
 9   ease_of_setup                360 non-null    object
 10  ease_of_admin                374 non-null    object
 11  service_task_overview        6 non-null      object
 12  review_source                762 non-null    object
 13  is_incentivised              762 no

# Feature Extraction and Model Building

In [28]:
# TF-IDF (Term Frequency-Inverse Document Frequency)
def top_tfidf_ngrams(texts, top_n=100, ngram_range=(7, 7)):
    """Return the ``top_n`` n-grams of ``texts`` ranked by mean TF-IDF score.

    A fresh ``TfidfVectorizer`` (English stop words removed) is fitted on
    ``texts``; each n-gram is scored by its TF-IDF value averaged over all
    documents, and the highest-scoring n-grams are returned first.

    Args:
        texts: List of review strings.
        top_n: Number of n-grams to return.
        ngram_range: ``(min_n, max_n)`` passed to the vectorizer; the default
            extracts 7-word phrases.

    Returns:
        List of n-gram strings, best score first.
    """
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    scores = vectorizer.fit_transform(texts).toarray()
    feature_names = vectorizer.get_feature_names_out()
    top_indices = scores.mean(axis=0).argsort()[::-1][:top_n]
    return [feature_names[idx] for idx in top_indices]


# Positive corpus: what reviewers like plus the benefits they report.
all_love_benefits = data['love'].to_list() + data['benefits'].to_list()
# Negative corpus: what reviewers dislike plus their recommendations.
all_hate_recommendations = data['hate'].to_list() + data['recommendations'].to_list()

# Top phrases per corpus (7-word phrases with the default ngram_range)
top_word_pairs_love_benefits = top_tfidf_ngrams(all_love_benefits)
top_word_pairs_hate_recommendations = top_tfidf_ngrams(all_hate_recommendations)

# Print top phrases
print("Top 100 TF-IDF Word Pairs for Love and Benefits:")
print(", ".join(top_word_pairs_love_benefits))
print("\nTop 100 TF-IDF Word Pairs for Hate and Recommendations:")
print(", ".join(top_word_pairs_hate_recommendations))

Top 100 TF-IDF Word Pairs for Love and Benefits:
g2 helping gather feedback customers better understand, generating awareness using reviews useful sales tool, use g2 position market place drive products, social proof authorization capture high intent prospects, helping generate product reviews improve ranking competitors, simple start ease use highly communicative friendly, online reviews help consumers make purchasing decisions, tools funnel new leads helping sales close, helps comparing competitors need improve good customers, like software ranks competition way consumer focused, helps showcase company category leader differentiates competition, seeing progress make review count quarter quarter, numerical value showing company support vs competition, ve using grid report compare product competitors, g2 helps brand awareness b2b comparison competitors, hoping increase presence online build reputable g2, incorporating review process enable consistent reviews quarter, g2 important platf

# Feature Selection using LLM

In [29]:
from getpass import getpass

from hugchat import hugchat
from hugchat.login import Login

# Log in to huggingface and grant authorization to huggingchat.
# Credentials are read from the environment (HUGGINGFACE_EMAIL /
# HUGGINGFACE_PASSWORD) and requested interactively when unset, so no
# account secret is ever stored in the notebook.
EMAIL = os.environ.get("HUGGINGFACE_EMAIL") or input("Hugging Face email: ")
PASSWD = os.environ.get("HUGGINGFACE_PASSWORD") or getpass("Hugging Face password: ")
# NOTE: trailing slash (/) is required to avoid errors
cookie_path_dir = "./cookies/"
sign = Login(EMAIL, PASSWD)
cookies = sign.login(cookie_dir_path=cookie_path_dir, save_cookies=True)

# Create your ChatBot
# or cookie_path="usercookies/<email>.json"
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())

# Build the prompts for feature-set generation

In [30]:
# Prompt for the positive reviews: fixed instructions followed by the
# comma-separated top phrases from the love/benefits corpus.
messages_positive = (
    "I am developing a system for filtering out useful feature sets from reviews of a software product."
    "    For that I have filtered out the most frequently used phrases/word sets in reviews."
    "    Help me generate 10 feature sets from my set of phrases. Give me a small description for each."
    "    Here is the feature set: "
    + ", ".join(top_word_pairs_love_benefits)
)

# Prompt for the negative reviews: same structure, built from the top
# phrases of the hate/recommendations corpus.
messages_negative = (
    "I am developing a system for filtering out useful feature sets from bad reviews of a software product."
    "    For that I have filtered out the most frequently used phrases/word sets in reviews."
    "    Help me generate 10 feature sets from my set of phrases that the users have found problematic. Give me a small description for each."
    "    "
    + ", ".join(top_word_pairs_hate_recommendations)
)

## Generate feature sets for positive reviews

In [31]:
# Send the positive-review prompt to the chatbot and keep a copy on disk.
response_positive = chatbot.chat(messages_positive)
# Explicit UTF-8: the reply is free-form text, and the platform default
# encoding may not be able to represent every character in it.
with open('response_positive.txt', 'w', encoding='utf-8') as file:
    file.write(str(response_positive))
print(response_positive)

Here are 10 feature sets with descriptions derived from the provided phrases: 

1. **G2 for Lead Generation and Brand Awareness**: G2 helps businesses generate leads and build brand awareness, especially in the B2B space. It provides a platform to showcase products, gather customer feedback, and establish credibility through reviews, helping businesses stand out in a crowded marketplace. 

2. **Competitive Intelligence and Market Insights**: G2 offers valuable insights into the competitor landscape, market trends, and buyer behavior. Businesses can use G2 to understand their positioning, identify areas of improvement, and make data-driven decisions to enhance their products and marketing strategies. 

3. **Easy Onboarding and User-Friendly Interface**: The G2 platform is known for its simplicity and ease of use. Its intuitive interface makes it easy for clients and customers to leave reviews, providing a seamless experience that encourages engagement and improves the likelihood of gene

## Generate feature sets for negative reviews

In [32]:
# Send the negative-review prompt to the chatbot and keep a copy on disk.
response_negative = chatbot.chat(messages_negative)
# Explicit UTF-8: the reply is free-form text, and the platform default
# encoding may not be able to represent every character in it.
with open('response_negative.txt', 'w', encoding='utf-8') as file:
    file.write(str(response_negative))
print(response_negative)

Here are 10 feature sets with descriptions of problematic aspects, derived from the provided negative review phrases: 

1. **Usability and Mobile Experience**: Users have encountered issues with the website's usability, especially on mobile devices. Admin mode and certain functionalities seem to be problematic, impacting the overall user experience and making it cumbersome to manage the platform on the go. 

2. **Cost and Pricing Plans**: Several users find G2 to be expensive, particularly for startups or smaller businesses. The cost of accessing certain features and the need for higher-tier plans to utilize advanced functionalities are common concerns. Some also mention that justifying the ROI can be challenging. 

3. **Lead Generation and Quality**: While G2 promises lead generation, some users are disappointed with the quality and volume of leads received. They find the leads to be highly unqualified or difficult to integrate into their existing systems, questioning the value of G2 

In [14]:
# Start the Streamlit app defined in app.py via a shell command.
# NOTE(review): the cell presumably keeps running until interrupted (the
# recorded output shows ^C) — confirm before using Run All.
!streamlit run app.py

^C
