# Importing necessary libraries

In [1]:
import getpass
import json
import os

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to fetch survey responses and save them as JSON files

In [2]:
def get_survey_responses():
    """Download every page of G2 survey responses into ``responses/``.

    Pages are requested one at a time, starting at page 1. Each non-empty page
    is written to ``responses/survey_responses_<page>.json`` (indented JSON
    followed by a newline). Paging stops at the first page whose ``data``
    entry is empty or missing.

    The API token is read from the ``G2_API_TOKEN`` environment variable so
    that no credential is stored in the notebook.

    Raises:
        RuntimeError: if ``G2_API_TOKEN`` is not set.
        requests.HTTPError: if the API answers with an error status code.
    """
    # Validate the credential first so a misconfigured environment fails
    # before any directory is created or any request is sent.
    api_token = os.environ.get("G2_API_TOKEN")
    if not api_token:
        raise RuntimeError(
            "Set the G2_API_TOKEN environment variable to your G2 API token")

    url = "https://data.g2.com/api/v1/survey-responses"
    headers = {
        "Authorization": f"Token token={api_token}",
        "Content-Type": "application/vnd.api+json"
    }

    os.makedirs("responses", exist_ok=True)

    page_number = 1
    while True:
        response = requests.get(
            url,
            headers=headers,
            params={"page[number]": page_number},
            timeout=30,  # never hang forever on a stalled connection
        )
        # Surface HTTP errors (401, 429, 5xx, ...) instead of trying to parse
        # an error body as if it were a page of results.
        response.raise_for_status()
        data = response.json()

        # An empty (or absent) 'data' list marks the end of the result set.
        if not data.get('data'):
            break

        with open(f"responses/survey_responses_{page_number}.json", "w") as json_file:
            json.dump(data, json_file, indent=4)
            json_file.write('\n')

        print(f"Data written to survey_responses_{page_number}.json")
        page_number += 1




# Fetch every page of survey responses and save each one under responses/.
get_survey_responses()

Data written to survey_responses_1.json
Data written to survey_responses_2.json
Data written to survey_responses_3.json
Data written to survey_responses_4.json
Data written to survey_responses_5.json
Data written to survey_responses_6.json
Data written to survey_responses_7.json
Data written to survey_responses_8.json
Data written to survey_responses_9.json
Data written to survey_responses_10.json
Data written to survey_responses_11.json
Data written to survey_responses_12.json
Data written to survey_responses_13.json
Data written to survey_responses_14.json
Data written to survey_responses_15.json
Data written to survey_responses_16.json
Data written to survey_responses_17.json
Data written to survey_responses_18.json
Data written to survey_responses_19.json
Data written to survey_responses_20.json
Data written to survey_responses_21.json
Data written to survey_responses_22.json
Data written to survey_responses_23.json
Data written to survey_responses_24.json
Data written to survey_re

In [3]:
def load_survey_responses(directory="responses"):
    """Load the saved survey-response pages into a single DataFrame.

    Reads ``<directory>/survey_responses_1.json``, ``..._2.json``, ... in
    order and stops at the first page number whose file does not exist, so
    the number of pages does not have to be known in advance.

    Args:
        directory: folder containing the ``survey_responses_<n>.json`` files.

    Returns:
        A DataFrame with one row per response and three object columns:
        ``attributes`` (the response's attribute dict), ``comment_answers``
        and ``secondary_answers`` (the dicts stored under those keys inside
        ``attributes``). If nothing could be loaded, an empty DataFrame with
        these columns is returned and a message is printed.
    """
    columns = ['attributes', 'comment_answers', 'secondary_answers']
    frames = []
    page = 1
    while True:
        path = os.path.join(directory, f"survey_responses_{page}.json")
        try:
            with open(path) as file:
                payload = json.load(file)
            records = [
                {
                    'attributes': item['attributes'],
                    'comment_answers': item['attributes']['comment_answers'],
                    'secondary_answers': item['attributes']['secondary_answers'],
                }
                for item in payload['data']
            ]
            if records:
                frames.append(pd.DataFrame(records, columns=columns))
        except FileNotFoundError:
            # First missing page number marks the end of the downloaded pages.
            break
        except (ValueError, KeyError, TypeError) as e:
            # Malformed JSON or an unexpected payload shape: report and skip
            # this page. Other OS-level errors are left to propagate so a
            # broken directory cannot cause an endless loop.
            print(f"Error processing file {page}: {e}")
        page += 1

    if not frames:
        print(f"No survey responses could be loaded from {directory!r}")
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


# Combine every downloaded page into one frame
data = load_survey_responses()
data.head()

Unnamed: 0,attributes,comment_answers,secondary_answers
0,"{'default_sort': 1.2710259698084911, 'product_...",{'love': {'text': 'What do you like best about...,{'ease_of_doing_business_with': {'text': 'Has ...
1,"{'default_sort': 2.5346166321766437, 'product_...",{'love': {'text': 'What do you like best about...,{'ease_of_doing_business_with': {'text': 'Has ...
2,"{'default_sort': 2.8232746888863036, 'product_...",{'love': {'text': 'What do you like best about...,{'ease_of_doing_business_with': {'text': 'Has ...
3,"{'default_sort': 3.098995914434322, 'product_n...",{'love': {'text': 'What do you like best about...,{'meets_requirements': {'text': 'Meets Require...
4,"{'default_sort': 3.247624428366846, 'product_n...",{'love': {'text': 'What do you like best about...,{'meets_requirements': {'text': 'Meets Require...


# Feature Engineering and Data Preprocessing

In [4]:
# Flatten each response's answers into one {question_key: value} record.
# Secondary answers are merged after comment answers, so if a key appears in
# both, the secondary value wins. Answers without a 'value' entry become 'na'.
answer_records = []
for comment_answers, secondary_answers in zip(data['comment_answers'], data['secondary_answers']):
    record = {key: answer.get('value', 'na')
              for key, answer in comment_answers.items()}
    record.update({key: answer.get('value', 'na')
                   for key, answer in secondary_answers.items()})
    answer_records.append(record)

# Question keys in the order they are first encountered across the responses.
answer_columns = list(dict.fromkeys(
    key for record in answer_records for key in record))

# Assign whole columns at once rather than writing cell-by-cell through
# chained indexing (data[key][i] = ...), which pandas does not guarantee to
# write back into the frame. dtype=object keeps None for responses that did
# not answer a question, so the text clean-up below can replace it with ''.
for column in answer_columns:
    data[column] = pd.Series(
        [record.get(column) for record in answer_records],
        index=data.index, dtype=object)

# Drop unnecessary columns
data = data.drop(columns=['comment_answers', 'secondary_answers'])

# Extract additional features
data['review_source'] = data['attributes'].apply(
    lambda attributes: attributes['review_source'])
# A review counts as incentivised when its source text says the reviewer
# "was offered" something.
data['is_incentivised'] = data['review_source'].apply(
    lambda source: 'was offered' in source)

# Preprocess text columns: replace missing/empty answers with ''
text_columns = ['love', 'hate', 'recommendations', 'benefits']
for col in text_columns:
    data[col] = data[col].apply(lambda text: text if text else '')
data.to_csv('survey_responses.csv', index=False, header=True)
data.describe()

Unnamed: 0,attributes,love,hate,ease_of_doing_business_with,recommendations,benefits,meets_requirements,ease_of_use,quality_of_support,ease_of_setup,ease_of_admin,service_task_overview,review_source,is_incentivised
count,762,762.0,762.0,386.0,762.0,762.0,534.0,541.0,535.0,360.0,374.0,6,762,762
unique,762,742.0,742.0,6.0,149.0,734.0,7.0,7.0,7.0,6.0,5.0,6,7,2
top,"{'default_sort': 1.2710259698084911, 'product_...",,,7.0,,,7.0,7.0,7.0,7.0,7.0,We have been working with G2 Crowd since 2014 ...,G2 Gives Campaign. This reviewer was offered a...,True
freq,1,21.0,21.0,291.0,614.0,29.0,276.0,292.0,447.0,206.0,213.0,1,294,557


# Feature Extraction and Model Building

In [5]:
# TF-IDF (Term Frequency-Inverse Document Frequency)
def top_tfidf_phrases(documents, top_n=100, ngram_range=(7, 7)):
    """Return the ``top_n`` n-gram phrases with the highest mean TF-IDF score.

    Args:
        documents: list of text strings; each string is one document.
        top_n: maximum number of phrases to return.
        ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``; the
            default ``(7, 7)`` extracts 7-word phrases after English
            stop-word removal.

    Returns:
        List of phrases ordered from highest to lowest mean TF-IDF score
        across all documents.

    Raises:
        ValueError: raised by ``TfidfVectorizer`` when no phrase can be
            extracted from ``documents`` (empty vocabulary).
    """
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Average each phrase's score over the documents directly on the sparse
    # matrix; converting to a dense array first would allocate
    # n_documents x n_phrases floats for no benefit.
    mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    top_indices = mean_scores.argsort()[::-1][:top_n]
    return [feature_names[idx] for idx in top_indices]


# Positive corpus: "love" + "benefits" answers; negative corpus: "hate" +
# "recommendations" answers.
all_love_benefits = (data['love'].to_list() + data['benefits'].to_list())
all_hate_recommendations = (
    data['hate'].to_list() + data['recommendations'].to_list())

# Extract the top phrases for each corpus
top_word_pairs_love_benefits = top_tfidf_phrases(all_love_benefits)
top_word_pairs_hate_recommendations = top_tfidf_phrases(
    all_hate_recommendations)

# Print top phrases
print("Top 100 TF-IDF Word Pairs for Love and Benefits:")
print(", ".join(top_word_pairs_love_benefits))
print("\nTop 100 TF-IDF Word Pairs for Hate and Recommendations:")
print(", ".join(top_word_pairs_hate_recommendations))

Top 100 TF-IDF Word Pairs for Love and Benefits:
g2 helping gather feedback customers better understand, generating awareness using reviews useful sales tool, use g2 position market place drive products, social proof authorization capture high intent prospects, helping generate product reviews improve ranking competitors, simple start ease use highly communicative friendly, online reviews help consumers make purchasing decisions, tools funnel new leads helping sales close, helps comparing competitors need improve good customers, like software ranks competition way consumer focused, helps showcase company category leader differentiates competition, seeing progress make review count quarter quarter, numerical value showing company support vs competition, ve using grid report compare product competitors, g2 helps brand awareness b2b comparison competitors, hoping increase presence online build reputable g2, incorporating review process enable consistent reviews quarter, g2 important platf

# Feature Selection using LLM

In [6]:
from hugchat import hugchat
from hugchat.login import Login

# Log in to huggingface and grant authorization to huggingchat.
# Credentials are never stored in the notebook: they are read from the
# HUGGINGFACE_EMAIL / HUGGINGFACE_PASSWORD environment variables, with an
# interactive prompt as a fallback (the password prompt does not echo).
EMAIL = os.environ.get("HUGGINGFACE_EMAIL") or input("Hugging Face email: ")
PASSWD = os.environ.get("HUGGINGFACE_PASSWORD") or getpass.getpass(
    "Hugging Face password: ")
# NOTE: trailing slash (/) is required to avoid errors
cookie_path_dir = "./cookies/"
sign = Login(EMAIL, PASSWD)
cookies = sign.login(cookie_dir_path=cookie_path_dir, save_cookies=True)

# Create your ChatBot
# or cookie_path="usercookies/<email>.json"
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())

# Build the prompts for feature-set generation

In [7]:
# Prompt for the positive reviews: fixed instructions followed by the
# comma-separated top phrases from the "love"/"benefits" answers.
# The four-space runs between sentences are part of the prompt text.
messages_positive = (
    "I am developing a system for filtering out useful feature sets from reviews of a software product."
    "    For that I have filtered out the most frequently used phrases/word sets in reviews."
    "    Help me generate 10 feature sets from my set of phrases. Give me a small description for each."
    "    Here is the feature set: "
    + ", ".join(top_word_pairs_love_benefits)
)

# Prompt for the negative reviews: same layout, using the top phrases from
# the "hate"/"recommendations" answers.
messages_negative = (
    "I am developing a system for filtering out useful feature sets from bad reviews of a software product."
    "    For that I have filtered out the most frequently used phrases/word sets in reviews."
    "    Help me generate 10 feature sets from my set of phrases that the users have found problematic. Give me a small description for each."
    "    "
    + ", ".join(top_word_pairs_hate_recommendations)
)

## Generate feature sets for positive reviews

In [8]:
# Ask the chat model for feature sets from the positive-review prompt and
# keep a copy of the answer on disk.
response_positive = chatbot.chat(messages_positive)
# Write as UTF-8 explicitly: the model's text may contain characters that the
# platform's default encoding cannot represent.
with open('response_positive.txt', 'w', encoding='utf-8') as file:
    file.write(str(response_positive))
print(response_positive)

Here are 10 feature sets with descriptions derived from the provided phrases: 

1. **G2 for Brand Awareness and Lead Generation**: G2 helps businesses, especially in the B2B space, to increase their brand awareness and generate leads. By showcasing reviews and comparisons with competitors, businesses can differentiate themselves and attract potential customers who are researching similar products. 

2. **Understanding Buyers' Intent**: Through G2's reviews and analytics, companies can better understand their buyers' needs, preferences, and pain points. This feature set helps businesses tailor their products and marketing strategies to align with their target audience's intentions, thereby improving conversion rates. 

3. **Competitive Intelligence**: G2 provides valuable insights into competitors' products, features, and customer sentiments. This feature set enables businesses to identify their unique selling points and develop effective go-to-market strategies by understanding their p

## Generate feature sets for negative reviews

In [9]:
# Ask the chat model for feature sets from the negative-review prompt and
# keep a copy of the answer on disk.
response_negative = chatbot.chat(messages_negative)
# Write as UTF-8 explicitly: the model's text may contain characters that the
# platform's default encoding cannot represent.
with open('response_negative.txt', 'w', encoding='utf-8') as file:
    file.write(str(response_negative))
print(response_negative)

Here are 10 feature sets with descriptions of problematic aspects derived from the provided negative review phrases: 

1. **Usability and Mobile Experience**: Users have encountered issues with the website's usability, especially on mobile devices. Admin mode and certain functionalities seem to be problematic, impacting the overall user experience. 

2. **Cost and Pricing Plans**: Several users find G2 to be expensive, particularly for startups or small businesses. The cost of accessing certain features and the difficulty in measuring ROI are concerns raised by reviewers. 

3. **Lead Generation and Integration**: While G2 promises lead generation, some users find the leads to be unqualified or difficult to integrate with their existing systems. There are also concerns about the additional cost of certain lead generation features. 

4. **Grid Reports and Methodology**: Users seek more transparency and clarity in G2's grid report methodology and algorithms. The process of replicating gri

In [11]:
# Run app.py with Streamlit via a shell command.
!streamlit run app.py