<a href="https://colab.research.google.com/github/gtanvi58/stack-on-the-code/blob/master/stackonthecodellmwithoutranking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import itertools
import json
import re

import pandas as pd

# Read the exported answers. The encoding is given explicitly so decoding does
# not depend on the platform's default locale.
with open('/Users/rithikaflorianjohnson/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parallel lists: index i of each list describes the same answer.
owner_reputation = []
score = []
body_text = []

# Extract the fields used for ranking and summarisation.
for post in data:
    # NOTE(review): answers whose author account no longer exists presumably
    # arrive without 'owner' or without 'reputation'; fall back to 0 instead of
    # aborting the whole load with a KeyError.
    owner_reputation.append(post.get('owner', {}).get('reputation', 0))
    score.append(post['score'])
    body_text.append(post['body'])

# Split each post body (HTML) into its code and its non-code parts.
#
# Whole <pre>...</pre> elements are matched instead of scanning line by line:
# the opening tag usually shares a line with the first line of code, and the
# closing line usually starts with </code> rather than </pre>, so a per-line
# startswith() check loses code and never leaves "code mode".
_PRE_BLOCK = re.compile(r"<pre\b[^>]*>(.*?)</pre>", re.DOTALL | re.IGNORECASE)
_CODE_TAG = re.compile(r"</?code\b[^>]*>", re.IGNORECASE)

code_snippets = []
non_code_texts = []

for text in body_text:
    # Contents of every <pre> element, with the wrapping <code> tags removed.
    blocks = [_CODE_TAG.sub("", chunk).strip("\n") for chunk in _PRE_BLOCK.findall(text)]
    code_snippets.append("\n".join(blocks).strip())
    # Everything outside the <pre> elements; an unclosed <pre> is left here as-is.
    non_code_texts.append(_PRE_BLOCK.sub("\n", text).strip())



# Min-max normalise reputation and score into [0, 1].
# The extremes are computed once (not once per element), and default=0 keeps an
# empty dataset from raising ValueError.
rep_min = min(owner_reputation, default=0)
rep_span = max(owner_reputation, default=0) - rep_min
score_min = min(score, default=0)
score_span = max(score, default=0) - score_min

# A zero span means every value is identical; normalise to 0 instead of
# dividing by zero.
normalized_reputation = [
    (rep - rep_min) / rep_span if rep_span else 0.0
    for rep in owner_reputation
]
# Answers with a raw score of exactly 0 are pinned to 0.
normalized_score = [
    (scr - score_min) / score_span if scr != 0 and score_span else 0
    for scr in score
]

# Hardcoded weights for reputation and score
reputation_weight = 0.3
score_weight = 0.7

# Weighted sum of the normalised values; an answer whose normalised score is 0
# gets a combined score of 0 regardless of its author's reputation.
combined_scores = [
    reputation_weight * rep + score_weight * scr if scr != 0 else 0
    for rep, scr in zip(normalized_reputation, normalized_score)
]





# Assemble one row per answer; the dict's insertion order fixes the column order.
columns = {
    'Owner Reputation': owner_reputation,
    'Score': score,
    'Post': body_text,
    'Code Snippet': code_snippets,
    'Text': non_code_texts,
    'Rank': combined_scores,
}

# Order rows by combined score, lowest first (sort_values sorts ascending by default).
df = pd.DataFrame(columns).sort_values('Rank')

# Reset index
#df.reset_index(drop=True, inplace=True)

# Display DataFrame



In [None]:
# 'Rank' is the weighted score (a higher value means a better answer), so the
# top three are the rows with the largest values. Sorting here makes the
# selection independent of how df happens to be ordered.
top3 = df.sort_values(by='Rank', ascending=False).head(3)
top3

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the seq2seq model are loaded from the same pretrained checkpoint.
checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
def generate_summary(model, tokenizer, code, max_length=1024):
    """Summarise *code* with a seq2seq model and return the decoded text.

    Args:
        model: Object exposing ``generate(input_ids, max_length=...)``.
        tokenizer: Callable that encodes text and exposes ``decode``.
        code: The text to summarise, passed to the tokenizer in one piece.
        max_length: Used both as the truncation limit for the encoded input
            and as the ``max_length`` given to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(
        code,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    generated = model.generate(encoded.input_ids, max_length=max_length)
    # Only one input was encoded, so only the first generated sequence is used.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
# Summarise each selected post and print the summary followed by the code
# that was extracted from that post.
bodies = top3['Post'].tolist()
codes = top3['Code Snippet'].tolist()

for post_html, snippet in zip(bodies, codes):
    full_summary = generate_summary(model, tokenizer, post_html)
    print("Full Summary:")
    print(f"{full_summary}\nCode from the Post\n{snippet}")
    print()