In [10]:
import pandas as pd

# Read the scraped essays from the CSV file into a dataframe
df = pd.read_csv('scraped_essays.csv')

# Display the first few rows of the dataframe (last expression of the cell, so it is rendered)
df.head()

Unnamed: 0,author,school,year_written,grade,essay,title
0,Skyler,Hwa Chong Institution,2021,A,Hong Kong no doubt boasts some of the wealthie...,‘Prosperity is all that matters; everything el...
1,Skyler,Hwa Chong Institution,2021,A,"Freedom of speech is, to the Western world, a ...",Do you agree that freedom of speech should nev...
2,Skyler,Hwa Chong Institution,2021,A,"Since time immemorial, humanity has been plagu...","‘When children grow up exposed to violence, th..."
3,Junwei,Hwa Chong Institution,2020,36,The 2020 U.S. Presidential Election seemed to ...,A government that fails to win the trust of th...
4,Junwei,Hwa Chong Institution,2020,35/50,"In the aftermath of the Second World War, the ...",To what extent is the use of violence in today...


In [None]:
import os

import ollama
from groq import Groq

In [24]:
# Get the essay text of the row labelled 0 (the first row when the index is the default one);
# it serves as a single sample input for trying out the rewrite functions.
essay_text = df.loc[0, 'essay']

In [25]:
# Initialize the Ollama client with its default constructor arguments
ollama_client = ollama.Client()

# Read the Groq API key from the environment instead of hardcoding it in the
# notebook: a key pasted into a cell is leaked to everyone who can read the file.
# NOTE(review): the key that used to be stored here should be revoked and rotated.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment that launches the kernel."
    )
groq_client = Groq(api_key=GROQ_API_KEY)

In [None]:
def rewrite_essay_to_poor_quality_gemma2(client, essay_text: str) -> str:
    """
    Produce a deliberately degraded rewrite of an essay with the gemma2 model.

    The essay is embedded in an instruction prompt that asks the model to
    introduce grammatical mistakes while keeping the meaning, and to finish
    with a self-assigned "### Score: XX/50" line.

    Args:
        client: Object exposing ``generate(model=..., prompt=...)`` and returning
            a mapping with a 'response' entry (the notebook passes an Ollama client).
        essay_text (str): The original essay text to be rewritten.

    Returns:
        str: The 'response' entry of the model reply, returned unmodified.
    """
    # The leading whitespace inside the literal is part of the prompt sent to the model.
    instructions = f"""
    You are a student researcher tasked with rewriting high school student essays into poorer quality versions as examples. 
    Introduce grammatical mistakes while keeping the original meaning intact. 
    Make it sound like the high school student tried their best to write but failed.
    {essay_text}

    In your response, only give the rewritten essay. Do not include any additional comments or explanations.

    ### After the rewrite, give a score out of 50. Include the score in the response like this:
    ### Score: XX/50
    """
    reply = client.generate(model="gemma2", prompt=instructions)
    return reply['response']

In [None]:


def rewrite_to_poor_quality_llama(essay_text: str, client=None) -> str:
    """Rewrite the essay into a poorer quality version with the llama3-70b-8192 model.

    Args:
        essay_text (str): The original essay text to be rewritten.
        client: Optional chat client exposing ``chat.completions.create(...)``.
            When omitted, the notebook-level ``groq_client`` is used, so existing
            single-argument calls behave as before.

    Returns:
        str: The model's reply with surrounding whitespace stripped, or an empty
        string when the reply carries no text content.
    """
    if client is None:
        # Fall back to the notebook-level client to stay backward compatible.
        client = groq_client

    system_prompt = """
    You are an student researcher tasked with rewriting high school student essays into poorer quality versions as examples. 
    Introduce grammatical mistakes, wrong use of words, poor sentence structure, and incorrect examples while keeping the original meaning intact. 
    Make it sound like the high school student tried their best to write but failed.
    """
    user_message = f"""
    ### Essay:
    {essay_text}

    In your response, only give the rewritten essay. Do not include any additional comments or explanations.

    ### After the rewrite, give a score out of 50. Include the score in the response like this:
    ### Score: XX/50
    """

    response = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_message},
        ],
        temperature=0.7  # Slight randomness for diverse outputs
    )
    content = response.choices[0].message.content
    # NOTE(review): chat-completion SDKs commonly type message content as optional;
    # guard against None so one empty reply does not abort a long batch run.
    return (content or "").strip()

In [None]:
# Try the Llama rewrite on the single sample essay and show the raw reply
rewrite_to_poor_quality_llama(essay_text)



In [36]:
# Try the gemma2 rewrite on the same sample essay and show the raw reply
rewrite_essay_to_poor_quality_gemma2(ollama_client, essay_text)



In [None]:
# Collect one record per source essay; the records become a dataframe afterwards
results = []

# Walk the title and essay columns in row order and rewrite each essay with
# both models (Llama first, then gemma2)
for title, essay in zip(df['title'], df['essay']):
    results.append({
        'title': title,
        'rewritten_llama': rewrite_to_poor_quality_llama(essay),
        'rewritten_gemma2': rewrite_essay_to_poor_quality_gemma2(ollama_client, essay),
    })

# Build the dataframe once from the accumulated records
rewritten_df = pd.DataFrame(results)

# Preview the rewrites
rewritten_df.head()

Unnamed: 0,title,rewritten_lamma,rewritten_gemma2
0,‘Prosperity is all that matters; everything el...,Hong Kong is one of the worlds wealthiest plac...,Hong Kong surely has some of the wealthiest pe...
1,Do you agree that freedom of speech should nev...,Here is the rewritten essay:\n\nFreedom of spe...,"Freedom of speech is, to the Western world, a ..."
2,"‘When children grow up exposed to violence, th...",Here is the rewritten essay:\n\nSince time imm...,"Since time immemorial, humanity has ben plague..."
3,A government that fails to win the trust of th...,Here is the rewritten essay:\n\nThe 2020 U.S. ...,The 2020 U.S. Presidential Election seemed too...
4,To what extent is the use of violence in today...,Here is the rewritten essay:\n\nIn the afterma...,"In the aftermath of the Second World War, the ..."


In [38]:
# Save the rewrites as they stand at this point, without the dataframe index.
# NOTE: a later cell writes merged_df to this same path, so this file is
# overwritten when the notebook is run to the end.
rewritten_df.to_csv('rewritten_essays.csv', index=False)

In [None]:
# Clean the model outputs in rewritten_df. For each model: strip the
# "Here is the rewritten essay:" preamble, move the self-reported score into its
# own numeric column, and remove the score line from the essay text.
preamble_pattern = r'^\s*Here is the rewritten essay:\s*'
# Accepts the requested "### Score: XX/50" form and also tolerates a missing
# "###" marker, different capitalisation, decimals and spaces around the slash.
score_pattern = r'(?i)(?:#+\s*)?Score:\s*(\d+(?:\.\d+)?)\s*/\s*50'

for model_name in ('gemma2', 'llama'):
    text_col = f'rewritten_{model_name}'
    score_col = f'score_{model_name}'

    text = rewritten_df[text_col].str.replace(preamble_pattern, '', regex=True)

    scores = text.str.extract(score_pattern, expand=False).astype(float)
    if score_col in rewritten_df.columns:
        # Re-running this cell: the score line was already stripped from the text,
        # so extraction yields NaN. Keep the previously extracted value instead of
        # overwriting it with NaN.
        scores = scores.fillna(rewritten_df[score_col])

    rewritten_df[score_col] = scores
    rewritten_df[text_col] = text.str.replace(score_pattern, '', regex=True).str.strip()

# Display the updated dataframe
rewritten_df.head()

Unnamed: 0,title,rewritten_lamma,rewritten_gemma2,score_gemma2,score_lamma
0,‘Prosperity is all that matters; everything el...,Hong Kong is one of the worlds wealthiest plac...,Hong Kong surely has some of the wealthiest pe...,32.0,25.0
1,Do you agree that freedom of speech should nev...,"Freedom of speech is, in the Western world, a ...","Freedom of speech is, to the Western world, a ...",32.0,28.0
2,"‘When children grow up exposed to violence, th...","Since time immemorial, humanity has been plagu...","Since time immemorial, humanity has ben plague...",,28.0
3,A government that fails to win the trust of th...,"The 2020 U.S. Presidential Election was like, ...",The 2020 U.S. Presidential Election seemed too...,28.0,28.0
4,To what extent is the use of violence in today...,"In the aftermath of the Second World War, the ...","In the aftermath of the Second World War, the ...",28.0,28.0


In [49]:
# Preview the first rows of rewritten_df again
rewritten_df.head()

Unnamed: 0,title,rewritten_llama,rewritten_gemma2,score_gemma2,score_llama
0,‘Prosperity is all that matters; everything el...,Hong Kong is one of the worlds wealthiest plac...,Hong Kong surely has some of the wealthiest pe...,32.0,25.0
1,Do you agree that freedom of speech should nev...,"Freedom of speech is, in the Western world, a ...","Freedom of speech is, to the Western world, a ...",32.0,28.0
2,"‘When children grow up exposed to violence, th...","Since time immemorial, humanity has been plagu...","Since time immemorial, humanity has ben plague...",,28.0
3,A government that fails to win the trust of th...,"The 2020 U.S. Presidential Election was like, ...",The 2020 U.S. Presidential Election seemed too...,28.0,28.0
4,To what extent is the use of violence in today...,"In the aftermath of the Second World War, the ...","In the aftermath of the Second World War, the ...",28.0,28.0


In [None]:
# Attach the rewrites to the original essay rows.
# A merge keyed on 'title' duplicates rows whenever two essays share a title, so
# when both frames are row-aligned (identical 'title' columns, no other shared
# column names) they are joined by position instead. The resulting columns are
# the same as the title merge would produce: df's columns, then the rewrite columns.
df_positional = df.reset_index(drop=True)
rewrites_positional = rewritten_df.reset_index(drop=True)

shared_columns = (set(df.columns) & set(rewritten_df.columns)) - {'title'}
rows_aligned = not shared_columns and df_positional['title'].equals(
    rewrites_positional['title']
)

if rows_aligned:
    merged_df = pd.concat(
        [df_positional, rewrites_positional.drop(columns='title')],
        axis=1,
    )
else:
    # Frames are not aligned (e.g. filtered or reordered): fall back to the
    # key-based merge on the 'title' column.
    merged_df = pd.merge(df, rewritten_df, on='title')

# Display the merged dataframe
merged_df.head()

Unnamed: 0,author,school,year_written,grade,essay,title,rewritten_llama,rewritten_gemma2,score_gemma2,score_llama
0,Skyler,Hwa Chong Institution,2021,A,Hong Kong no doubt boasts some of the wealthie...,‘Prosperity is all that matters; everything el...,Hong Kong is one of the worlds wealthiest plac...,Hong Kong surely has some of the wealthiest pe...,32.0,25.0
1,Skyler,Hwa Chong Institution,2021,A,"Freedom of speech is, to the Western world, a ...",Do you agree that freedom of speech should nev...,"Freedom of speech is, in the Western world, a ...","Freedom of speech is, to the Western world, a ...",32.0,28.0
2,Skyler,Hwa Chong Institution,2021,A,"Since time immemorial, humanity has been plagu...","‘When children grow up exposed to violence, th...","Since time immemorial, humanity has been plagu...","Since time immemorial, humanity has ben plague...",,28.0
3,Junwei,Hwa Chong Institution,2020,36,The 2020 U.S. Presidential Election seemed to ...,A government that fails to win the trust of th...,"The 2020 U.S. Presidential Election was like, ...",The 2020 U.S. Presidential Election seemed too...,28.0,28.0
4,Junwei,Hwa Chong Institution,2020,35/50,"In the aftermath of the Second World War, the ...",To what extent is the use of violence in today...,"In the aftermath of the Second World War, the ...","In the aftermath of the Second World War, the ...",28.0,28.0


In [None]:
# Save the merged frame (original columns plus rewrites and scores) without the index.
# NOTE: this reuses the path an earlier cell wrote rewritten_df to, replacing that file.
merged_df.to_csv('rewritten_essays.csv', index=False)

In [53]:
import pandas as pd

# Reshape merged_df from wide (one row per essay, one column per version) to
# long (one row per essay version, labelled by 'type').
# merged_df is expected to have the columns:
# ['author', 'school', 'year_written', 'grade', 'essay', 'title', 'rewritten_llama', 'rewritten_gemma2', 'score_gemma2', 'score_llama']
meta_cols = ['author', 'school', 'year_written', 'title']

# Step 1: build one frame per version, each exposing 'text', 'score' and 'type'.
# For the original essays 'score' carries the 'grade' column as-is; for the
# rewrites it carries the per-model score column.
original = (
    merged_df[meta_cols + ['grade', 'essay']]
    .rename(columns={'essay': 'text', 'grade': 'score'})
    .assign(type='original')
)

llama = (
    merged_df[meta_cols + ['rewritten_llama', 'score_llama']]
    .rename(columns={'rewritten_llama': 'text', 'score_llama': 'score'})
    .assign(type='llama')
)

gemma2 = (
    merged_df[meta_cols + ['rewritten_gemma2', 'score_gemma2']]
    .rename(columns={'rewritten_gemma2': 'text', 'score_gemma2': 'score'})
    .assign(type='gemma2')
)

# Step 2: stack the three frames and fix the final column order
stacked = pd.concat([original, llama, gemma2], ignore_index=True)
long_df = stacked[meta_cols + ['type', 'text', 'score']]


In [55]:
# Persist the long-format dataset, then preview it. The preview has to be the
# last expression of the cell; placed before to_csv its result was discarded
# and nothing was rendered.
long_df.to_csv('essays_with_augmentation.csv', index=False)
long_df.head()