In [3]:
import pandas as pd

# Load the scraped essays; later cells read columns such as 'essay', 'title'
# and 'grade' from this frame.
df = pd.read_csv('scraped_essays.csv')

# Preview the first rows as the cell output
df.head()

Unnamed: 0,author,school,year_written,grade,essay,title
0,Skyler,Hwa Chong Institution,2021,A,Hong Kong no doubt boasts some of the wealthie...,‘Prosperity is all that matters; everything el...
1,Skyler,Hwa Chong Institution,2021,A,"Freedom of speech is, to the Western world, a ...",Do you agree that freedom of speech should nev...
2,Skyler,Hwa Chong Institution,2021,A,"Since time immemorial, humanity has been plagu...","‘When children grow up exposed to violence, th..."
3,Junwei,Hwa Chong Institution,2020,36,The 2020 U.S. Presidential Election seemed to ...,A government that fails to win the trust of th...
4,Junwei,Hwa Chong Institution,2020,35/50,"In the aftermath of the Second World War, the ...",To what extent is the use of violence in today...


In [4]:
import os

import ollama
from groq import Groq

In [5]:
# Pull the essay body and its title from the row labelled 0; these two values
# are the sample inputs passed to the rewrite functions in later cells.
essay_text, essay_title = df.loc[0, ['essay', 'title']]

In [6]:
# Client for the Ollama server (default constructor arguments).
ollama_client = ollama.Client()

# The Groq key is read from the environment so that no credential is stored in
# the notebook. Any key that was previously saved in this file in plain text
# should be treated as compromised and rotated.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this notebook."
    )
groq_client = Groq(api_key=GROQ_API_KEY)

In [18]:
def rewrite_essay_to_poor_quality_gemma2(client, essay_title: str, essay_text: str, model: str = "gemma2") -> str:
    """Request a deliberately flawed essay on ``essay_title`` from ``client``.

    Args:
        client: Object exposing ``generate(model=..., prompt=...)`` whose
            result supports ``result['response']`` (the notebook passes
            ``ollama_client``).
        essay_title: Essay question inserted into the prompt as the topic.
        essay_text: Model essay inserted into the prompt as a reference.
        model: Model name forwarded to ``client.generate``. Defaults to
            ``"gemma2"``, the value that used to be hard-coded.

    Returns:
        The ``'response'`` field of the generation result, unmodified. The
        prompt asks for a trailing ``### Score: XX/50`` line, so callers
        should expect (but not rely on) that line being part of the text.
    """
    # The prompt text is kept exactly as before so generations stay comparable.
    prompt = f"""
    You are a student researcher tasked with writing imperfect high school student essays as examples. 
    Write an essay about this topic: {essay_title}.

    The essay should contain some grammatical errors and awkward phrasing.
    Make it sound like the high school student tried their best to write but failed.

    You may refer to this model essay as a reference for the topic, but do not copy it. 
    Do not copy the examples, but try to use novel ones.
    {essay_text}

    Remember to introduce grammatical mistakes, wrong use of words, poor sentence structure, and incorrect examples.

    In your response, only give the essay. Do not include any additional comments or explanations.

    ### After the rewrite, give a score out of 50. Include the score in the response like this:
    ### Score: XX/50
    """
    response = client.generate(model=model, prompt=prompt)
    # Only the generated text is returned; other fields of the result are ignored.
    return response['response']

In [None]:
def rewrite_to_poor_quality_llama(essay_title: str, essay_text: str, client=None,
                                  model: str = "llama3-70b-8192",
                                  temperature: float = 0.7) -> str:
    """Request a deliberately flawed essay on ``essay_title`` via a chat-completions client.

    Args:
        essay_title: Essay question inserted into the user message as the topic.
        essay_text: Model essay inserted into the user message as a reference.
        client: Object exposing ``chat.completions.create(...)``. When omitted,
            the notebook-level ``groq_client`` is used, which was the only
            option before this parameter existed.
        model: Model name forwarded to the API. Defaults to the previously
            hard-coded ``"llama3-70b-8192"``.
        temperature: Sampling temperature forwarded to the API. Defaults to
            the previously hard-coded ``0.7``.

    Returns:
        The content of the first choice's message with surrounding whitespace
        stripped. The prompt asks for a trailing ``### Score: XX/50`` line.
    """
    # Prompt texts are kept exactly as before so generations stay comparable.
    system_prompt = """
    You are a student researcher tasked with writing imperfect high school student essays as examples. 
    Introduce grammatical mistakes, and some awkward phrasing. 
    Make it sound like the high school student tried their best to write but failed.
    """
    user_message = f"""
    Write a essay about this topic: {essay_title}.
    

    You may refer to this model essay as a reference for the topic, but do not copy it. 
    Do not copy the examples, but try to use novel ones.
    ### Example Essay:
    {essay_text}

    Remember to introduce grammatical mistakes. 
    In your response, only give the essay. Do not include any additional comments or explanations.

    ### After the rewrite, give a score out of 50. Include the score in the response like this:
    ### Score: XX/50
    """

    if client is None:
        # Looked up at call time so the function can be defined before the
        # client cell has run.
        client = groq_client

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_message},
        ],
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [35]:
def rewrite_to_off_topic_llama(essay_title: str, essay_text: str, client=None,
                               model: str = "llama3-70b-8192",
                               temperature: float = 0.7) -> str:
    """Request an essay that starts on ``essay_title`` and then drifts off-topic.

    Args:
        essay_title: Essay question inserted into the user message as the topic.
        essay_text: Model essay inserted into the user message as a reference.
        client: Object exposing ``chat.completions.create(...)``. When omitted,
            the notebook-level ``groq_client`` is used, which was the only
            option before this parameter existed.
        model: Model name forwarded to the API. Defaults to the previously
            hard-coded ``"llama3-70b-8192"``.
        temperature: Sampling temperature forwarded to the API. Defaults to
            the previously hard-coded ``0.7``.

    Returns:
        The content of the first choice's message with surrounding whitespace
        stripped. The prompt asks for a trailing ``### Score: XX/50`` line
        with a score below 25.
    """
    # Prompt texts are kept exactly as before so generations stay comparable.
    system_prompt = """
    You are a student researcher tasked with writing imperfect high school student essays as examples. 
    You need to write an essay that is off-topic from the given title.
    The essay should be well written and professional, but it should not address the title well.
    """
    # NOTE(review): the system prompt asks for a well written essay while the
    # user message below still says "Remember to introduce grammatical
    # mistakes." -- confirm which of the two is intended for this variant.
    user_message = f"""
    Write a essay about this topic: {essay_title}.
    However, the essay should start writing about the topic but then stray off-topic and become irrelevant, and do not return to address the title.

    You may refer to this model essay as a reference for the topic, but do not copy it. 
    Do not copy the examples, but try to use novel ones.
    ### Example Essay:
    {essay_text}

    Remember to introduce grammatical mistakes. 
    In your response, only give the essay. Do not include any additional comments or explanations.

    ### After the rewrite, give a score out of 50. As this is off topic, it should only be less than 25 marks. Include the score in the response like this:
    ### Score: XX/50
    """

    if client is None:
        # Looked up at call time so the function can be defined before the
        # client cell has run.
        client = groq_client

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_message},
        ],
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [26]:
# Try the Groq poor-quality rewrite on the sample essay; the returned string is the cell output.
rewrite_to_poor_quality_llama(essay_title, essay_text)

'Here is the rewritten essay:\n\nProsperity is not the only thing that matters. The statement "Prosperity is all that matters; everything else can take a back seat" is a very narrow-minded view. Prosperity is important, but it is not the only thing that is important. In fact, if we prioritize prosperity over everything else, we will end up with a society that is very unequal and unhappy.\n\nTake for example, the city of Dubai. Dubai is a very prosperous city, with many rich people and big businesses. But, if you look deeper, you will see that many of the workers who built Dubai are living in very poor conditions. They are not paid well and do not have good living conditions. This is not a good thing. Prosperity should not just be for the rich, it should be for everyone.\n\nAnother problem with prioritizing prosperity is that it can lead to environmental problems. For example, in China, the air and water are very polluted because of the factories and industries that are there. This is b

In [20]:
# Try the gemma2 rewrite (through ollama_client) on the same sample essay.
rewrite_essay_to_poor_quality_gemma2(ollama_client, essay_title, essay_text)

"Prosperity is all that matters; everything else can take a back seat. That's what some people say, and I kinda get it. Like, when you have money, you can buy stuff, go on vacation, live comfy. Sounds pretty good, right? But hold on a sec! Is that really all there is to life?\n\nI think we gotta look at the bigger picture here.  Sure, being rich is nice, but what about your friends and family? What about helping people who are less fortunate? What about the planet? If you just focus on making money, you can end up forgetting about all the other important stuff. \n\nLike, imagine a world where everyone is super rich, but nobody cares about each other.  Sounds kinda depressing, right? There wouldn't be any kindness or compassion, and that wouldn't make for a very happy place to live. \n\nPlus, think about all the problems in the world – poverty, hunger, pollution. These things can't be solved just by having more money. We need to work together to find solutions, and that means putting th

In [37]:
# Try the off-topic rewrite on the same sample essay.
rewrite_to_off_topic_llama(essay_title, essay_text)

'The concept of prosperity is often associated with wealth and economic growth, but it is also important to consider its impact on individuals and society as a whole. While some may argue that prosperity is the most important thing, I believe that it is not the only factor that should be considered.\n\nIn recent years, there has been a growing trend of people choosing to live in tiny houses. These small homes, often no larger than 500 square feet, are becoming increasingly popular due to their affordability and eco-friendliness. Many people are drawn to the simplicity and freedom that comes with living in a tiny house, and it is not uncommon to see tiny house communities popping up in cities and towns across the country.\n\nOne of the benefits of tiny houses is that they are often much cheaper than traditional homes. This is because they require less materials to build and maintain, and they are often built on wheels, which means that they can be easily moved to different locations. Th

In [None]:
def _generate_variants(title, essay):
    """Produce the three generated variants for one essay as a single record."""
    # Call order is llama poor-quality, then gemma2, then llama off-topic.
    poor_llama = rewrite_to_poor_quality_llama(title, essay)
    poor_gemma2 = rewrite_essay_to_poor_quality_gemma2(ollama_client, title, essay)
    strayed = rewrite_to_off_topic_llama(title, essay)
    return {
        'title': title,
        'rewritten_off_topic': strayed,
        'rewritten_llama': poor_llama,
        'rewritten_gemma2': poor_gemma2,
    }


# One record per source essay, accumulated in row order.
results = []
for title, essay in zip(df['title'], df['essay']):
    results.append(_generate_variants(title, essay))

# Build the frame in one go from the collected records.
rewritten_df = pd.DataFrame(results)

In [41]:
# Preview the first rows of the generated variants.
rewritten_df.head()

Unnamed: 0,title,rewritten_off_topic,rewritten_llama,rewritten_gemma2
0,‘Prosperity is all that matters; everything el...,Here is the rewritten essay:\n\nProsperity is ...,Here is the essay:\n\nProsperity is all that m...,Prosperity is all that matters; everything els...
1,Do you agree that freedom of speech should nev...,The freedom of speech is a fundamental right t...,Here is the essay:\n\nThe freedom of speech is...,"Freedom of speech is really important, like, s..."
2,"‘When children grow up exposed to violence, th...","When children grow up exposed to violence, the...","When children grow up exposed to violence, the...","When children grow up around violence, the wor..."
3,A government that fails to win the trust of th...,The relationship between a government and its ...,A goverment that fails to win the trust of the...,A government that can't win the trust of its p...
4,To what extent is the use of violence in today...,The use of violence in today's world is a topi...,Here is the rewritten essay:\n\nThe use of vio...,"Violence is a really big problem today, and ev..."


In [42]:
# Save the raw generations to disk without writing the index column.
rewritten_df.to_csv('rewritten_essays.csv', index=False)

In [45]:
# Reload the saved generations from disk, replacing the in-memory frame.
rewritten_df = pd.read_csv('rewritten_essays.csv')

In [46]:
# Lead-in sentences the models sometimes put before the essay; removed in this order.
_PREFIX_PATTERNS = (r'^Here is the rewritten essay:\s*', r'^Here is the essay:\s*')
# Same score marker twice: with a capture group for extraction, without for removal.
_SCORE_CAPTURE = r'###\s*Score:\s*(\d+)/50'
_SCORE_MARKER = r'###\s*Score:\s*\d+/50'


def _split_essay_and_score(raw: pd.Series) -> tuple:
    """Separate generated essay text from its self-assigned score.

    Args:
        raw: String Series of model responses.

    Returns:
        ``(text, score)``: ``text`` is ``raw`` with the known lead-in sentence
        and every ``### Score: NN/50`` marker removed and outer whitespace
        stripped; ``score`` is the first ``NN`` found as float, NaN when the
        marker is absent.
    """
    text = raw
    for pattern in _PREFIX_PATTERNS:
        text = text.str.replace(pattern, '', regex=True)
    score = text.str.extract(_SCORE_CAPTURE, expand=False).astype(float)
    text = text.str.replace(_SCORE_MARKER, '', regex=True).str.strip()
    return text, score


# Same cleaning for every generated variant. The order fixes the order in which
# the new score_* columns are appended: gemma2, llama, off_topic.
for variant in ('gemma2', 'llama', 'off_topic'):
    text_col = f'rewritten_{variant}'
    score_col = f'score_{variant}'
    cleaned_text, score = _split_essay_and_score(rewritten_df[text_col])
    if score_col in rewritten_df.columns:
        # The cell has already run on this frame: the score marker is gone
        # from the text, so keep the earlier score wherever none is found now
        # instead of overwriting it with NaN.
        score = score.fillna(rewritten_df[score_col])
    rewritten_df[text_col] = cleaned_text
    rewritten_df[score_col] = score

# Display the updated dataframe
rewritten_df.head()


Unnamed: 0,title,rewritten_off_topic,rewritten_llama,rewritten_gemma2,score_gemma2,score_llama,score_off_topic
0,‘Prosperity is all that matters; everything el...,Prosperity is all that matters; everything els...,Prosperity is all that matters; everything els...,Prosperity is all that matters; everything els...,15.0,32.0,18.0
1,Do you agree that freedom of speech should nev...,The freedom of speech is a fundamental right t...,The freedom of speech is a fundamental right t...,"Freedom of speech is really important, like, s...",25.0,35.0,15.0
2,"‘When children grow up exposed to violence, th...","When children grow up exposed to violence, the...","When children grow up exposed to violence, the...","When children grow up around violence, the wor...",23.0,30.0,15.0
3,A government that fails to win the trust of th...,The relationship between a government and its ...,A goverment that fails to win the trust of the...,A government that can't win the trust of its p...,22.0,30.0,18.0
4,To what extent is the use of violence in today...,The use of violence in today's world is a topi...,The use of violence in todays world is a contr...,"Violence is a really big problem today, and ev...",20.0,32.0,10.0


In [47]:
# Attach the generated variants to the source rows by essay title.
# validate='one_to_one' makes pandas raise MergeError when a title occurs more
# than once on either side. Without it, repeated titles are cross-joined,
# which silently duplicates rows and pairs rewrites with the wrong originals.
merged_df = pd.merge(df, rewritten_df, on='title', validate='one_to_one')

# Display the merged dataframe
merged_df.head()

Unnamed: 0,author,school,year_written,grade,essay,title,rewritten_off_topic,rewritten_llama,rewritten_gemma2,score_gemma2,score_llama,score_off_topic
0,Skyler,Hwa Chong Institution,2021,A,Hong Kong no doubt boasts some of the wealthie...,‘Prosperity is all that matters; everything el...,Prosperity is all that matters; everything els...,Prosperity is all that matters; everything els...,Prosperity is all that matters; everything els...,15.0,32.0,18.0
1,Skyler,Hwa Chong Institution,2021,A,"Freedom of speech is, to the Western world, a ...",Do you agree that freedom of speech should nev...,The freedom of speech is a fundamental right t...,The freedom of speech is a fundamental right t...,"Freedom of speech is really important, like, s...",25.0,35.0,15.0
2,Skyler,Hwa Chong Institution,2021,A,"Since time immemorial, humanity has been plagu...","‘When children grow up exposed to violence, th...","When children grow up exposed to violence, the...","When children grow up exposed to violence, the...","When children grow up around violence, the wor...",23.0,30.0,15.0
3,Junwei,Hwa Chong Institution,2020,36,The 2020 U.S. Presidential Election seemed to ...,A government that fails to win the trust of th...,The relationship between a government and its ...,A goverment that fails to win the trust of the...,A government that can't win the trust of its p...,22.0,30.0,18.0
4,Junwei,Hwa Chong Institution,2020,35/50,"In the aftermath of the Second World War, the ...",To what extent is the use of violence in today...,The use of violence in today's world is a topi...,The use of violence in todays world is a contr...,"Violence is a really big problem today, and ev...",20.0,32.0,10.0


In [48]:
# Save the merged frame without the index column.
# NOTE(review): this is the same path that rewritten_df was written to and read
# back from in earlier cells, so the generation-only file is replaced by the
# merged one -- confirm that overwriting it is intended.
merged_df.to_csv('rewritten_essays.csv', index=False)

In [49]:
# Reshape merged_df from wide to long: one row per (essay, variant).
# Each of the four slices below keeps the identifying columns, renames its own
# text/score pair to the shared names 'text' and 'score', and is tagged with a
# 'type' label.

original = (
    merged_df[['author', 'school', 'year_written', 'title', 'grade', 'essay']]
    .rename(columns={'essay': 'text', 'grade': 'score'})
    .assign(type='original')
)

llama = (
    merged_df[['author', 'school', 'year_written', 'title', 'rewritten_llama', 'score_llama']]
    .rename(columns={'rewritten_llama': 'text', 'score_llama': 'score'})
    .assign(type='llama')
)

gemma2 = (
    merged_df[['author', 'school', 'year_written', 'title', 'rewritten_gemma2', 'score_gemma2']]
    .rename(columns={'rewritten_gemma2': 'text', 'score_gemma2': 'score'})
    .assign(type='gemma2')
)

off_topic = (
    merged_df[['author', 'school', 'year_written', 'title', 'rewritten_off_topic', 'score_off_topic']]
    .rename(columns={'rewritten_off_topic': 'text', 'score_off_topic': 'score'})
    .assign(type='off_topic')
)

# Stack the four slices under a fresh 0..n-1 index, then fix the column order.
stacked = pd.concat([original, llama, gemma2, off_topic], ignore_index=True)
long_df = stacked[['author', 'school', 'year_written', 'title', 'type', 'text', 'score']]


In [50]:
# Preview the first rows of the long-format frame.
long_df.head()

Unnamed: 0,author,school,year_written,title,type,text,score
0,Skyler,Hwa Chong Institution,2021,‘Prosperity is all that matters; everything el...,original,Hong Kong no doubt boasts some of the wealthie...,A
1,Skyler,Hwa Chong Institution,2021,Do you agree that freedom of speech should nev...,original,"Freedom of speech is, to the Western world, a ...",A
2,Skyler,Hwa Chong Institution,2021,"‘When children grow up exposed to violence, th...",original,"Since time immemorial, humanity has been plagu...",A
3,Junwei,Hwa Chong Institution,2020,A government that fails to win the trust of th...,original,The 2020 U.S. Presidential Election seemed to ...,36
4,Junwei,Hwa Chong Institution,2020,To what extent is the use of violence in today...,original,"In the aftermath of the Second World War, the ...",35/50


In [51]:
# Save the long-format frame (original plus generated variants) without the index column.
long_df.to_csv('essays_with_augmentation.csv', index=False)