In [1]:
from google import genai
import numpy as np 
import pandas as pd 
from dotenv import load_dotenv
import os
import time

### Loading the Data

In [2]:
some_data = pd.read_json("Dataset.json", lines=True)

In [3]:
some_data.iloc[:,:2].sample(10)

Unnamed: 0,is_sarcastic,headline
14385,1,professor pressured to sleep with student for ...
17796,1,mccain clinches religious vote with stirring h...
16982,1,gop leaders demand congressman duncan hunter's...
16711,0,devastating floods leave 23 dead in west virginia
8911,0,patty jenkins is already thinking about a 'won...
8159,1,many animals harmed in catering of film
19602,0,buzz aldrin blasts off with the air force thun...
4984,0,"a u.s. cyclist made sure she won gold, then co..."
3876,0,parents of kidnapped girls make desperate plea
2231,1,justice roberts stops in middle of oath of off...


In [4]:
some_data.shape

(28619, 3)

### Is it political? (with Gemini)

In [20]:
load_dotenv()
SECRET_KEY = os.getenv('SECRET_KEY_2')
client = genai.Client(api_key=SECRET_KEY)

In [6]:
for i in some_data[some_data["is_sarcastic"] == 1].headline.sample(10):
    print (i)

trump pours himself glass of chocolate syrup on rocks to unwind after stressful day
scientists isolate gene simmons
jupiter's liberals worried about their ammonia footprint
ecuadorian embassy runs ad seeking 'no drama' tenant for newly vacant room
biden forges president's signature on executive order to make december dokken history month
serious man pleased with how jowls are coming in
new custard could cause worldwide flandemic
elmore leonard, modern prose master, noted for his terse prose style and for writing about things perfectly and succinctly with a remarkable economy of words, unfortunately and sadly expired this gloomy tuesday at the age of 87 years old
$85,000 in fertility treatments result in miracle
poll finds only 83% of new yorkers visit statue of liberty every day


In [7]:
from tqdm import tqdm
import time

prompt_bare = """Are the following titles of a political genre? 
Each line is a different sentence

Answer only 0 if it's not, and answer only 1 otherwise
Give the answer in a list, and only reply in a list

Example answer for 20 texts:
[0,0,1,1,0,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1]

Texts : 
{}
"""
client = genai.Client(api_key=SECRET_KEY)

In [8]:
all_titles_full = some_data.headline.values.tolist()

In [18]:
len(all_titles_full)

639

In [29]:
BATCH_SIZE = 10

# all_titles = all_titles_full[640:]
all_titles = all_titles_full

total_batches = (len(all_titles) + BATCH_SIZE - 1) // BATCH_SIZE  

with tqdm(total=total_batches) as pbar:
    results_after = []
    for i in range(0, len(all_titles), BATCH_SIZE):
        batch = all_titles[i:i + BATCH_SIZE]
        combined_text = "\n".join(batch)  # Join 20 titles into a single string
        
        prompt = prompt_bare.format(combined_text)
        
        response = client.models.generate_content(model="gemini-2.0-flash",
                                                  contents=[prompt])
        
        results_after.append(response.text.strip())  
        time.sleep(3)  # Adjust based on API rate limits
        
        pbar.update(1) 

  0%|          | 0/64 [00:00<?, ?it/s]

100%|██████████| 64/64 [04:38<00:00,  4.35s/it]


In [31]:
true_res = []
for idx,i in enumerate(results_after):
    i_e = eval(i)

    if len(i_e) != BATCH_SIZE:
        print (idx)
        
    true_res.extend(i_e)

print (len(true_res))

63
639


### Revise Errors

In [45]:
almost_done = pd.read_csv('all_but_some.csv')
revisions = almost_done[~almost_done['is_political'].isin(['1','0'])].headline.values.tolist()
almost_done[~almost_done['is_political'].isin(['1','0'])]

Unnamed: 0,is_sarcastic,headline,article_link,is_political
4280,1,grandpa looking absolutely precious in new bas...,https://local.theonion.com/grandpa-looking-abs...,error
4281,0,arrested but innocent? the internet still thin...,https://www.huffingtonpost.com/entry/helping-e...,error
4282,1,historical archives: to be sold - carved woode...,https://www.theonion.com/historical-archives-t...,error
4283,1,report: one in five women training to be yoga ...,https://www.theonion.com/report-one-in-five-wo...,error
4284,1,thieves make off with museum's most valuable d...,https://local.theonion.com/thieves-make-off-wi...,error
...,...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...,empty
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...,empty
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...,empty
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...,empty


In [None]:
BATCH_SIZE = 10

all_titles = revisions

total_batches = (len(all_titles) + BATCH_SIZE - 1) // BATCH_SIZE  

with tqdm(total=total_batches) as pbar:
    results_after = []
    for i in range(0, len(all_titles), BATCH_SIZE):
        batch = all_titles[i:i + BATCH_SIZE]
        combined_text = "\n".join(batch)  # Join 20 titles into a single string
        
        prompt = prompt_bare.format(combined_text)
        
        response = client.models.generate_content(model="gemini-2.0-flash",
                                                  contents=[prompt])
        
        results_after.append(response.text.strip())  
        time.sleep(3)  # Adjust based on API rate limits
        
        pbar.update(1) 

In [30]:
true_res = []
for idx,i in enumerate(results_after):
    i_e = eval(i)

    true_res.extend(i_e)

print (len(true_res))

639


In [46]:
import pandas as pd

# Values to replace and replacement list
error_count = 80
empty_count = 559
replacement_list = true_res

# Splitting replacement list
error_replacements = replacement_list[:error_count]
empty_replacements = replacement_list[error_count:error_count + empty_count]

In [47]:
almost_done.is_political.unique()

array(['0', '1', 'error', 'empty'], dtype=object)

In [48]:
# Replace values
almost_done.loc[almost_done['is_political'] == "error", 'is_political'] = error_replacements
almost_done.loc[almost_done['is_political'] == "empty", 'is_political'] = empty_replacements

print(almost_done)

       is_sarcastic                                           headline  \
0                 1  thirtysomething scientists unveil doomsday clo...   
1                 0  dem rep. totally nails why congress is falling...   
2                 0  eat your veggies: 9 deliciously different recipes   
3                 1  inclement weather prevents liar from getting t...   
4                 1  mother comes pretty close to using word 'strea...   
...             ...                                                ...   
28614             1       jews to celebrate rosh hashasha or something   
28615             1  internal affairs investigator disappointed con...   
28616             0  the most beautiful acceptance speech this week...   
28617             1  mars probe destroyed by orbiting spielberg-gat...   
28618             1                 dad clarifies this not a food stop   

                                            article_link is_political  
0      https://www.theonion.com/thirtys

In [50]:
almost_done.is_political = almost_done.is_political.astype('int')

In [51]:
done = almost_done.copy()
done.to_csv('all_done.csv')