In [1]:
import os, time
from anthropic import AnthropicVertex
# Vertex AI project and region used for every Claude request in this notebook.
# Both can be overridden through the environment so the notebook is not tied to
# a single hard-coded GCP project; the literals below are the defaults.
PROJECT_ID = os.environ.get('ANTHROPIC_VERTEX_PROJECT_ID', 'auspicious-lead-386121')
REGION = os.environ.get('CLOUD_ML_REGION', 'us-central1')

# Shared client used by tag(), summarize() and predict().
claude_client = AnthropicVertex(project_id=PROJECT_ID, region=REGION)


In [2]:
import pandas as pd
# Read the raw posts and keep only rows that have both a translated text
# and a poster description (tagging needs both fields).
raw_data = (
    pd.read_csv('~/Downloads/russia_social_media.csv')
    .dropna(subset=['translation', 'description'])
)


# Tag

In [3]:
# Role preamble placed at the top of the prompts built by tag() and summarize().
common_prompt = "You are an AI assistant analyzing Russian and Ukrainian social media posts."

# Instructions for the tagging request. tag() parses the reply for <tagged_post>
# blobs and reads <id>, <tags>, <locations> and <emotion> out of each one, so the
# element names used here must stay in sync with that parser.
# NOTE(review): the prompt text contains typos ("there will 4 sections",
# "guideliens", "searate", "rmotion"). They are left untouched here because the
# string is sent verbatim to the model; fix them deliberately if desired.
tag_prompt = """From each post generate a new <tagged_post> blob. Inside <tagged_post> there will 4 sections:<id>, <tags>, <locations> and <emotion>

Here are some guideliens for <tags>:
1. Tags for specific events like "Crimean annexation," "Donbas conflict," "Minsk agreements," "NATO expansion," etc.
2. Names of key political figures such as "Putin," "Zelensky," "Medvedev," "Poroshenko," etc.
3. Tags related to governmental bodies like "Kremlin," "Verkhovna Rada," "Duma," "Security Service of Ukraine (SBU)," "FSB," etc.
4. Tags indicating potential propaganda or disinformation efforts such as "Russian propaganda," "Ukrainian disinformation," "Information warfare," "Fake news," etc.
5. Tags for military operations like "Operation Barbarossa," "Operation Northern Wind," "ATO (Anti-Terrorist Operation)," "Russian military intervention," etc.
6. Tags related to civil society movements and activism like "Protests," "Civil rights," "Activism," "Civil society organizations," etc.
7. Tags for ethnic and cultural identity like "Russian identity," "Ukrainian nationalism," "Crimean Tatars," "Donbas Russians," etc.
8. Tags for economic factors affecting the region such as "Sanctions," "Economic downturn," "Energy dependence," "Trade agreements," etc.

If the post is not interesting in terms of national security, please tag it as "Others".
Do not add more than 3 tags to a post. Make them a comma-separated list.

Here is the guideline for <emotion>:
1. Try to judge the emotion of the post and put it inside a searate <emotion> tag. 
2. Emotion should be in the range of -2, -1, 0, 1 and 2. -2 being the most negative rmotion.

Here is the guideline for <locations>:
Put the locations inside a <locations> tag as a comma-separated list based on the post.

"""


import re
from retry import retry

def _tag_field(blob, name):
    """Return the stripped text of the first <name>...</name> element in blob, or None if absent."""
    # DOTALL lets an element's content span several lines.
    found = re.search(rf'<{name}>(.*?)</{name}>', blob, re.DOTALL)
    return found.group(1).strip() if found else None


def _tag_csv(value):
    """Split a comma-separated string into stripped, non-empty items (None or '' gives [])."""
    if not value:
        return []
    return [item.strip() for item in value.split(',') if item.strip()]


def _parse_tagged_posts(text):
    """Parse a model reply into {post_id: {'locations': [...], 'sentiment': ..., 'tags': [...]}}."""
    blobs = re.findall(r'<tagged_post>(.*?)</tagged_post>', text, re.DOTALL)
    if not blobs:
        # Fallback: accept replies that wrap each result in <post> instead.
        blobs = re.findall(r'<post>(.*?)</post>', text, re.DOTALL)

    parsed = {}
    for blob in blobs:
        raw_id = _tag_field(blob, 'id')
        if raw_id is None:
            continue
        try:
            post_id = int(raw_id)
        except ValueError:
            # A malformed id only loses this one blob; raising here would make
            # the retry decorator re-send the whole (paid) request.
            continue

        emotion = _tag_field(blob, 'emotion')
        parsed[post_id] = {
            'locations': _tag_csv(_tag_field(blob, 'locations')),
            # NOTE(review): a parsed emotion is kept as the stripped string while
            # the missing-element default is the int 0 — confirm what consumers expect.
            'sentiment': emotion if emotion is not None else 0,
            'tags': _tag_csv(_tag_field(blob, 'tags')),
        }
    return parsed


@retry(delay=50, tries=5)
def tag(posts, post_count):
    """Ask Claude to tag a batch of posts and parse the reply.

    Args:
        posts: String holding the batch of posts to tag; it is interpolated
            verbatim into the prompt.
        post_count: Number of posts in the batch. Only used to detect short
            replies: when fewer results are parsed, the raw reply is printed.

    Returns:
        dict mapping the integer found in each blob's <id> to a dict with keys
        'locations' (list of str), 'sentiment' and 'tags' (list of str).
        Blobs without a parseable <id> are skipped.

    Raises:
        Any exception raised by the API call is printed and re-raised, which
        lets the retry decorator invoke the function again.
    """
    # NOTE(review): "Output only <tags>" reads narrower than tag_prompt, which asks
    # for full <tagged_post> blobs; the wording is kept as-is — confirm it is intended.
    prompt = f"""{common_prompt}
    
{tag_prompt}

Now, I am going to give you a list of posts and description of the posters. Please create tags for them.
Output only <tags> not the post.

"{posts}"

    """

    try:
        message = claude_client.messages.create(
            model="claude-3-sonnet@20240229",
            # model="claude-3-haiku@20240307",
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )
    except Exception as e:
        print(e)
        raise

    response_text = message.content[0].text
    results = _parse_tagged_posts(response_text)

    # Dump the raw reply when some posts are missing, to help diagnose the gap.
    if len(results) < post_count:
        print(response_text)

    return results 

# Summarize

In [4]:
# Requirements block interpolated into the prompt built by summarize().
# summarize() parses the reply for <event> elements and reads <title>,
# <description>, <date>, <severity> and <locations> out of each one, so the
# element names used here must stay in sync with that parser.
# NOTE(review): the list numbering repeats "1."; left untouched because the
# string is sent verbatim to the model.
summarization_prompt = """Here are the requirements:
1. Ignore any post unrelated to the Russia-Ukraine conflict.
1. Collect interesting events from similar posts. 
2. Put each event inside <event> tags.
3. Each event will have 5 sections: <title>, <description>, <date>, <severity> and an optional <locations>.
4. Severity should be either "normal" or "important".
5. <date> should be in YYYY-MM-DD format and derived from the input posts.
"""

import re

@retry(delay=10, tries=4, backoff=2)
def summarize(posts, start_date):
    """Ask Claude to condense a batch of posts into events and parse the reply.

    Args:
        posts: String holding the posts to summarize; it is interpolated
            verbatim into the prompt.
        start_date: Value stored as an event's 'date' when the reply gives no
            (or an empty) <date> for that event.

    Returns:
        list of dicts, one per <event> element in the reply, with keys 'title',
        'description', 'severity' (lower-cased), 'date' and 'locations'
        (list of str). Missing text fields become "".
    """
    prompt = f"""{common_prompt}

{summarization_prompt}

Here are the posts:
{posts}

    """

    message = claude_client.messages.create(
        model="claude-3-sonnet@20240229",
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )

    reply = message.content[0].text
    print(reply)

    def field(event_body, tag_name):
        """Stripped text of the first <tag_name> element in event_body, or "" if absent."""
        # DOTALL lets an element's content span several lines.
        found = re.search(rf'<{tag_name}>(.*?)</{tag_name}>', event_body, re.DOTALL)
        return found.group(1).strip() if found else ""

    result = []
    for event_body in re.findall(r'<event>(.*?)</event>', reply, re.DOTALL):
        # Drop blank entries so an empty <locations></locations> yields [] rather than [''].
        locations = [
            place.strip()
            for place in field(event_body, 'locations').split(",")
            if place.strip()
        ]
        result.append({
            'title': field(event_body, 'title'),
            'description': field(event_body, 'description'),
            'severity': field(event_body, 'severity').lower(),
            # A missing or empty <date> falls back to the caller-supplied date.
            'date': field(event_body, 'date') or start_date,
            'locations': locations,
        })

    return result

# Predict

In [5]:
# Instruction block placed at the top of the prompt built by predict().
# predict() parses the reply for <event> elements and keeps <title>,
# <description> and <severity> from each one.
prediction_prompt = """You are a national security assistant. 
I am going to provide some events from the Russia-Ukraine conflict.
Based on them please predict some future events and wrap them inside <event> tags.
Each future event should have a <title>, <description> and <severity>. Severity should be either "normal" or "important".
"""

import re

@retry(delay=10, tries=4, backoff=2)
def predict(events):
    """Ask Claude to predict future events from past ones and parse the reply.

    Args:
        events: String describing the known events; it is interpolated
            verbatim into the prompt.

    Returns:
        list of dicts, one per <event> element in the reply, with keys 'title',
        'description' and 'severity' (lower-cased). Missing fields become "".
    """
    prompt = f"""{prediction_prompt}

Here are the events:
{events}

Now, predict some future events.
    """
    print(prompt)

    message = claude_client.messages.create(
        model="claude-3-sonnet@20240229",
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )

    reply = message.content[0].text
    print(reply)

    def field(event_body, tag_name):
        """Stripped text of the first <tag_name> element in event_body, or "" if absent."""
        # DOTALL lets an element's content span several lines.
        found = re.search(rf'<{tag_name}>(.*?)</{tag_name}>', event_body, re.DOTALL)
        return found.group(1).strip() if found else ""

    # Only the three fields requested by prediction_prompt are extracted.
    return [
        {
            'title': field(event_body, 'title'),
            'description': field(event_body, 'description'),
            'severity': field(event_body, 'severity').lower(),
        }
        for event_body in re.findall(r'<event>(.*?)</event>', reply, re.DOTALL)
    ]

In [None]:
from datetime import date, timedelta
import json

# Daily pipeline: for each day, tag a sample of posts, summarize them into
# events, predict future events, and write three JSON files under ./data/<day>/.

SAMPLE_SIZE = 500       # maximum number of posts tagged per day
TAG_BATCH_SIZE = 50     # posts sent to tag() per request
TAG_COLUMNS = ['locations', 'sentiment', 'tags']  # keys of each tag() result


def _tagging_post_xml(post_id, row):
    """Render one raw post as the <post> element sent to tag()."""
    return f"""
<post>
<id>{post_id}</id>
<text>
{row['translation']}
</text>
<poster>
{row['description']}
</poster>
</post>
"""


def _summary_post_xml(row):
    """Render one tagged post as the <post> element sent to summarize()."""
    return f"""
<post>
<text>
{row['text']}
</text>
<locations>{', '.join(row['locations'])}</locations>
<date>{row['date']}</date>
</post>
"""


def _event_xml(event):
    """Render one summarized event as the <event> element sent to predict()."""
    return f"""
<event>
<title>
{event['title']}
</title>
<description>
{event['description']}
</description>
<date>{event['date']}</date>
</event>
"""


def _write_json(frame, path):
    """Write a DataFrame to path as a JSON array of records."""
    with open(path, 'w') as file:
        file.write(frame.to_json(orient='records'))


start_date = date(2024, 4, 1)
final_date = date(2024, 4, 8)
delta = timedelta(days=1)
end_date = start_date + delta

while start_date <= final_date:
    print(start_date)
    folder_path = './data/' + str(start_date)
    os.makedirs(folder_path, exist_ok=True)

    # Posts whose 'time' falls in [start_date, end_date).
    # NOTE(review): this is a string comparison, so it assumes 'time' holds
    # ISO-8601 text — confirm against the CSV.
    data = raw_data.loc[(raw_data['time'] >= str(start_date))
                        & (raw_data['time'] < str(end_date))]
    if data.empty:
        # Nothing to tag or summarize for this day; move on to the next one.
        print(f'no posts for {start_date}, skipping')
        start_date = end_date
        end_date += delta
        continue

    # Cap the sample at the number of available rows; sample() raises when
    # asked for more rows than exist.
    # NOTE(review): no random_state is set, so the sample differs between runs.
    data = data.sample(min(SAMPLE_SIZE, len(data)))

    # ---- Tag -------------------------------------------------------------
    batch_size = TAG_BATCH_SIZE
    results = []
    for i in range(0, len(data), batch_size):
        batch_df = data.iloc[i:i+batch_size]

        # Ids are positions within the batch; tag() returns a dict keyed by them.
        posts = ''.join(
            _tagging_post_xml(j, row)
            for j, (_, row) in enumerate(batch_df.iterrows())
        )
        tags = tag(posts, len(batch_df))
        print(f'batch_start: {i}, batch_size: {len(tags)}')

        # Keep one entry per post; posts missing from the reply get {} (NaN columns).
        results.extend(tags.get(j, {}) for j in range(len(batch_df)))
        time.sleep(20)

    # reindex guarantees the tag columns exist even if every result was empty.
    tag_frame = pd.DataFrame(results).reindex(columns=TAG_COLUMNS).set_index(data.index)
    data = pd.concat([data, tag_frame], axis=1)
    data = data.rename(columns={'translation': 'text', 'time': 'date'})
    data = data[["id",
        "text",
        "locations",
        "tags",
        "sentiment",
        "date"]]

    _write_json(data, os.path.join(folder_path, 'tagged_tweets.json'))

    # ---- Summarize ---------------------------------------------------------
    # Only fully tagged posts are summarized, all in a single batch.
    filtered_df = data.dropna()
    # max(..., 1) keeps range() valid (step must be non-zero) when no rows survive.
    batch_size = max(len(filtered_df), 1)

    themes = []
    for i in range(0, len(filtered_df), batch_size):
        batch_df = filtered_df.iloc[i:i+batch_size]
        posts = ''.join(_summary_post_xml(row) for _, row in batch_df.iterrows())

        summary_result = summarize(posts, str(start_date))
        print(f'batch_start: {i}, batch_size: {len(summary_result)}')

        themes.extend(summary_result)
        time.sleep(10)

    _write_json(pd.DataFrame(themes), os.path.join(folder_path, 'events.json'))

    # ---- Predict -----------------------------------------------------------
    events = ''.join(_event_xml(event) for event in themes)
    future_events = predict(events)
    print(future_events)

    _write_json(pd.DataFrame(future_events), os.path.join(folder_path, 'future_events.json'))

    start_date = end_date
    end_date += delta

2024-04-01
batch_start: 0, batch_size: 50
batch_start: 50, batch_size: 50
batch_start: 100, batch_size: 50
batch_start: 150, batch_size: 50
batch_start: 200, batch_size: 50
batch_start: 250, batch_size: 50
batch_start: 300, batch_size: 50
batch_start: 350, batch_size: 50
Error code: 429 - {'error': {'code': 429, 'message': 'Quota exceeded for aiplatform.googleapis.com/online_prediction_tokens_per_minute_per_base_model with base model: anthropic-claude-3-sonnet. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai.', 'status': 'RESOURCE_EXHAUSTED'}}
batch_start: 400, batch_size: 50
Error code: 429 - {'error': {'code': 429, 'message': 'Quota exceeded for aiplatform.googleapis.com/online_prediction_tokens_per_minute_per_base_model with base model: anthropic-claude-3-sonnet. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai.', 'status': 'RESOURCE_EXHAUSTED'}}
batch_start: 450, bat

Based on the provided events, here are some possible future events related to the Russia-Ukraine conflict:

<event>
<title>Escalation of Attacks on Russian Cities</title>
<description>As the conflict intensifies, Ukraine may launch more attacks on Russian cities near the border, such as Belgorod, using long-range missiles or drones, leading to increased civilian casualties and infrastructure damage on Russian territory.</description>
<severity>important</severity>
</event>

<event>
<title>Capture of Key Ukrainian Cities</title>
<description>With Russian forces making advances in the Donetsk region, they may eventually capture strategically important Ukrainian cities like Bakhmut or Kramatorsk, dealing a significant blow to Ukraine's defensive efforts.</description>
<severity>important</severity>
</event>

<event>
<title>Increased Terrorist Attacks in Occupied Territories</title>
<description>Following the terrorist attack in Starobelsk, there could be a rise in similar attacks by Ukrai