In [1]:
!pip install --upgrade openai



In [2]:
import os
import json
import pandas as pd
import numpy as np

from datetime import datetime
from openai import OpenAI

In [3]:
# Mount Google Drive under /content/drive so the project files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd drive/MyDrive/voter-behavior-prediction-LLM/

/content/drive/MyDrive/voter-behavior-prediction-LLM


In [5]:
# Constants shared by the cells below
SAMPLE_SIZE = 90  # number of respondents drawn from each candidate's supporters (passed as `n` to DataFrame.sample)
RANDOM_STATE = 42  # seed passed as `random_state` to DataFrame.sample so the draw is repeatable
ENV_FILE_PATH = '.env'  # KEY=VALUE file handed to load_env_variables; resolved against the current working directory

In [6]:
def load_env_variables(file_path):
    """Load KEY=VALUE pairs from a .env file into ``os.environ``.

    Blank lines and lines starting with ``#`` are ignored. Lines that contain
    no ``=`` (or have an empty key) are skipped instead of aborting the load.
    Whitespace around the key and the value is removed, and one pair of
    matching single or double quotes around the value is stripped, so
    ``KEY = "secret"`` stores ``secret``. Only the first ``=`` separates key
    from value, so values may themselves contain ``=``.

    Args:
        file_path: Path of the .env file to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            key, separator, value = line.partition('=')
            key = key.strip()
            if not separator or not key:
                # Not a KEY=VALUE line; ignore it rather than crash on unpacking.
                continue

            value = value.strip()
            # Drop one layer of matching quotes so a quoted secret is not stored with its quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            os.environ[key] = value

# Export the entries of the .env file into os.environ so later cells can read them with os.getenv.
load_env_variables(ENV_FILE_PATH)

In [7]:
# Function to load city codes from a file into a dictionary
def load_city_codes(file_path):
    """Read ``code:name`` lines from a text file into a ``{int code: name}`` dict.

    Blank lines are skipped. Only the first ``:`` separates the code from the
    name, so names may contain colons. Whitespace around the code and the name
    is removed, and a UTF-8 byte-order mark at the start of the file is ignored.

    Args:
        file_path: Path of the city-code file.

    Returns:
        dict mapping each integer city code to its city name.

    Raises:
        ValueError: If a non-blank line has no ``:`` or its code is not an integer.
        OSError: If the file cannot be opened.
    """
    city_codes = {}
    # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM that would break int().
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                continue

            code, separator, name = line.partition(':')
            if not separator:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected 'code:name', got {line!r}"
                )
            city_codes[int(code)] = name.strip()
    return city_codes

# Build the {numeric city code: city name} lookup from city_codes.txt; it is applied to the
# survey's 'Şehir' column during preprocessing.
file_path = 'city_codes.txt'
city_mapping = load_city_codes(file_path)

In [8]:
# Read the survey responses from survey.xlsx into a DataFrame; every later cell derives its
# columns and samples from df_survey_data.
df_survey_data = pd.read_excel('survey.xlsx')
print("Survey data loaded successfully.")

Survey data loaded successfully.


In [9]:
# Preprocess the data: derive English, prompt-ready columns (suffix `_EN`) from the raw
# survey columns. Each step only adds a column, so the cell can be re-run safely.

# 1) Age: missing answers are replaced by 0 so the column can be cast to int.
#    A value of 0 in 'Age_EN' therefore means "age not reported", not a real age.
df_survey_data['Age_EN'] = df_survey_data['Age_survey'].fillna(0).astype(int)

# 2) Map education levels to desired strings (missing answers become an empty string)
education_mapping = {
    'Üniversite mezunu': 'university graduate',
    'Yüksek lisans/Doktora': "master's degree/PhD holder",
    'Lise mezunu': "high-school graduate",
    'İlköğretim/Ortaokul mezunu': "secondary school graduate",
    'İlkokul mezunu': "primary school graduate",
    'Diplomasız okur': "literate but nongraduate",
    np.nan: ""
}
# Series.map yields NaN for any category that is not a key of the mapping; blank those too,
# so the literal text "nan" can never be interpolated into a prompt.
df_survey_data['Eğitim_EN'] = df_survey_data['Eğitim'].map(education_mapping).fillna("")

# 3) Map ethnicity to desired strings ('Diğer' and missing answers become an empty string)
ethnicity_mapping = {
    'Türk': 'Turkish',
    'Kürt': 'Kurdish',
    'Türk,Kürt': 'Turkish and Kurdish',
    'Türk,Zaza': 'Turkish and Zaza',
    'Türk,Diğer': 'partly Turkish',
    'Arap': 'Arab',
    'Diğer': "",
    np.nan: ""
}
df_survey_data['Etnisite_EN'] = df_survey_data['Etnisite'].map(ethnicity_mapping).fillna("")

# 4) Map city codes to city names; codes absent from city_mapping become an empty string
df_survey_data['Şehir_EN'] = df_survey_data['Şehir'].map(city_mapping).fillna("")

In [10]:
# Preview the username together with the derived English columns for the first five respondents
df_survey_data.loc[:, ['username', 'Age_EN', 'Eğitim_EN', 'Etnisite_EN', 'Şehir_EN']].head()

Unnamed: 0,username,Age_EN,Eğitim_EN,Etnisite_EN,Şehir_EN
0,otisabi,59,university graduate,Turkish,Muğla
1,drcihanyilmaz,37,master's degree/PhD holder,Turkish,Manisa
2,arda,0,high-school graduate,Turkish,Çanakkale
3,fridakahlo,33,master's degree/PhD holder,Turkish,Ankara
4,dilekozcengiz,62,master's degree/PhD holder,Turkish,Adana


In [11]:
# Report what share of respondents has no value in 'location_x'
total_rows = len(df_survey_data)
nan_count = df_survey_data['location_x'].isnull().sum()
nan_percentage = (nan_count / total_rows) * 100
print("Percentage of NaN values in 'location_x': {:.2f}%".format(nan_percentage))

Percentage of NaN values in 'location_x': 70.46%


In [12]:
# Split respondents by their second-round vote ('CB 2. Tur') and draw the Kılıçdaroğlu sample
supporters_kemal = df_survey_data.loc[df_survey_data['CB 2. Tur'].eq('Kemal Kılıçdaroğlu')]
print(f"Total number of Kemal Kılıçdaroğlu supporters: {supporters_kemal.shape[0]}")
sample_kemal = supporters_kemal.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)

supporters_erdogan = df_survey_data.loc[df_survey_data['CB 2. Tur'].eq('Recep Tayyip Erdoğan')]
print(f"Total number of Recep Tayyip Erdoğan supporters: {supporters_erdogan.shape[0]}")

Total number of Kemal Kılıçdaroğlu supporters: 797
Total number of Recep Tayyip Erdoğan supporters: 91


In [13]:
# Draw the Erdoğan sample with the same size and seed, then stack both groups into one
# frame with a fresh 0..N-1 index.
sample_erdogan = supporters_erdogan.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
combined_samples = pd.concat((sample_kemal, sample_erdogan), ignore_index=True)
print("Total samples we picked: {}".format(combined_samples.shape[0]))

Total samples we picked: 180


In [14]:
# Show every sampled respondent's username next to their reported second-round vote
combined_samples.loc[:, ['username', 'CB 2. Tur']]

Unnamed: 0,username,CB 2. Tur
0,coscosgg,Kemal Kılıçdaroğlu
1,zbayzdemir1,Kemal Kılıçdaroğlu
2,halilaltinay,Kemal Kılıçdaroğlu
3,ucurtmam,Kemal Kılıçdaroğlu
4,suvuck,Kemal Kılıçdaroğlu
...,...,...
175,zekiserttas,Recep Tayyip Erdoğan
176,hamzaatunc,Recep Tayyip Erdoğan
177,paptya727,Recep Tayyip Erdoğan
178,mstfhayri,Recep Tayyip Erdoğan


In [15]:
# Parse tweets_data.json into memory; the analysis loop looks tweets up by username
with open('tweets_data.json', mode='r', encoding='utf-8') as file:
    tweets_data = json.loads(file.read())
print("Tweets data loaded successfully.")

Tweets data loaded successfully.


In [16]:
# Model snapshot used for every request; it is also embedded in the results file name
MODEL_NAME = "gpt-4o-2024-05-13"

# Take the API key from the environment and create the OpenAI client with it
openai_api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=openai_api_key)

In [17]:
def _is_missing(value):
    """Return True when a survey field carries no usable value (None, NaN/NA or a blank string)."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values cannot be reduced to a single flag; treat them as present.
        return False


def _describe_respondent(row):
    """Build the first-person demographic sentences for a respondent, omitting missing fields."""
    traits = []

    age = row['Age_EN']
    # Preprocessing stores 0 for respondents who gave no age, so 0 must not be stated as an age.
    if not _is_missing(age) and age != 0:
        traits.append(f"{age} years old")

    education = row['Eğitim_EN']
    if not _is_missing(education):
        traits.append(f"a {education}")

    ethnicity = row['Etnisite_EN']
    if not _is_missing(ethnicity):
        traits.append(f"of {ethnicity} ethnicity")

    sentences = []
    gender = row['gender_survey']
    has_gender = not _is_missing(gender)
    if traits and has_gender:
        sentences.append(f"I am {', '.join(traits)}, and I identify as {gender}.")
    elif traits:
        sentences.append(f"I am {', '.join(traits)}.")
    elif has_gender:
        sentences.append(f"I identify as {gender}.")

    city = row['Şehir_EN']
    if not _is_missing(city):
        sentences.append(f"I live in {city}.")

    return " ".join(sentences)


def analyze_tweets_with_prompts(username, user_tweets, row, max_tweets=70):
    """Ask the model three differently-informed questions about one respondent's vote.

    The three prompts are, in order:
      1. demographics only,
      2. demographics plus tweets,
      3. tweets only.

    When every demographic field is present the prompts are worded exactly as
    before. Fields that are missing (None/NaN, blank strings, or the age
    placeholder 0) are left out of the text instead of producing fragments
    such as "I am 0 years old" or "a nan".

    Args:
        username: Respondent identifier; only used in the error message.
        user_tweets: Sequence of dicts, each with a "text" entry.
        row: Mapping (e.g. a DataFrame row) providing 'Age_EN', 'Eğitim_EN',
            'Etnisite_EN', 'gender_survey' and 'Şehir_EN'.
        max_tweets: Number of leading tweets included in the prompts (default 70).

    Returns:
        A list with exactly three strings, one per prompt in the order above:
        the model's reply, or "Error" when that request raised an exception.
    """
    tweets_text = "\n".join(tweet["text"] for tweet in user_tweets[:max_tweets])

    profile = _describe_respondent(row)
    profile_prefix = f"{profile} " if profile else ""
    tweets_intro = f"Here are some of my tweets: '''{tweets_text}'''."
    # Shared ending of all three prompts; each prompt supplies its own lead-in.
    question = (
        "the second round of the 2023 Turkish presidential elections, "
        "between Kemal Kılıçdaroğlu and Recep Tayyip Erdoğan, whom did I vote for? "
        "Please respond with either 'Kılıçdaroğlu' or 'Erdoğan'."
    )

    prompt1 = f"{profile_prefix}In {question}"
    prompt2 = f"{profile_prefix}{tweets_intro} Based on this information, in {question}"
    prompt3 = f"{tweets_intro} Based on my tweets, in {question}"

    prompts = [prompt1, prompt2, prompt3]
    results = []

    # NOTE(review): no sampling parameters are passed, so the API defaults apply and repeated
    # runs may return different answers; confirm whether that is intended for this study.
    for prompt in prompts:
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a political analysis assistant. Analyze the tweets and return only the name of one of the two listed candidates."},
                    {"role": "user", "content": prompt},
                ],
            )
            results.append(completion.choices[0].message.content)
        except Exception as e:
            # Best effort: one failed request must not abort the batch, and the "Error"
            # placeholder keeps the result list aligned with the three prompts.
            print(f"An error occurred for user {username} with prompt: {prompt[:30]}... : {str(e)}")
            results.append("Error")

    return results

In [18]:
# Query the model for every sampled respondent that has tweets on file and keep the
# answers both as a raw list (all_results) and keyed by prompt name (response_dict).
all_results = {}
response_dict = {}

for index, row in combined_samples.iterrows():
    username = row['username']
    if username not in tweets_data:
        print(f"No tweets found for user: {username}")
        continue

    user_tweets = tweets_data[username]
    candidate_responses = analyze_tweets_with_prompts(username, user_tweets, row)
    all_results[username] = candidate_responses
    response_dict[username] = {
        f"prompt{number}": candidate_responses[number - 1] for number in (1, 2, 3)
    }
    summary = ", ".join(f"{label}: {answer}" for label, answer in response_dict[username].items())
    print(f"{username} -> {summary}")

coscosgg -> prompt1: Kılıçdaroğlu, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
zbayzdemir1 -> prompt1: Erdoğan, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
halilaltinay -> prompt1: Kılıçdaroğlu, prompt2: Erdoğan, prompt3: Erdoğan
ucurtmam -> prompt1: Erdoğan, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
suvuck -> prompt1: Kılıçdaroğlu, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
kahwedelisi -> prompt1: Erdoğan, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
yesimozn -> prompt1: Kılıçdaroğlu, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
muuguney -> prompt1: Erdoğan, prompt2: Erdoğan, prompt3: Kılıçdaroğlu
cidominko -> prompt1: Kılıçdaroğlu, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
lutfiduran3 -> prompt1: Kılıçdaroğlu, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
sitivingerrard -> prompt1: Kılıçdaroğlu, prompt2: Erdoğan, prompt3: Erdoğan
chatdenoir1 -> prompt1: Erdoğan, prompt2: Kılıçdaroğlu, prompt3: Kılıçdaroğlu
emekorh -> prompt1: Erdoğan, prompt2: Erdoğan, prompt3: Kılıçdaroğlu


In [19]:
# Write the per-user answers to a timestamped JSON file; non-ASCII names are kept readable
timestamp = "{:%Y%m%d%H%M%S}".format(datetime.now())
file_name = "results_" + MODEL_NAME + "_" + timestamp + ".json"
with open(file_name, mode="w", encoding='utf-8') as f:
    f.write(json.dumps(response_dict, indent=4, ensure_ascii=False))
print("Results saved to " + file_name + ".")

Results saved to results_gpt-4o-2024-05-13_20240519155949.json.
