# In [None]:
import pandas as pd
import requests
import re
import time
import urllib.parse
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm


# Strip standalone initials ("F. ") out of a person's name
def clean_name(name):
    """Return ``name`` without isolated capital-letter initials.

    An initial is a single uppercase ASCII letter at a word boundary that is
    followed by a period and at least one whitespace character; the letter,
    the period and the whitespace are all removed.
    """
    initial_pattern = re.compile(r'\b[A-Z]\.\s+')
    return initial_pattern.sub('', name)

# Function to get Wikipedia link for a given name, including country, state, and party
def get_wikipedia_link(row):
    """Return a Wikipedia ``?curid=`` URL for the politician in ``row``, or None.

    The English Wikipedia search API is queried with the name combined, in
    turn, with the row's country, state and party. For each query the first
    search hit is accepted only if its introductory extract contains the word
    "politician"; otherwise the next query is tried.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys 'Name', 'Country',
            'State' and 'Party'.

    Returns:
        ``"https://en.wikipedia.org/?curid=<pageid>"`` for the first accepted
        hit, or None when no query yields one.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    name = row['Name']
    if pd.isna(name) or not str(name).strip():
        return None

    for qualifier in (row['Country'], row['State'], row['Party']):
        # A missing qualifier would otherwise be searched as the text "nan".
        if pd.isna(qualifier) or not str(qualifier).strip():
            continue

        try:
            # Pass the query through `params` so that characters such as '&',
            # '#' or '+' in a name or party are URL-encoded instead of
            # corrupting the query string.
            search_response = requests.get(
                api_url,
                params={
                    'action': 'query',
                    'list': 'search',
                    'srsearch': f"{name} {qualifier} politician",
                    'format': 'json',
                },
                timeout=10,
            )
            search_response.raise_for_status()

            # Get the first search result page ID
            page_id = search_response.json()['query']['search'][0]['pageid']
            page_url = f"https://en.wikipedia.org/?curid={page_id}"

            # Get the plain-text introduction of the page
            content_response = requests.get(
                api_url,
                params={
                    'action': 'query',
                    'prop': 'extracts',
                    'exintro': 1,
                    'explaintext': 1,
                    'format': 'json',
                    'pageids': page_id,
                },
                timeout=10,
            )
            content_response.raise_for_status()
            extract = content_response.json()['query']['pages'][str(page_id)]['extract']

            # Check if the page mentions the person as a politician
            if 'politician' in extract.lower():
                return page_url
        except (IndexError, KeyError):
            # No search hit, or the page has no extract: try the next query.
            continue
        except (requests.exceptions.RequestException, ValueError) as e:
            # A network/HTTP/JSON failure for one query should not abort a
            # DataFrame-wide apply; report it and move on to the next query.
            print(f"Wikipedia lookup failed for {name!r}: {e}")
            continue

    return None

# --- Build the Control and Treatment tables and attach Wikipedia links -------
# Both CSVs go through the same steps: read as ISO-8859-1, trim the names, drop
# rows containing 'Ã', strip initials from the names, look up one Wikipedia
# link per row and write an intermediate "*_with_links.csv". The tables are
# then reduced to a common column layout and written as Control1/Treatment1.
# NOTE(review): the link lookup performs HTTP requests for every row, so this
# cell needs network access and can take a long time.

# Load and process the Control dataset
control_file_path = 'C:/Users/Guill/OneDrive/Documents/Control.csv'  # Replace with the path to your actual Control dataset
control_df = pd.read_csv(control_file_path, encoding='ISO-8859-1')

# Remove leading and trailing spaces from the 'Name' column
control_df['Name'] = control_df['Name'].str.strip()

# Filter out rows that contain 'Ã' in any column
# (every cell of the row is cast to str and searched; presumably 'Ã' marks text
# that was decoded with the wrong encoding - TODO confirm)
control_df = control_df[~control_df.apply(lambda row: row.astype(str).str.contains('Ã').any(), axis=1)]

# Clean names in the Control dataset
control_df['Name'] = control_df['Name'].apply(clean_name)

# Apply the Wikipedia link function to the Control dataset
# (rows without an accepted hit get None in 'Wikipedia_Link')
control_df['Wikipedia_Link'] = control_df.apply(get_wikipedia_link, axis=1)

# Save the updated Control DataFrame to a new CSV file
control_output_file_path = 'C:/Users/Guill/OneDrive/Documents/Control_with_links.csv'
control_df.to_csv(control_output_file_path, index=False)

# Load and process the Treatment dataset
treatment_file_path = 'C:/Users/Guill/OneDrive/Documents/Treatment.csv'  # Replace with the path to your actual Treatment dataset
treatment_df = pd.read_csv(treatment_file_path, encoding='ISO-8859-1')

# Remove leading and trailing spaces from the 'Name' column
treatment_df['Name'] = treatment_df['Name'].str.strip()

# Filter out rows that contain 'Ã' in any column
treatment_df = treatment_df[~treatment_df.apply(lambda row: row.astype(str).str.contains('Ã').any(), axis=1)]

# Clean names in the Treatment dataset
treatment_df['Name'] = treatment_df['Name'].apply(clean_name)

# Apply the Wikipedia link function to the Treatment dataset
treatment_df['Wikipedia_Link'] = treatment_df.apply(get_wikipedia_link, axis=1)

# Save the updated Treatment DataFrame to a new CSV file
treatment_output_file_path = 'C:/Users/Guill/OneDrive/Documents/Treatment_with_links.csv'
treatment_df.to_csv(treatment_output_file_path, index=False)

# Display the updated DataFrames
print(control_df)
print(treatment_df)


# Step 1: Delete all observations without a link in both datasets
control_df = control_df.dropna(subset=['Wikipedia_Link'])
treatment_df = treatment_df.dropna(subset=['Wikipedia_Link'])

# Step 2: Remove whole rows with duplicate Wikipedia links in Control dataset
# (drop_duplicates keeps the first row for each link; Treatment is not de-duplicated here)
control_df = control_df.drop_duplicates(subset=['Wikipedia_Link'])

# Step 3: Ensure columns are the same
# Remove 'Date in' and 'Date Out' from Control, add 'Date of party switch' as NA, and set 'Party' and 'New Party' to the same value
control_df = control_df.drop(columns=['Date in', 'Date Out'])
control_df['Date of party switch'] = pd.NA
control_df['New Party'] = control_df['Party']

# Reorder columns to match Treatment dataset
# (this selection also discards any column not listed)
control_df = control_df[['Country', 'Name', 'State', 'Date of party switch', 'Party', 'New Party', 'Wikipedia_Link']]
treatment_df = treatment_df[['Country', 'Name', 'State', 'Date of party switch', 'Party', 'New Party', 'Wikipedia_Link']]

# Step 4: Remove Control observations that are also in Treatment
# (matching is done on the Wikipedia link, not on the name)
treatment_links = treatment_df['Wikipedia_Link'].tolist()
control_df = control_df[~control_df['Wikipedia_Link'].isin(treatment_links)]
control_df.to_csv('C:/Users/Guill/OneDrive/Documents/Control1.csv', index=False)
treatment_df.to_csv('C:/Users/Guill/OneDrive/Documents/Treatment1.csv', index=False)


# In [None]:
# Define the file path
file_path = 'C:/Users/Guill/Downloads/FullDF.csv'

# Load the CSV file with utf-8 encoding
df = pd.read_csv(file_path, encoding='utf-8')

# Ensure all Wikipedia_Link entries are strings, with missing values as ''.
# fillna must run before astype(str): astype(str) turns NaN into the literal
# text 'nan', after which there is nothing left for fillna to replace.
df['Wikipedia_Link'] = df['Wikipedia_Link'].fillna('').astype(str)

# Define the conversion function with retry mechanism
def get_standard_wikipedia_url(curid_url, max_attempts=5, retry_delay=2):
    """Convert a ``...?curid=<id>`` link into a ``/wiki/<Title>`` link.

    The page title is looked up through the English Wikipedia API. Network,
    HTTP and JSON-decoding failures are retried; a response that contains no
    title for the id (for example a deleted or invalid page) is not retried,
    because repeating the request cannot change the answer.

    Args:
        curid_url: URL containing ``curid=<page id>``.
        max_attempts: Maximum number of requests before giving up.
        retry_delay: Seconds to wait after a failed request.

    Returns:
        ``"https://en.wikipedia.org/wiki/<Title_with_underscores>"`` (not
        percent-encoded), or ``curid_url`` unchanged when the title cannot be
        retrieved.
    """
    # Extract the curid from the URL, dropping any further query parameters
    curid = curid_url.split('curid=')[-1].split('&')[0].strip()

    # Query the Wikipedia API to get the page title with retries
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {'action': 'query', 'format': 'json', 'pageids': curid}
    for attempt in range(max_attempts):
        try:
            response = requests.get(api_url, params=params, timeout=10)
            response.raise_for_status()  # Raise an exception for HTTP errors
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            time.sleep(retry_delay)  # Wait before retrying
            continue

        try:
            # Extract the page title; the API keys the result by page id
            title = data['query']['pages'][curid]['title']
        except (KeyError, TypeError):
            print(f"No page title found for curid {curid!r}; keeping the original link.")
            return curid_url

        # Human-readable form: spaces become underscores, nothing is percent-encoded
        return f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"

    print(f"Failed to retrieve the page title after {max_attempts} attempts.")
    return curid_url  # Return the original URL if all attempts fail

# Apply the conversion function to all Wikipedia_Link entries
# Only links containing 'curid=' are converted (one API lookup each); every
# other value is passed through unchanged.
df['Wikipedia_Link'] = df['Wikipedia_Link'].apply(lambda url: get_standard_wikipedia_url(url) if 'curid=' in url else url)

# Save the updated DataFrame with utf-8 encoding
output_path = 'C:/Users/Guill/Downloads/FullDF_Updated.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

# Display the first few rows of the updated dataframe
print(df.head())

# Display the first 23 rows for a longer visual check
print(df.head(23))
# NOTE: the sentiment scoring is done in a different file. Start here again to clean further and merge.

# Function to preprocess and standardize date formats
def preprocess_date(date_str):
    """Rewrite abbreviated date values so that they can be parsed as full dates.

    Recognised inputs (after trimming surrounding whitespace):
      * ``YYYY``   (e.g. '2005', 2005 or 2005.0) -> ``'YYYY-01-01'``
      * ``YY-MM``  (e.g. '18-07')                -> ``'20YY-MM-01'``
      * ``YY-MMM`` (e.g. '19-Jul')               -> ``'20YY-MMM-01'``

    Args:
        date_str: A cell value; may be a string, a number, None or NaN.

    Returns:
        The rewritten string for a recognised input. Missing or blank values
        are returned unchanged; any other value is returned as its trimmed
        string form.
    """
    if pd.isnull(date_str):
        return date_str  # Leave it as is if it's NaN/None

    # Convert to text before using string methods: pandas reads a column of
    # bare years as numbers, and integral floats would otherwise render as
    # '2005.0'.
    if isinstance(date_str, float) and date_str.is_integer():
        text = str(int(date_str))
    else:
        text = str(date_str).strip()

    if text == '':
        return date_str  # Leave it as is if it's empty

    if re.fullmatch(r'\d{4}', text):  # Year only, e.g., '2005'
        return f'{text}-01-01'
    if re.fullmatch(r'\d{2}-\d{2}', text) or re.fullmatch(r'\d{2}-\w{3}', text):
        # YY-MM (e.g., '18-07') or YY-MMM (e.g., '19-Jul')
        short_year, month = text.split('-')
        year = '20' + short_year  # Assuming all dates are post-2005
        return f'{year}-{month}-01'

    return text  # Leave it as is if it doesn't match the format

# Load the data from CSV file
full_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\FullDF.csv')

# Apply preprocessing to the 'Date of party switch' column
full_df['Date of party switch'] = full_df['Date of party switch'].apply(preprocess_date)

# Convert to datetime; with errors='coerce' any value that cannot be parsed
# becomes NaT (it is NOT left as the original text)
full_df['Date of party switch'] = pd.to_datetime(full_df['Date of party switch'], errors='coerce')

# Find the oldest and newest dates, ignoring NaT values
oldest_date = full_df['Date of party switch'].min()
newest_date = full_df['Date of party switch'].max()

# Print the oldest and newest dates
print("Oldest date:", oldest_date)
print("Newest date:", newest_date)

# Save the dataframe to a new CSV file
full_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\dateflter1.csv', index=False)

# Check if all dates with values are formatted the same
def check_date_format(date_series):
    """Report whether every non-missing date is a plain ``YYYY-MM-DD`` date.

    Args:
        date_series: A pandas Series holding either date strings or
            already-parsed datetime values; missing values are ignored.

    Returns:
        True when all remaining values are consistent, otherwise False.
        For a datetime Series, consistent means no value carries a
        time-of-day component. For any other Series, each value must equal
        its own ``%Y-%m-%d`` rendering exactly. An empty Series yields True.
    """
    # Define the expected format
    expected_format = '%Y-%m-%d'

    # Drop NaN values
    date_series = date_series.dropna()

    if pd.api.types.is_datetime64_any_dtype(date_series):
        # Parsed timestamps never compare equal to their strftime strings, so
        # a string comparison would always report False here. Check instead
        # that every value is a pure date (midnight).
        return bool((date_series == date_series.dt.normalize()).all())

    # Convert to datetime and back to string in the expected format;
    # unparseable values become NaN and therefore fail the comparison below
    formatted_dates = pd.to_datetime(date_series, errors='coerce').dt.strftime(expected_format)

    # Check if all non-NaN dates match the expected format
    return bool((date_series == formatted_dates).all())

# --- Sanity checks and de-duplication of full_df ------------------------------
# Prints a few diagnostics about 'Date of party switch', then keeps one row per
# (Name, Country, State, Wikipedia_Link) and writes the result to dateflter3.csv.
consistent_format = check_date_format(full_df['Date of party switch'])

# Count how many dates are present in the 'Date of party switch' column
date_count = full_df['Date of party switch'].notna().sum()

print("All dates formatted the same:", consistent_format)
print("Number of dates in 'Date of party switch' column:", date_count)
# Ensure 'Date of party switch' is in datetime format
full_df['Date of party switch'] = pd.to_datetime(full_df['Date of party switch'], errors='coerce')

# Separate rows with and without 'Date of party switch'
with_date = full_df.dropna(subset=['Date of party switch'])
without_date = full_df[full_df['Date of party switch'].isna()]

# Sort by 'Name' and 'Date of party switch'
with_date = with_date.sort_values(by=['Name', 'Date of party switch'])

# Drop duplicates, keeping the first (earliest date)
with_date = with_date.drop_duplicates(subset=['Name', 'Country', 'State', 'Wikipedia_Link'], keep='first')

# Concatenate the cleaned dataframes back together
# NOTE: cleaned_full_df is only used for the count printed below; the frame
# that is saved further down is full_df, de-duplicated separately.
cleaned_full_df = pd.concat([with_date, without_date])

# Count the number of individual names with a 'Date of party switch'
unique_names_with_date = cleaned_full_df.dropna(subset=['Date of party switch'])['Name'].nunique()
print("Number of individual names with 'Date of party switch':", unique_names_with_date)



# Count the number of individual names with a 'Date of party switch'
# (same count as above, taken on full_df before de-duplication)
unique_names_with_date = full_df.dropna(subset=['Date of party switch'])['Name'].nunique()
print("Number of individual names with 'Date of party switch':", unique_names_with_date)

# Ensure 'Date of party switch' is in datetime format
full_df['Date of party switch'] = pd.to_datetime(full_df['Date of party switch'], errors='coerce')

# Identify observations where both 'Party' and 'New Party' are filled but 'Date of party switch' is not filled
missing_date_switch = full_df[(full_df['Party'].notna()) & (full_df['New Party'].notna()) & (full_df['Date of party switch'].isna())]

# Display the identified observations
print(missing_date_switch.head(20))
# Ensure 'Date of party switch' is in datetime format
full_df['Date of party switch'] = pd.to_datetime(full_df['Date of party switch'], errors='coerce')

# Drop duplicates, prioritizing rows with 'Date of party switch'
# Sorting ascending with NaT last means keep='first' retains the earliest dated
# row of each duplicate group, or an undated row when none has a date.
full_df = full_df.sort_values(by=['Date of party switch'], na_position='last')
full_df = full_df.drop_duplicates(subset=['Name', 'Country', 'State', 'Wikipedia_Link'], keep='first')

full_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\dateflter3.csv', index=False)

# --- Attach politician attributes to the sentiment scores ---------------------
# Load the score table (produced outside this file)
score_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\score_all.csv')

# Merge score_df with full_df based on the 'Name' column
# how='left' keeps every score row; rows without a match get NaN attributes.
# NOTE(review): full_df was de-duplicated on Name+Country+State+Wikipedia_Link,
# so a Name may still occur more than once there; merging on 'Name' alone would
# then repeat the matching score rows - confirm names are unique.
merged_df = pd.merge(score_df, full_df[['Name', 'Country', 'State', 'Date of party switch', 'Party', 'New Party']], on='Name', how='left')

# Check if there are any names in score_df that do not have a match in full_df
unmatched_names = score_df[~score_df['Name'].isin(full_df['Name'])]['Name']

if not unmatched_names.empty:
    print("Names in score_df that do not have a match in full_df:")
    print(unmatched_names)
else:
    print("All names in score_df have a match in full_df.")

# Save the merged dataframe to a new CSV file
merged_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\merged_score_full.csv', index=False)

# Reload the file that was just written (dates come back as plain strings)
merged_df=pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\merged_score_full.csv')
# Display unique entries in 'Party' and 'New Party' columns
unique_parties = merged_df['Party'].dropna().unique()
unique_new_parties = merged_df['New Party'].dropna().unique()

print("Unique entries in 'Party' column:")
print(unique_parties)

print("\nUnique entries in 'New Party' column:")
print(unique_new_parties)

# --- Attach party alignment information for the old and the new party ---------
# Load the datasets
merged_score_full_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\merged_score_full.csv')
party_afil_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\Party afil.csv', encoding='latin1')

# Remove spaces only on the outside of the words in 'Country' and 'Party' columns
# so that the merge keys below compare equal
merged_score_full_df['Country'] = merged_score_full_df['Country'].str.strip()
merged_score_full_df['Party'] = merged_score_full_df['Party'].str.strip()
merged_score_full_df['New Party'] = merged_score_full_df['New Party'].str.strip()

party_afil_df['Country'] = party_afil_df['Country'].str.strip()
party_afil_df['Party'] = party_afil_df['Party'].str.strip()

# Fill empty 'New Party' with 'Party'
# Assign the result back to the column: calling fillna(..., inplace=True) on a
# column selection is chained assignment, which pandas deprecates and which has
# no effect on the DataFrame when Copy-on-Write is enabled.
merged_score_full_df['New Party'] = merged_score_full_df['New Party'].fillna(merged_score_full_df['Party'])

# Merging based on Party and Country
# (adds the alignment columns describing the politician's original party)
merged_df = merged_score_full_df.merge(
    party_afil_df[['Country', 'Party', 'Party_Alignment', 'Party Right Wing']],
    how='left',
    left_on=['Country', 'Party'],
    right_on=['Country', 'Party']
)

# Merging based on New Party and Country
# The lookup table's own 'Party' column collides with the one already present,
# so it receives the '_New' suffix and is dropped right after the merge.
merged_df = merged_df.merge(
    party_afil_df[['Country', 'Party', 'New Party_Alignment', 'New Party Right Wing']],
    how='left',
    left_on=['Country', 'New Party'],
    right_on=['Country', 'Party'],
    suffixes=('', '_New')
)

# Drop redundant columns
merged_df = merged_df.drop(columns=['Party_New'])

# Save the merged dataframe to a new CSV file
merged_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\merged_score_full_with_party_info1.csv', index=False)

print(merged_df.head())

# --- Add word counts and drop observations without any sentiment value --------
# Load the datasets
# NOTE(review): the first file is written earlier in this script with pandas'
# default encoding but read here as latin1; non-ASCII text may come back
# garbled - confirm the intended encoding of both files before changing it,
# because the merge keys below must be decoded the same way on both sides.
merged_score_full_with_party_info_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\merged_score_full_with_party_info1.csv', encoding='latin1')
full_word_analyzed_score_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\full word analyzed_score.csv', encoding='latin1')

# Merging based on URL and Date
final_merged_df = merged_score_full_with_party_info_df.merge(
    full_word_analyzed_score_df[['url', 'date', 'word_count']],
    how='left',
    left_on=['url', 'date'],
    right_on=['url', 'date']
)


# Number of observations before deletion
initial_count = len(final_merged_df)

# Remove observations missing all three of 'positive', 'neutral', and 'negative'
# (how='all': a row is dropped only when every one of the three is missing)
final_merged_df = final_merged_df.dropna(subset=['positive', 'neutral', 'negative'], how='all')

# Number of observations after deletion
final_count = len(final_merged_df)

# Calculate the number and percentage of deleted observations
deleted_count = initial_count - final_count
# Guard the division so that an empty merge result reports 0% instead of
# raising ZeroDivisionError
deleted_percentage = (deleted_count / initial_count) * 100 if initial_count else 0.0

# Save the final merged dataframe to a new CSV file
final_merged_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_merged_with_word_count.csv', index=False)

# Display the merged dataframe and deletion stats
print("Merge complete. Output saved to 'final_merged_with_word_count.csv'.")
print(f"Number of observations deleted: {deleted_count}")
print(f"Percentage of observations deleted: {deleted_percentage:.2f}%")
# Remove the time-of-day part of each observation, keeping only the calendar date
final_merged_df['date'] = pd.to_datetime(final_merged_df['date']).dt.date

# Save the final merged dataframe to a new CSV file
final_merged_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF.csv', index=False)



# --- Direction-of-switch indicators ------------------------------------------
# Each flag is 1 when ANY of its conditions holds, else 0. Comparisons that
# involve NaN evaluate to False, so missing values only count where .isna() /
# .notna() is tested explicitly. The flags are computed independently, so a
# row can be flagged both 'more_left' and 'more_right' when the two measures
# disagree. The code treats a smaller alignment value as further left.

# Create 'more_left' column
#   - right-wing flag goes from 1 to 0, or
#   - the new party's alignment value is lower than the old one, or
#   - the old right-wing flag is missing and the new one is 0
final_merged_df['more_left'] = np.where(
    ((final_merged_df['Party Right Wing'] == 1) & (final_merged_df['New Party Right Wing'] == 0)) |
    (final_merged_df['Party_Alignment'] > final_merged_df['New Party_Alignment']) |
    (final_merged_df['Party Right Wing'].isna() & (final_merged_df['New Party Right Wing'] == 0)),
    1, 0
)

# Create 'more_right' column
#   - right-wing flag goes from 0 to 1, or
#   - the new party's alignment value is higher than the old one, or
#   - the old alignment is known but the new alignment is missing, or
#   - the old right-wing flag is missing and the new one is 1
# NOTE(review): the third condition has no counterpart in 'more_left' - confirm
# that a missing new alignment is meant to count as a move to the right.
final_merged_df['more_right'] = np.where(
    ((final_merged_df['Party Right Wing'] == 0) & (final_merged_df['New Party Right Wing'] == 1)) |
    (final_merged_df['Party_Alignment'] < final_merged_df['New Party_Alignment']) |
    (final_merged_df['Party_Alignment'].notna() & final_merged_df['New Party_Alignment'].isna()) |
    (final_merged_df['Party Right Wing'].isna() & (final_merged_df['New Party Right Wing'] == 1)),
    1, 0
)

# Create 'equal' column
# '&' binds tighter than '|', so this reads:
#   (same right-wing flag AND same alignment) OR (both right-wing flags missing)
final_merged_df['equal'] = np.where(
    (final_merged_df['Party Right Wing'] == final_merged_df['New Party Right Wing']) &
    (final_merged_df['Party_Alignment'] == final_merged_df['New Party_Alignment'])|
    (final_merged_df['Party Right Wing'].isna() & (final_merged_df['New Party Right Wing'].isna())),
    1, 0
)

# Ensure date columns are in datetime format
final_merged_df['Date of party switch'] = pd.to_datetime(final_merged_df['Date of party switch'], errors='coerce')
final_merged_df['date'] = pd.to_datetime(final_merged_df['date'], errors='coerce')

# Create 'treated' column
# 1 only for observations dated strictly after the politician's switch date;
# observations on the switch date itself, and politicians without a switch
# date, get 0.
final_merged_df['treated'] = np.where(
    final_merged_df['Date of party switch'].notna() & (final_merged_df['date'] > final_merged_df['Date of party switch']),
    1, 0
)
# Analysis starts below

# Load the dataset
# NOTE(review): this replaces the in-memory final_merged_df with the contents
# of FinalDFForAnalysisCleanScore.csv, a file that is not written anywhere in
# the visible code - presumably it was prepared separately; confirm that it
# contains the 'equal', 'more_left' and 'more_right' columns used below.
file_path = 'C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\FinalDFForAnalysisCleanScore.csv'
final_merged_df = pd.read_csv(file_path)
final_merged_df = final_merged_df.drop_duplicates()
# Split into two DataFrames
# Rows flagged 'equal' are appended to BOTH subsets.
df_equal = final_merged_df[final_merged_df['equal'] == 1]

# Rows flagged more_left only (rows flagged in both directions are excluded)
df_more_left = final_merged_df[(final_merged_df['more_left'] == 1) & (final_merged_df['more_right'] != 1)]
df_more_left = pd.concat([df_more_left, df_equal])

# Rows flagged more_right only (rows flagged in both directions are excluded)
df_more_right = final_merged_df[(final_merged_df['more_right'] == 1) & (final_merged_df['more_left'] != 1)]
df_more_right = pd.concat([df_more_right, df_equal])
# Save the split DataFrames to new CSV files
df_more_left.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF_more_left.csv', index=False)
df_more_right.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF_more_right.csv', index=False)
final_merged_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\FinalDFForAnalysisWithALL.csv', index=False)




# In [None]:
# Load the CSV files into DataFrames
df_more_left = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF_more_left.csv')
df_more_right = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF_more_right.csv')
final_merged_df = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\FinalDFForAnalysisWithALL.csv')

# Work on the full dataset; `df` is another name for the same DataFrame object,
# so columns added to `df` below also appear on final_merged_df
df = final_merged_df

# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Group by date and calculate the mean sentiment score
avg_sentiment_over_time = df.groupby('date')['score'].mean().reset_index()

# Smooth the data using a rolling window of 150 rows, i.e. 150 consecutive
# dates that have data (equal to 150 calendar days only if no date is missing).
# The first 149 values are NaN and are therefore not drawn.
avg_sentiment_over_time['score'] = avg_sentiment_over_time['score'].rolling(window=150).mean()

# Create a plot with a colorblind-friendly color
# matplotlib accepts the hex string directly; the previous to_rgba(...) wrapper
# was never imported in this file and raised NameError.
plt.figure(figsize=(15, 10))
plt.plot(avg_sentiment_over_time['date'], avg_sentiment_over_time['score'], label='Average Sentiment (150-day Rolling Average)',
         color='#117733', linewidth=2)  # Colorblind-friendly green

# Customize the plot with labels, title, and grid
plt.xlabel('Date', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment Over Time for All Politicians', fontsize=16)
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.5)  # Lighter grid lines for better readability
plt.show()



# Create a new column to indicate treated group based on "Date of party switch"
# 1 = the row has a switch date (politician who switched party), 0 = no switch
# date. This is a per-politician flag and does not depend on whether the
# observation falls before or after the switch.
df['treated_group'] = df['Date_of_party_switch'].notna().astype(int)

# Group by date and treated group, then calculate the mean sentiment score
avg_sentiment_treated = df.groupby(['date', 'treated_group'])['score'].mean().reset_index()

# Smooth the data within each group using a rolling window of 150 rows
# (150 consecutive dates with data for that group, not necessarily 150
# calendar days); the first 149 values of each group are NaN
avg_sentiment_treated['score'] = avg_sentiment_treated.groupby('treated_group')['score'].transform(lambda x: x.rolling(window=150).mean())

# Prepare a colorblind-friendly color palette
colors = ['#117733', '#CC6677']  # Green and pink, colorblind-friendly

# Plot the average sentiment over time for treated and untreated groups
# (colors are assigned in order of first appearance of each group value)
plt.figure(figsize=(15, 10))
for idx, treated in enumerate(avg_sentiment_treated['treated_group'].unique()):
    group_data = avg_sentiment_treated[avg_sentiment_treated['treated_group'] == treated]
    plt.plot(group_data['date'], group_data['score'], label=f'Treated Group: {treated}', color=colors[idx], linewidth=2)

# Customize the plot with labels, title, and grid
plt.xlabel('Date', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment Over Time by Treated Group (Date of Party Switch)', fontsize=16)
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.5)  # Lighter grid lines for better readability
plt.show()
# --- Untreated politicians only: sentiment over time by party attribute -------
# Filter for treated=0 and group by date and Party_Alignment
# (groupby drops rows whose Party_Alignment is missing)
avg_sentiment_party_alignment = df[df['treated_group'] == 0].groupby(['date', 'Party_Alignment'])['score'].mean().reset_index()

# Smooth the data within each alignment using a rolling window of 150 rows
# (consecutive dates with data for that alignment, not calendar days)
avg_sentiment_party_alignment['score'] = avg_sentiment_party_alignment.groupby('Party_Alignment')['score'].transform(lambda x: x.rolling(window=150).mean())

# Prepare a colorblind-friendly color palette
colors = ['#88CCEE', '#DDCC77', '#117733', '#CC6677']  # Blue, yellow, green, pink - all colorblind-friendly

# Plot the average sentiment over time for treated=0, split by Party_Alignment
# (the palette is reused cyclically if there are more alignments than colors)
plt.figure(figsize=(15, 10))
for idx, alignment in enumerate(avg_sentiment_party_alignment['Party_Alignment'].unique()):
    alignment_data = avg_sentiment_party_alignment[avg_sentiment_party_alignment['Party_Alignment'] == alignment]
    plt.plot(alignment_data['date'], alignment_data['score'], label=f'Party_Alignment: {alignment}', color=colors[idx % len(colors)], linewidth=2)

# Customize the plot with labels, title, and grid
plt.xlabel('Date', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment Over Time for Treated=0, Split by Party_Alignment (150-day Rolling Average)', fontsize=16)
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.5)  # Lighter and dashed grid lines for better readability
plt.show()

# Filter for treated=0 and group by date and Party Right Wing
avg_sentiment_right_wing = df[df['treated_group'] == 0].groupby(['date', 'Party_Right_Wing'])['score'].mean().reset_index()

# Smooth the data within each right-wing value using a rolling window of 150 rows
avg_sentiment_right_wing['score'] = avg_sentiment_right_wing.groupby('Party_Right_Wing')['score'].transform(lambda x: x.rolling(window=150).mean())

# Prepare a colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7']  # A set of distinct colors for colorblind viewers

# Plot the average sentiment over time for treated=0, split by Party Right Wing
plt.figure(figsize=(15, 10))
for idx, right_wing in enumerate(avg_sentiment_right_wing['Party_Right_Wing'].unique()):
    right_wing_data = avg_sentiment_right_wing[avg_sentiment_right_wing['Party_Right_Wing'] == right_wing]
    plt.plot(right_wing_data['date'], right_wing_data['score'], label=f'Party Right Wing: {right_wing}', color=colors[idx % len(colors)], linewidth=2)

# Customize the plot with labels, title, and grid
plt.xlabel('Date', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment Over Time for Treated=0, Split by Party Right Wing (150-day Rolling Average)', fontsize=16)
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.5)  # Lighter and dashed grid lines for better readability
plt.show()
# --- Bar charts of mean sentiment per Party_Alignment -------------------------
# Three charts follow: untreated only, treated only, and all observations.
# Each one overwrites avg_sentiment_party_alignment_bar, colors and bars.

# Filter for treated=0 and group by Party_Alignment
avg_sentiment_party_alignment_bar = df[df['treated_group'] == 0].groupby('Party_Alignment')['score'].mean().reset_index()

# Colorblind-friendly color palette
colors = ['#0072B2', '#E69F00', '#F0E442', '#009E73', '#56B4E9', '#D55E00', '#CC79A7']

# Plot bar graph for average sentiment for treated=0, split by Party_Alignment
# (one color per bar, taken from the start of the palette)
plt.figure(figsize=(15, 10))
bars = plt.bar(avg_sentiment_party_alignment_bar['Party_Alignment'], avg_sentiment_party_alignment_bar['score'],
               color=colors[:len(avg_sentiment_party_alignment_bar)], width=0.6)  # Adjust bar width here

# Customize plot with labels, title
plt.xlabel('Party_Alignment', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment for Treated=0, Split by Party_Alignment', fontsize=16)

# Improve aesthetics for the bar chart
plt.xticks(rotation=45)  # Rotate labels to fit better if needed
plt.gca().set_facecolor('#f0f0f0')  # Light gray background for better contrast
plt.grid(True, linestyle='--', alpha=0.5, axis='y')  # Grid only horizontally

# Display the plot
plt.show()

# Filter for treated=1 and group by Party_Alignment
avg_sentiment_party_alignment_bar = df[df['treated_group'] == 1].groupby('Party_Alignment')['score'].mean().reset_index()

# Colorblind-friendly color palette
colors = ['#0072B2', '#E69F00', '#F0E442', '#009E73', '#56B4E9', '#D55E00', '#CC79A7']

# Plot bar graph for average sentiment for treated=1, split by Party_Alignment
plt.figure(figsize=(15, 10))
bars = plt.bar(avg_sentiment_party_alignment_bar['Party_Alignment'], avg_sentiment_party_alignment_bar['score'],
               color=colors[:len(avg_sentiment_party_alignment_bar)], width=0.6)  # Adjust bar width here

# Customize plot with labels, title
plt.xlabel('Party_Alignment', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment for Treated=1, Split by Party_Alignment', fontsize=16)

# Improve aesthetics for the bar chart
plt.xticks(rotation=45)  # Rotate labels to fit better if needed
plt.gca().set_facecolor('#f0f0f0')  # Light gray background for better contrast
plt.grid(True, linestyle='--', alpha=0.5, axis='y')  # Grid only horizontally

# Display the plot
plt.show()


# No treated filter here: group ALL observations by Party_Alignment
avg_sentiment_party_alignment_bar = df.groupby('Party_Alignment')['score'].mean().reset_index()

# Colorblind-friendly color palette
colors = ['#0072B2', '#E69F00', '#F0E442', '#009E73', '#56B4E9', '#D55E00', '#CC79A7']

# Plot bar graph for average sentiment over all observations, split by Party_Alignment
plt.figure(figsize=(15, 10))
bars = plt.bar(avg_sentiment_party_alignment_bar['Party_Alignment'], avg_sentiment_party_alignment_bar['score'],
               color=colors[:len(avg_sentiment_party_alignment_bar)], width=0.6)  # Adjust bar width here

# Customize plot with labels, title
plt.xlabel('Party_Alignment', fontsize=14)
plt.ylabel('Average Sentiment Score', fontsize=14)
plt.title('Average Sentiment, Split by Party_Alignment', fontsize=16)

# Improve aesthetics for the bar chart
plt.xticks(rotation=45)  # Rotate labels to fit better if needed
plt.gca().set_facecolor('#f0f0f0')  # Light gray background for better contrast
plt.grid(True, linestyle='--', alpha=0.5, axis='y')  # Grid only horizontally

# Display the plot
plt.show()
# Palette shared by the right-wing bar charts (colorblind-friendly):
# dark blue, teal, light blue, orange
colors = [
    '#0072B2',
    '#009E73',
    '#56B4E9',
    '#D55E00',
]

# Bar chart of the mean sentiment per 'Party_Right_Wing' value for one treated group
def plot_avg_sentiment(df, treated, title):
    """Draw and show a bar chart of mean 'score' per 'Party_Right_Wing' value.

    Only the rows of ``df`` whose 'treated_group' equals ``treated`` are used.
    Bar colors are taken from the start of the module-level ``colors`` list as
    it is defined at the time of the call.

    Args:
        df: DataFrame with 'treated_group', 'Party_Right_Wing' and 'score' columns.
        treated: Value of 'treated_group' to select.
        title: Title placed above the chart.
    """
    selected_rows = df.loc[df['treated_group'] == treated]
    mean_scores = selected_rows.groupby('Party_Right_Wing')['score'].mean().reset_index()
    bar_colors = colors[:len(mean_scores)]

    # Slim figure with narrow, outlined bars
    plt.figure(figsize=(15, 7))
    plt.bar(
        mean_scores['Party_Right_Wing'],
        mean_scores['score'],
        color=bar_colors,
        width=0.5,
        edgecolor='black',
    )

    # Titles and axis labels
    plt.title(title, fontsize=14)
    plt.xlabel('Party Right Wing', fontsize=12)
    plt.ylabel('Average Sentiment Score', fontsize=12)

    # Small, rotated tick labels
    plt.xticks(rotation=45, fontsize=10)
    plt.yticks(fontsize=10)

    # Light gray background with horizontal grid lines only
    axes = plt.gca()
    axes.set_facecolor('#f0f0f0')
    plt.grid(True, linestyle='--', alpha=0.5, axis='y', which='major')

    # Make sure nothing overlaps, then render
    plt.tight_layout()
    plt.show()

# Call function for treated=0
plot_avg_sentiment(df, 0, 'Average Sentiment for Treated=0, Split by Party Right Wing')

# Call function for treated=1
plot_avg_sentiment(df, 1, 'Average Sentiment for Treated=1, Split by Party Right Wing')

# No treated filter here: group ALL observations by Party Right Wing
avg_sentiment_right_wing_bar = df.groupby('Party_Right_Wing')['score'].mean().reset_index()

## Colorblind-friendly color palette, subtle and professional
# (this rebinds the module-level `colors`, which plot_avg_sentiment reads when
# it is called, so later calls to it would use this palette)
colors = ['#0072B2', '#56B4E9', '#009E73', '#E69F00']  # Blue, light blue, teal, orange

# Plot bar graph for average sentiment over all observations, split by Party Right Wing
plt.figure(figsize=(15, 7))  # Adjusted for a slimmer look
bars = plt.bar(avg_sentiment_right_wing_bar['Party_Right_Wing'], avg_sentiment_right_wing_bar['score'],
               color=colors[:len(avg_sentiment_right_wing_bar)], width=0.5, edgecolor='black')  # More elegant bar width and definition

# Customize the plot
plt.xlabel('Party Right Wing', fontsize=12)
plt.ylabel('Average Sentiment Score', fontsize=12)
plt.title('Average Sentiment, Split by Party Right Wing', fontsize=14)
plt.xticks(rotation=45, fontsize=10)  # Smaller font size for labels
plt.yticks(fontsize=10)
plt.gca().set_facecolor('#f0f0f0')  # Subtle background color for better contrast
plt.grid(True, linestyle='--', alpha=0.5, axis='y')  # Subtle grid for better readability
plt.tight_layout()  # Ensures everything fits well without overlapping

# Show the plot
plt.show()
# Colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7']  # Orange, sky blue, teal, yellow, dark blue, red-orange, pink

def plot_overlap_line_graph(df, group_col, title, window=150):
    """Plot smoothed mean sentiment over time for every treated/group combination.

    One line is drawn per combination of 'treated_group' value and
    ``group_col`` value, all on the same axes. Colors are taken in turn from
    the module-level ``colors`` list and reused cyclically.

    Args:
        df: DataFrame with 'treated_group', 'date', 'score' and ``group_col`` columns.
        group_col: Name of the column used to split each treated group.
        title: Title placed above the chart.
        window: Number of consecutive rows (dates with data for that
            combination) averaged by the rolling mean. Combinations with fewer
            rows than this produce no visible line.
    """
    plt.figure(figsize=(15, 10))
    color_index = 0  # Index to keep track of colors for lines

    # Missing values never compare equal to anything, so a NaN group would
    # only yield an empty line that still takes a legend entry and a color.
    treated_values = df['treated_group'].dropna().unique()
    group_values = df[group_col].dropna().unique()

    for treated in treated_values:
        for group in group_values:
            in_combination = (df['treated_group'] == treated) & (df[group_col] == group)
            group_data = df[in_combination].groupby('date')['score'].mean().reset_index()
            if group_data.empty:
                # This combination does not occur in the data: nothing to draw
                continue
            group_data['score'] = group_data['score'].rolling(window=window).mean()  # Smooth the data
            # Assign a color from the palette and increment the index
            plt.plot(group_data['date'], group_data['score'], label=f'Treated={treated}, {group_col}={group}', color=colors[color_index % len(colors)])
            color_index += 1
    plt.xlabel('Date')
    plt.ylabel('Average Sentiment Score')
    plt.title(title)
    plt.legend(loc='best', fontsize='small', title_fontsize='medium')
    plt.grid(True)
    plt.show()

# Overlay the treated groups on one figure, split first by party alignment, then by the right-wing flag
plot_overlap_line_graph(df, 'Party_Alignment', 'Average Sentiment Over Time Overlapping Treated Groups, Split by Party_Alignment (150-day Rolling Average)')
# The title previously advertised a 7-day average although the function smooths with a window of 150.
# NOTE(review): this call addresses the column as 'Party Right Wing' while other cells use 'Party_Right_Wing' - confirm which spelling df carries at this point.
plot_overlap_line_graph(df, 'Party Right Wing', 'Average Sentiment Over Time Overlapping Treated Groups, Split by Party Right Wing (150-day Rolling Average)')

# (start_year, end_year) pairs, both ends inclusive, used for the per-period bar charts
date_ranges = [(2000, 2006), (2007, 2011), (2012, 2019), (2020, 2024)]

def plot_bar_graph(df, group_col, title, xlabel='Group', ylabel='Average Sentiment Score'):
    """Show a bar chart of the mean 'score' for each value of `group_col`.

    Args:
        df: DataFrame holding `group_col` and a 'score' column.
        group_col: Column whose values become the bars.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
    """
    plt.figure(figsize=(15, 10))
    # as_index=False keeps the grouping column as a regular column next to 'score'
    means = df.groupby(group_col, as_index=False)['score'].mean()
    categories = means[group_col]
    heights = means['score']
    plt.bar(categories, heights)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_bar_graph_by_date_range(df, group_col, date_ranges, treated_label):
    """Draw one bar chart per period in `date_ranges`.

    Args:
        df: DataFrame with a datetime 'date' column plus the columns plot_bar_graph needs.
        group_col: Column passed through to plot_bar_graph.
        date_ranges: Iterable of (start_year, end_year) pairs; both ends are inclusive.
        treated_label: Value shown after "Treated=" in each chart title.
    """
    for first_year, last_year in date_ranges:
        years = df['date'].dt.year
        in_period = (years >= first_year) & (years <= last_year)
        chart_title = f'Average Sentiment ({first_year}-{last_year}) by {group_col} for Treated={treated_label}'
        plot_bar_graph(df[in_period], group_col, chart_title)

# Per-period bar charts for every (split column, treated value) pair:
# both treated values for 'Party_Alignment' first, then both for 'Party Right Wing'.
for split_col in ('Party_Alignment', 'Party Right Wing'):
    for treated_value in (0, 1):
        plot_bar_graph_by_date_range(df[df['treated'] == treated_value], split_col, date_ranges, treated_label=treated_value)


# Colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7']  # Orange, sky blue, teal, yellow, dark blue, red-orange, pink

plt.figure(figsize=(15, 10))
countries = df['Country'].unique()

# One smoothed line per country; the palette wraps around if countries outnumber colors
for color_index, country in enumerate(countries):
    country_data = df[df['Country'] == country]
    avg_sentiment_over_time = country_data.groupby('date', as_index=False)['score'].mean()
    # Rolling mean over 150 consecutive per-date averages
    avg_sentiment_over_time['score'] = avg_sentiment_over_time['score'].rolling(window=150).mean()
    plt.plot(
        avg_sentiment_over_time['date'],
        avg_sentiment_over_time['score'],
        label=f'Average Sentiment in {country}',
        color=colors[color_index % len(colors)],
    )

plt.title('Average Sentiment Over Time for All Politicians by Country', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Average Sentiment Score', fontsize=12)
plt.legend(loc='best', fontsize='small', title_fontsize='medium')
plt.grid(True)
plt.show()
# Single figure: one smoothed line per (country, treated group) pair
plt.figure(figsize=(15, 10))
for country in countries:
    # Take an explicit copy: adding a column to a filtered slice of df raises
    # SettingWithCopyWarning and leaves it unclear which object is modified.
    country_data = df[df['Country'] == country].copy()
    # 1 when a party-switch date is recorded for the row, 0 otherwise
    country_data['treated_group'] = country_data['Date of party switch'].notna().astype(int)
    avg_sentiment_treated = country_data.groupby(['date', 'treated_group'])['score'].mean().reset_index()
    # Smooth each treated group separately with a rolling mean over 150 per-date averages
    avg_sentiment_treated['score'] = avg_sentiment_treated.groupby('treated_group')['score'].transform(lambda x: x.rolling(window=150).mean())
    for treated in avg_sentiment_treated['treated_group'].unique():
        group_data = avg_sentiment_treated[avg_sentiment_treated['treated_group'] == treated]
        plt.plot(group_data['date'], group_data['score'], label=f'{country}, Treated Group: {treated}')

plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.title('Average Sentiment Over Time by Treated Group and Country')
plt.legend(loc='best')
plt.grid(True)
plt.show()

# Colorblind-friendly color palette
colors = ['#0072B2', '#E69F00', '#F0E442', '#009E73', '#56B4E9', '#D55E00', '#CC79A7']  # Blue, orange, yellow, teal, sky blue, red-orange, pink

countries = df['Country'].unique()

# One figure per country, restricted to rows with treated_group == 0,
# with one smoothed line per Party_Alignment value
for country in countries:
    country_untreated = (df['Country'] == country) & (df['treated_group'] == 0)
    country_data = df[country_untreated]
    per_date_means = country_data.groupby(['date', 'Party_Alignment'])['score'].mean()
    avg_sentiment_party_alignment = per_date_means.reset_index()
    # Rolling mean over 120 per-date averages, computed within each alignment
    avg_sentiment_party_alignment['score'] = (
        avg_sentiment_party_alignment
        .groupby('Party_Alignment')['score']
        .transform(lambda x: x.rolling(window=120).mean())
    )

    plt.figure(figsize=(15, 10))
    alignments = avg_sentiment_party_alignment['Party_Alignment'].unique()
    for color_index, alignment in enumerate(alignments):
        alignment_data = avg_sentiment_party_alignment[avg_sentiment_party_alignment['Party_Alignment'] == alignment]
        plt.plot(
            alignment_data['date'],
            alignment_data['score'],
            label=f'Party_Alignment: {alignment}',
            color=colors[color_index % len(colors)],  # wrap around the palette
        )

    plt.title(f'Average Sentiment Over Time for Treated=0 in {country}, Split by Party_Alignment', fontsize=14)
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Average Sentiment Score', fontsize=12)
    plt.legend(loc='best', fontsize='small', title_fontsize='medium')
    plt.grid(True)
    plt.show()

# Single figure, rows with treated == 0 only: one smoothed line per
# (country, 'Party Right Wing' value) pair
plt.figure(figsize=(15, 10))
for country in countries:
    country_untreated = (df['Country'] == country) & (df['treated'] == 0)
    country_data = df[country_untreated]
    per_date_means = country_data.groupby(['date', 'Party Right Wing'])['score'].mean()
    avg_sentiment_right_wing = per_date_means.reset_index()
    # Rolling mean over 150 per-date averages, computed within each 'Party Right Wing' value
    avg_sentiment_right_wing['score'] = (
        avg_sentiment_right_wing
        .groupby('Party Right Wing')['score']
        .transform(lambda x: x.rolling(window=150).mean())
    )

    for right_wing in avg_sentiment_right_wing['Party Right Wing'].unique():
        matches_value = avg_sentiment_right_wing['Party Right Wing'] == right_wing
        right_wing_data = avg_sentiment_right_wing[matches_value]
        plt.plot(
            right_wing_data['date'],
            right_wing_data['score'],
            label=f'{country}, Party Right Wing: {right_wing}',
        )

plt.title('Average Sentiment Over Time for Treated=0, Split by Party Right Wing and Country')
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.legend(loc='best')
plt.grid(True)
plt.show()



# Expects df to be the analysis DataFrame already in scope

# Colorblind-friendly color palette, subtle yet distinct
colors = ['#0072B2', '#D55E00', '#009E73', '#56B4E9', '#CC79A7']  # Dark blue, red-orange, teal, light blue, pink

countries = df['Country'].unique()

# One bar chart per country: mean score for each Party_Right_Wing value
for country in countries:
    country_data = df[df['Country'] == country]
    avg_sentiment_right_wing_bar = country_data.groupby('Party_Right_Wing', as_index=False)['score'].mean()
    bar_colors = colors[:len(avg_sentiment_right_wing_bar)]  # one palette entry per bar

    plt.figure(figsize=(10, 6))
    bars = plt.bar(
        avg_sentiment_right_wing_bar['Party_Right_Wing'],
        avg_sentiment_right_wing_bar['score'],
        color=bar_colors,
        width=0.6,
        edgecolor='black',
    )

    plt.title(f'Average Sentiment by Party Right Wing in {country}', fontsize=14)
    plt.xlabel('Party Right Wing', fontsize=12)
    plt.ylabel('Average Sentiment Score', fontsize=12)
    plt.xticks(rotation=45, fontsize=10)
    plt.yticks(fontsize=10)
    plt.gca().set_facecolor('#f0f0f0')  # light gray plot background
    plt.grid(True, linestyle='--', alpha=0.5, axis='y')  # dashed horizontal grid lines only
    plt.tight_layout()
    plt.show()

# Mean word_count for each country
avg_word_count_by_country = df.groupby('Country', as_index=False)['word_count'].mean()

# Colorblind-friendly color palette, elegant and suitable for all viewers
colors = ['#0072B2', '#009E73', '#56B4E9', '#D55E00', '#CC79A7']  # Dark blue, teal, light blue, red-orange, pink

plt.figure(figsize=(12, 8))
bar_colors = colors[:len(avg_word_count_by_country)]  # one palette entry per country
bars = plt.bar(
    avg_word_count_by_country['Country'],
    avg_word_count_by_country['word_count'],
    color=bar_colors,
    width=0.6,
    edgecolor='black',
)

plt.title('Average Number of Words by Country', fontsize=14)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Average Number of Words', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
plt.gca().set_facecolor('#f0f0f0')  # soft grey plot background
plt.grid(True, linestyle='--', alpha=0.5, axis='y')  # dashed grid on the y-axis only
plt.tight_layout()
plt.show()


# Prepare data for regression: keep rows that have both a score and a party alignment.
# .copy() makes df_clean an independent frame, so recoding the column below does not
# trigger SettingWithCopyWarning on a frame derived from df.
df_clean = df.dropna(subset=['score', 'Party_Alignment']).copy()

# Convert categorical 'Party_Alignment' to numeric if not already numeric.
# pd.Categorical(...).codes maps each distinct label to an integer code; the codes
# carry no ordering beyond the categories' sort order.
if df_clean['Party_Alignment'].dtype == 'object':
    df_clean['Party_Alignment'] = pd.Categorical(df_clean['Party_Alignment']).codes

# Linear regression using seaborn with colorblind-friendly settings
plt.figure(figsize=(12, 8))
sns.regplot(x='Party_Alignment', y='score', data=df_clean, scatter_kws={'s': 50, 'alpha': 0.5}, line_kws={'color': '#0072B2'})
plt.xlabel('Party Alignment', fontsize=12)
plt.ylabel('Sentiment Score', fontsize=12)
plt.title('Sentiment vs. Party Alignment', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.5)  # Dashed grid on both axes
plt.show()

# Linear regression using statsmodels for more details
X = df_clean['Party_Alignment']
y = df_clean['score']
X = sm.add_constant(X)  # Adds a constant term (intercept) to the predictor

model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # Fitted values for the rows used in the regression

# Print the summary of the regression
print(model.summary())
# Pairwise correlation between the sentiment score and the Party_Right_Wing column
corr_columns = ['score', 'Party_Right_Wing']
corr_df = df[corr_columns]
corr_matrix = corr_df.corr()

# Draw the matrix as an annotated heatmap using the 'viridis' colormap, centered at 0
plt.figure(figsize=(8, 6))
heatmap_options = {'annot': True, 'cmap': 'viridis', 'center': 0}
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix: Sentiment with Party Right Wing', fontsize=12)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=10)  # same tick font size on both axes
plt.show()
# Drop rows with missing values in the relevant columns
df_clean = df.dropna(subset=['score', 'Party_Alignment', 'Country'])

# Convert the 'Country' column to dummy variables (first category dropped as the baseline).
# dtype=float keeps the dummies numeric: pandas >= 2.0 returns bool columns by default,
# which makes the design matrix object-dtype and causes sm.OLS to reject it.
df_dummies = pd.get_dummies(df_clean, columns=['Country'], drop_first=True, dtype=float)

# Define the independent variables (including dummy variables for countries)
country_dummy_cols = [col for col in df_dummies.columns if col.startswith('Country_')]
X = df_dummies[['Party_Alignment'] + country_dummy_cols]
y = df_dummies['score']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Display the regression results
print(model.summary())



In [None]:
# Load the dataset
file_path = 'C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\FinalDFForAnalysisWithALL.csv'
df = pd.read_csv(file_path)

# Flag rows of politicians that have a party-switch date recorded (1) or not (0)
df['ever_treated'] = df['Date_of_party_switch'].notna().astype(int)

# Write the flagged data back to the same file. index=False matters here: without it the
# row index is saved as an extra unnamed column, and because this file is read again at
# the top of this cell, every run would add one more such column.
df.to_csv(file_path, index=False)

# Convert date to datetime format for accurate time calculations
df['date'] = pd.to_datetime(df['date'])

# Function to process each individual
def process_individual(group):
    """Trim one politician's rows to a short window around their treatment.

    Rows are taken in the order they appear in `group`; the function does not
    sort, so callers are expected to pass rows in chronological order.

    Args:
        group: DataFrame with the rows of a single politician. Must contain
            'ever_treated', 'treated_dummy' and a datetime 'date' column.

    Returns:
        * `group` unchanged when the first 'ever_treated' value is not 1.
        * An empty DataFrame when the politician is ever-treated but no row
          has treated_dummy == 1.
        * Otherwise the last (up to) 4 rows with treated_dummy == 0 followed by
          the first (up to) 4 rows with treated_dummy == 1, keeping only rows
          dated within 2 years of the first treated row.
    """
    if group['ever_treated'].iloc[0] != 1:
        return group

    is_treated_row = group['treated_dummy'] == 1
    if not is_treated_row.any():
        return pd.DataFrame()  # Return empty DataFrame if no treatment time

    # Date of the first treated observation; positional access avoids any
    # ambiguity if the index labels are not unique.
    treatment_date = group.loc[is_treated_row, 'date'].iloc[0]

    # Select up to 4 pre-treatment and up to 4 post-treatment observations
    pre_treatment = group[group['treated_dummy'] == 0].iloc[-4:]
    post_treatment = group[is_treated_row].iloc[:4]
    combined = pd.concat([pre_treatment, post_treatment])

    # Drop observations more than 2 years before or after treatment. The filter is
    # applied to the rows that are returned; previously it was computed on `group`
    # and then discarded, so it never had any effect.
    min_date = treatment_date - pd.DateOffset(years=2)
    max_date = treatment_date + pd.DateOffset(years=2)
    within_window = (combined['date'] >= min_date) & (combined['date'] <= max_date)
    return combined[within_window]

# Run process_individual on each politician's rows (grouped on 'Name') and
# flatten the pieces back into one frame with a fresh index
per_name_groups = df.groupby('Name')
processed_df = per_name_groups.apply(process_individual)
processed_df = processed_df.reset_index(drop=True)

# Persist the processed data, then read it back from disk as `data`
processed_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\Processed_FinalDF.csv', index=False)
data = pd.read_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\Processed_FinalDF.csv')

# Row subsets: more_left == 1, more_right == 1, and never-treated (control)
left_treatment_group = data.loc[data['more_left'] == 1]
right_treatment_group = data.loc[data['more_right'] == 1]
control_group = data.loc[data['ever_treated'] == 0]

# Each treatment subset is stacked on top of the same control rows:
# left_treatment_plus_control  - more_left rows followed by control rows
# right_treatment_plus_control - more_right rows followed by control rows
left_treatment_plus_control = pd.concat(objs=[left_treatment_group, control_group])
right_treatment_plus_control = pd.concat(objs=[right_treatment_group, control_group])

# Save both combined datasets (right first, then left)
right_treatment_plus_control.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\RightDF.csv', index=False)
left_treatment_plus_control.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\LeftDF.csv', index=False)

# Per-politician statistics computed from `data`: one output row per 'Name'.
# Each entry maps an output column to (source column, aggregation).
per_politician_aggregations = {
    'avg_score_per_politician': ('score', 'mean'),
    'avg_word_count_per_politician': ('word_count', 'mean'),
    'Party_Alignment': ('Party_Alignment', 'mean'),
    'ever_treated': ('ever_treated', 'first'),
    'Party_Right_Wing': ('Party_Right_Wing', 'first'),
    'Country': ('Country', 'first'),
}
per_politician_stats_corrected = (
    data.groupby('Name')
    .agg(**per_politician_aggregations)
    .reset_index()
)

# Function to calculate mean and standard error
def mean_and_se(series):
    """Return (mean, standard error of the mean) for a pandas Series.

    The standard error is the sample standard deviation (ddof=1) divided by the
    square root of the number of non-missing observations.
    """
    n_observed = series.count()  # non-missing values only
    sample_sd = np.std(series, ddof=1)
    return series.mean(), sample_sd / np.sqrt(n_observed)

# Per-politician subsets for the Treatment and Control groups
treatment_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['ever_treated'] == 1.0]
control_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['ever_treated'] == 0.0]

# Right Wing and Left Wing groups
right_wing_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['Party_Right_Wing'] == 1.0]
left_wing_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['Party_Right_Wing'] == 0.0]

# By Country groups
usa_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['Country'] == 'USA']
canada_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['Country'] == 'Canada']
uk_stats_corrected = per_politician_stats_corrected[per_politician_stats_corrected['Country'] == 'UK']

# NOTE(review): this cell used `all_politicians_stats_corrected` without ever assigning it.
# It is taken here to be the full per-politician table - confirm.
all_politicians_stats_corrected = per_politician_stats_corrected

# Pre-treatment observations (treated_dummy == 0) of ever-treated politicians,
# averaged per politician
pre_treatment_stats_corrected = data[(data['ever_treated'] == 1.0) & (data['treated_dummy'] == 0)]
pre_treatment_per_politician_stats_corrected = pre_treatment_stats_corrected.groupby(['Name']).agg(
    avg_score_per_politician=('score', 'mean'),
    avg_word_count_per_politician=('word_count', 'mean'),
    Party_Alignment=('Party_Alignment', 'mean')
).reset_index()


def _format_mean_se(series):
    """Format a column as 'mean (SE)' with two decimals each, using mean_and_se."""
    mean, se = mean_and_se(series)
    return f"{mean:.2f} ({se:.2f})"


def _summary_row(group_label, stats):
    """Build one summary-table row (dict) from a per-politician statistics frame."""
    return {
        'Group': group_label,
        'N': stats['Name'].nunique(),
        'Average Score (SE)': _format_mean_se(stats['avg_score_per_politician']),
        'Word Count (SE)': _format_mean_se(stats['avg_word_count_per_politician']),
        'Party Alignment (SE)': _format_mean_se(stats['Party_Alignment']),
    }


# Creating the summary table: column name -> list with one entry per group
_base_rows = [
    _summary_row('Treatment', treatment_stats_corrected),
    _summary_row('Control', control_stats_corrected),
    _summary_row('Right Wing', right_wing_stats_corrected),
    _summary_row('Left Wing', left_wing_stats_corrected),
    _summary_row('USA', usa_stats_corrected),
    _summary_row('Canada', canada_stats_corrected),
    _summary_row('UK', uk_stats_corrected),
]
summary_table = {column: [row[column] for row in _base_rows] for column in _base_rows[0]}

# NOTE(review): `summary_df_corrected` was appended to without ever being created here.
# It is built from summary_table, which was otherwise unused - confirm.
summary_df_corrected = pd.DataFrame(summary_table)

# Extra rows: all politicians, and the pre-treatment period of treated politicians
all_politicians_row_corrected = _summary_row('All Politicians', all_politicians_stats_corrected)
pre_treatment_row_corrected = _summary_row('Pre Treatment', pre_treatment_per_politician_stats_corrected)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the extra rows instead.
# The table is rebuilt from scratch above, so there is no earlier "Pre Treatment" row to drop.
summary_df_corrected = pd.concat(
    [summary_df_corrected, pd.DataFrame([all_politicians_row_corrected, pre_treatment_row_corrected])],
    ignore_index=True,
)

# Display the final corrected summary table with all groups
print(summary_df_corrected)




# Function to convert the summary DataFrame to a LaTeX table
def dataframe_to_latex(df: DataFrame, filename: str = None):
    latex_table = df.to_latex(index=False, float_format="%.2f", column_format="|l|c|c|c|c|", header=True, escape=False)
    if filename:
        with open(filename, 'w') as f:
            f.write(latex_table)
    return latex_table

# Render the summary table as LaTeX and also write it to 'summary_statistics.tex'
latex_table = dataframe_to_latex(summary_df_corrected, 'summary_statistics.tex')

# Print the LaTeX table
print(latex_table)
# Save the split DataFrames to new CSV files
# NOTE(review): df_more_left, df_more_right and final_merged_df are not assigned anywhere
# in this part of the script - presumably created in an earlier cell; confirm they are in scope.
# NOTE(review): the last line writes to the same FinalDFForAnalysisWithALL.csv path that an
# earlier cell reads and rewrites - confirm this overwrite is intended.
df_more_left.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF_more_left.csv', index=False)
df_more_right.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\final_DF_more_right.csv', index=False)
final_merged_df.to_csv('C:\\Users\\Guill\\OneDrive\\Documents\\School\\Masters\\Sem 3\\FinalDFForAnalysisWithALL.csv', index=False)