In [1]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np

In [2]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('president_polls.csv')

# Display the first few rows of the DataFrame to verify the import
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct
0,88707,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,47.0
1,88707,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,general,False,False,,False,REP,Trump,16651,Donald Trump,49.0
2,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,50.0
3,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,REP,Trump,16651,Donald Trump,46.0
4,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,48.0


In [3]:
df_state_na = df[df['state'].isna()]

df_state_not_na = df[df['state'].notna()]

print(df_state_na.head())
print(df_state_not_na.columns)
print(df_state_not_na.head())

   poll_id  pollster_id pollster sponsor_ids   sponsors   display_name  \
2    88715          770     TIPP         NaN        NaN  TIPP Insights   
3    88715          770     TIPP         NaN        NaN  TIPP Insights   
4    88710          568   YouGov         352  Economist         YouGov   
5    88710          568   YouGov         352  Economist         YouGov   
6    88710          568   YouGov         352  Economist         YouGov   

   pollster_rating_id pollster_rating_name  numeric_grade  pollscore  ...  \
2                 144        TIPP Insights            1.8       -0.4  ...   
3                 144        TIPP Insights            1.8       -0.4  ...   
4                 391               YouGov            3.0       -1.1  ...   
5                 391               YouGov            3.0       -1.1  ...   
6                 391               YouGov            3.0       -1.1  ...   

     stage  nationwide_batch ranked_choice_reallocated ranked_choice_round  \
2  general    

Let's filter out non scored national pollsters 

In [4]:
df_state_na_clean = df_state_na[df_state_na['numeric_grade'].notna()]
print(df_state_na_clean['numeric_grade'])
print(df_state_na_clean['numeric_grade'].max())

2        1.8
3        1.8
4        3.0
5        3.0
6        3.0
        ... 
15627    2.8
15628    2.8
15629    2.8
15630    2.8
15631    2.8
Name: numeric_grade, Length: 6998, dtype: float64
3.0


Let's create a weight for numerically graded pollsters

In [5]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()

# Now safely create the 'weight_score' column
df_state_na_clean.loc[:, 'weight_score'] = df_state_na_clean['numeric_grade'] / 3.0


Let's check weight_grade column

In [6]:
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score
2,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,DEM,Harris,16661,Kamala Harris,50.0,0.6
3,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,REP,Trump,16651,Donald Trump,46.0,0.6
4,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,DEM,Harris,16661,Kamala Harris,48.0,1.0
5,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,REP,Trump,16651,Donald Trump,45.0,1.0
6,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,GRE,Stein,31116,Jill Stein,1.0,1.0


Let's go ahead and examine unique methodologies

In [7]:
print(df_state_na_clean['methodology'].unique())

['Online Panel' 'Live Phone/Text-to-Web' 'Probability Panel'
 'Online Panel/Text-to-Web' 'IVR/Text' 'Live Phone' nan
 'IVR/Online Panel/Text-to-Web' 'Live Phone/Online Panel/Text-to-Web'
 'Live Phone/Online Panel/App Panel' 'Live Phone/Online Panel/Text'
 'Live Phone/Probability Panel' 'Online Panel/Probability Panel' 'IVR'
 'Text-to-Web/Online Ad' 'Online Ad' 'Live Phone/Online Panel'
 'Live Phone/Text-to-Web/Online Ad' 'IVR/Text-to-Web'
 'Live Phone/Text/Online Panel' 'IVR/Online Panel' 'Text'
 'Online Panel/Online Ad' 'IVR/Online Panel/Email'
 'IVR/Live Phone/Text/Online Panel/Email' 'Live Phone/Text/Online Ad'
 'Online Panel/Text-to-Web/Text' 'Live Phone/Text-to-Web/App Panel'
 'App Panel' 'IVR/Online Panel/Text-to-Web/Email']


mapping different methodologies to weight_mode

In [8]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.82,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_na_clean.loc[:,'weight_mode'] = df_state_na_clean['methodology'].map(mode_weights)

Let's check out the 'weight_mode' column

In [9]:
print(df_state_na_clean.head())

   poll_id  pollster_id pollster sponsor_ids   sponsors   display_name  \
2    88715          770     TIPP         NaN        NaN  TIPP Insights   
3    88715          770     TIPP         NaN        NaN  TIPP Insights   
4    88710          568   YouGov         352  Economist         YouGov   
5    88710          568   YouGov         352  Economist         YouGov   
6    88710          568   YouGov         352  Economist         YouGov   

   pollster_rating_id pollster_rating_name  numeric_grade  pollscore  ...  \
2                 144        TIPP Insights            1.8       -0.4  ...   
3                 144        TIPP Insights            1.8       -0.4  ...   
4                 391               YouGov            3.0       -1.1  ...   
5                 391               YouGov            3.0       -1.1  ...   
6                 391               YouGov            3.0       -1.1  ...   

  ranked_choice_reallocated  ranked_choice_round hypothetical party  answer  \
2            

Let's create a weight for sample size, but first let's look for NaN in sample_size column

In [10]:
# Count the number of NaN values in the 'sample_size' column
nan_count = df['sample_size'].isna().sum()

print(f"Number of NaN values in 'sample_size': {nan_count}")

# Calculate the mean of the available (non-NaN) sample sizes
mean_sample_size = df['sample_size'].mean()

print(f"Mean of available sample sizes: {mean_sample_size}")


Number of NaN values in 'sample_size': 139
Mean of available sample sizes: 1613.878283317199


In [11]:
import numpy as np

In [12]:
# Step 2: Create the 'weight_sample' column
df_state_na_clean['weight_sample'] = df_state_na_clean['sample_size'].apply(lambda x: np.sqrt(x) if not np.isnan(x) else np.sqrt(mean_sample_size))

# Display the first few rows to verify
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,50.0,0.6,0.5,35.327043
3,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,35.327043
4,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,48.0,1.0,0.5,38.170669
5,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,REP,Trump,16651,Donald Trump,45.0,1.0,0.5,38.170669
6,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,GRE,Stein,31116,Jill Stein,1.0,1.0,0.5,38.170669


Sort end_date values in descending order

In [13]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_na_clean['end_date'] = pd.to_datetime(df_state_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_na_clean_sorted = df_state_na_clean.sort_values(by='end_date',ascending=False)

In [14]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,50.0,0.6,0.5,35.327043
4,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,48.0,1.0,0.5,38.170669
5,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,REP,Trump,16651,Donald Trump,45.0,1.0,0.5,38.170669
6,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,GRE,Stein,31116,Jill Stein,1.0,1.0,0.5,38.170669
7,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,IND,West,31097,Cornel West,1.0,1.0,0.5,38.170669


Let's create a 'days_past_index' that can be used for weight_time_decay value for moving average

In [15]:
# Step 3: Get the first (top) date after sorting
first_date = df_state_na_clean_sorted['end_date'].iloc[0]

# Step 4: Compute the difference in days and create the 'days_past_index' column
df_state_na_clean_sorted['days_past_index'] = (first_date - df_state_na_clean_sorted['end_date']).dt.days

In [16]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
2,88715,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,DEM,Harris,16661,Kamala Harris,50.0,0.6,0.5,35.327043,0
4,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,DEM,Harris,16661,Kamala Harris,48.0,1.0,0.5,38.170669,0
5,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,REP,Trump,16651,Donald Trump,45.0,1.0,0.5,38.170669,0
6,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,GRE,Stein,31116,Jill Stein,1.0,1.0,0.5,38.170669,0
7,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,IND,West,31097,Cornel West,1.0,1.0,0.5,38.170669,0


In [17]:
df_state_na_clean_sorted.tail()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
15627,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Cruz,16641,Ted Cruz,24.0,0.933333,0.65,33.24154,1278
15628,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,41.0,0.933333,0.65,33.256578,1278
15629,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,DeSantis,16646,Ron DeSantis,25.0,0.933333,0.65,33.256578,1278
15630,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,44.0,0.933333,0.65,33.27161,1278
15631,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Haley,16640,Nikki Haley,19.0,0.933333,0.65,33.27161,1278


In [18]:
# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()

# Check for NaN values in 'weight_mode', 'weight_sample', and 'weight_score'
nan_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isna().sum()

# Step 2: Check for infinite values in the involved columns
inf_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isin([np.inf, -np.inf]).sum()
print(f"Number of infinite values:\n{inf_check}")


# Print the result to verify if there are any NaN values
print(nan_check)

# Step 1: Filter rows where 'weight_mode' is NaN
nan_weight_mode = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['weight_mode'].isna()]

# Step 2: Display the 'methodology' or other relevant columns to investigate the methodology used
# For example, we'll check 'pollster', 'sponsors', and 'methodology' (if available) along with 'weight_mode'
nan_weight_mode_info = nan_weight_mode[['pollster', 'sponsors', 'methodology', 'weight_mode']]

# Print the resulting DataFrame for verification
print(nan_weight_mode_info)

Number of infinite values:
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
Empty DataFrame
Columns: [pollster, sponsors, methodology, weight_mode]
Index: []


Let's compute a weight_i where weight_i is for a given 'end_date' the determined index weight which determines for a given end_date the weight assigned to a row used in computing the point average_i. Note: we will use the sorted dates to filter dates that only on the date or before, then we will compute the 'weight_time' using the formula: exp(-lambdat) where lambda = 1.0 and t = days elapsed since the beginning of the poll end date.
finally we can compute the weight_i = 'weight_mode''weight_sample'*weight_score'*weight_time'

In [19]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 1.0
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_lv = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['population']=='lv'].copy()

# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        #  We select H2H if avaible and only use non H2H if H2H is not available
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()
        
        # Step 7: Filter data for the specific candidate
        
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        #Start House effect data computation#
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < 40) & 
                                            (candidate_data['days_past_index'] > 1)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()

Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-15,16661,Kamala Harris,49.658663
1,2024-10-15,16651,Donald Trump,45.857155
2,2024-10-14,16661,Kamala Harris,49.594931
3,2024-10-14,16651,Donald Trump,46.482702
4,2024-10-13,16661,Kamala Harris,49.483571


Let's view the plot with weight average:

In [20]:
# Create the line chart with points
# Create a customized color encoding for Trump and Harris
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Create the line chart with points, assigning specific colors to each candidate
chart = alt.Chart(df_weighted_averages).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate_name:N', scale=color_scale),  # Custom color scale for candidates
    tooltip=['end_date', 'candidate_name', 'weighted_average_pct']  # Add tooltips to show details
).properties(
    title='Weighted Average Polling Results Over Time',
    width=600,
    height=400
)

chart.show()


In [21]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q']  # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

Let's look at the loess smoothed curve of this

In [22]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


Let's look at A/B graded pollsters

In [23]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 0.23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_ab_pollsters = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['numeric_grade'] >= 2.4].copy()
df_lv = df_ab_pollsters[df_ab_pollsters['population'] == 'lv'].copy()
# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        #  We select H2H if avaible and only use non H2H if H2H is not available
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Staep 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy() 
        
        # Step 7: Filter data for the specific candidate
        
        c_mean = candidate_data[candidate_data['days_past_index']< 40]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 40]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )

        #Start House effect data computation#
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < 40) & 
                                            (candidate_data['days_past_index'] > 1)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        ## finish house effect weighting  ##   
        
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-15,16661,Kamala Harris,49.080595
1,2024-10-15,16651,Donald Trump,46.340183
2,2024-10-13,16661,Kamala Harris,49.258202
3,2024-10-13,16651,Donald Trump,46.621008
4,2024-10-11,16661,Kamala Harris,49.589798


In [24]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

In [25]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


### Moving on to Battleground state data

Let's clean 'numeric_grade' rows for df_state_not_na so that there aren't ungraded pollsters in our list

In [26]:
#df_state_not_na_clean = df_state_not_na[df_state_not_na['numeric_grade'].notna()]

In [27]:
#na_num = df_state_not_na_clean['numeric_grade'].isna().sum()
#print(na_num)

Let's normalize 'numeric_grade' to create a 'weight_score'

In [28]:
# Make a copy to avoid the warning
df_state_not_na = df_state_not_na.copy()

# Step 1: Replace NaN values in 'numeric_grade' with a low grade (e.g., 1.0)
df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)

df_state_not_na_clean = df_state_not_na.copy()
# Step 2: Safely create the 'weight_score' column by dividing 'numeric_grade' by 3.0
df_state_not_na_clean.loc[:, 'weight_score'] = df_state_not_na_clean['numeric_grade'] / 3.0

# Verify the updated DataFrame
print(df_state_not_na_clean[['numeric_grade', 'weight_score']].head())

    numeric_grade  weight_score
0             2.0      0.666667
1             2.0      0.666667
14            1.1      0.366667
15            1.1      0.366667
16            2.9      0.966667


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)


In [29]:
na_num = df_state_not_na_clean['weight_score'].isna().sum()
print(na_num)

0


#### Creating 'weight_mode' weight for methodology

In [30]:
import numpy as np
# Make a copy to avoid the warning
df_state_not_na_clean = df_state_not_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    "Text-to-Web/Email": 0.75,
    "IVR/Live Phone/Text-to-Web": 0.76,
    "IVR/Live Phone/Online Panel": 0.78,
    "IVR/Live Phone/Online Panel/Text-to-Web": 0.77,
    "Live Phone/Text-to-Web/Email/Mail-to-Web": 0.76,
    "Live Phone/Text-to-Web/Email": 0.78,
    "Email/Online Ad": 0.73,
    "Online Panel/Email": 0.78,
    "Live Phone/Online Panel/Mail-to-Web": 0.78,
    "IVR/Text-to-Web/Email": 0.72,
    'Email':0.8,
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.85,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'Live Phone/Text-to-Web/Email/Mail-to-Web/Mail-to-Phone':0.76,
    'Live Phone/Email':0.82,
    'Live Phone/Online Panel/Text-to-Web/Text':0.8,
    'Live Phone/Text':0.83,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Live Phone/Text':0.78,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Mail-to-Web/Mail-to-Phone': 0.7,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Email/Text-to-Web':0.77,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Text':0.78,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text-to-Web':0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_not_na_clean.loc[:,'weight_mode'] = df_state_not_na_clean['methodology'].map(mode_weights)

In [31]:
num_na = df_state_not_na_clean['weight_mode'].isna().sum()
print(num_na)
df_ret = df_state_not_na_clean[df_state_not_na_clean['weight_mode'].isna()][['methodology']]
print(df_ret['methodology'].unique())

0
[]


#### Create a Weight_Sample weight for sample size

In [32]:
# Step 2: Create the 'weight_sample' column
df_state_not_na_clean['weight_sample'] = df_state_not_na_clean['sample_size'].apply(lambda x: np.sqrt(x/600) if not np.isnan(x) else np.sqrt(mean_sample_size/600))

# Display the first few rows to verify
df_state_not_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88707,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,DEM,Harris,16661,Kamala Harris,47.0,0.666667,0.5,1.154701
1,88707,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,REP,Trump,16651,Donald Trump,49.0,0.666667,0.5,1.154701
14,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.366667,0.7,1.156864
15,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,REP,Trump,16651,Donald Trump,50.0,0.366667,0.7,1.156864
16,88721,1102,Emerson,9601656.0,The Hill | Inside California Politics,Emerson College,88,Emerson College,2.9,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,61.0,0.966667,0.7,1.290994


In [33]:
num_na = df_state_not_na_clean['weight_sample'].isna().sum()
print(num_na)

0


####  Converting 'end_date' to datetime format and sorting dataframe by end_date

In [34]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_not_na_clean['end_date'] = pd.to_datetime(df_state_not_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_not_na_clean_sorted = df_state_not_na_clean.sort_values(by='end_date',ascending=False)

In [35]:
df_state_not_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88707,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,DEM,Harris,16661,Kamala Harris,47.0,0.666667,0.5,1.154701
1,88707,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,REP,Trump,16651,Donald Trump,49.0,0.666667,0.5,1.154701
14,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.366667,0.7,1.156864
15,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,REP,Trump,16651,Donald Trump,50.0,0.366667,0.7,1.156864
16,88721,1102,Emerson,9601656.0,The Hill | Inside California Politics,Emerson College,88,Emerson College,2.9,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,61.0,0.966667,0.7,1.290994


#### Let's compute battleground state PA

In [111]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Pennsylvania"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Pennsylvania")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Pennsylvania


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,48.680407,0.05,19
1,2024-10-10,16651,Donald Trump,47.542245,0.05,19
2,2024-10-09,16661,Kamala Harris,48.733529,0.05,19
3,2024-10-09,16651,Donald Trump,47.430235,0.05,19
4,2024-10-08,16661,Kamala Harris,48.756075,0.05,19
5,2024-10-08,16651,Donald Trump,47.347468,0.05,19
6,2024-10-07,16661,Kamala Harris,48.768355,0.05,19
7,2024-10-07,16651,Donald Trump,47.250791,0.05,19
8,2024-10-02,16661,Kamala Harris,48.669073,0.05,19
9,2024-10-02,16651,Donald Trump,47.128705,0.05,19


In [112]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Pennsylvania')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Pennsylvania)"
)

# Display the final chart
final_chart.display()


Pennsylvania
     end_date candidate_name  weighted_average_pct
71 2024-10-10  Kamala Harris             48.577746
70 2024-10-10   Donald Trump             47.824826
69 2024-10-09  Kamala Harris             48.714224
68 2024-10-09   Donald Trump             47.673876
67 2024-10-08  Kamala Harris             48.804074
66 2024-10-08   Donald Trump             47.571861
65 2024-10-07  Kamala Harris             48.848870
64 2024-10-07   Donald Trump             47.472110
63 2024-10-02  Kamala Harris             48.872918
62 2024-10-02   Donald Trump             47.293286


#### Kamala Harris and Trump Forecast in PA using Lowess curve fit data

In [113]:
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np
import datetime

In [114]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.13423213653994825
Final forecast for Kamala Harris on Nov 5, 2024: 48.60%


RMSE for Donald Trump: 0.14133083271733343
Final forecast for Donald Trump on Nov 5, 2024: 48.74%

Final Predictions for November 5, 2024:
Kamala Harris: 48.60%
Donald Trump: 48.74%


#### Summary Stats of Pollster House Effects in State

In [40]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
house_effect_summary = he_agg.groupby(['pollster','candidate_name'])['house_effect'].describe()

# Display the summary statistics
print(house_effect_summary)

                                                          count      mean  \
pollster                                  candidate_name                    
AtlasIntel                                Donald Trump      7.0  2.228195   
                                          Kamala Harris     7.0 -0.497446   
Beacon/Shaw                               Donald Trump      8.0  0.679348   
                                          Kamala Harris     8.0 -0.568406   
CNN/SSRS                                  Donald Trump     20.0 -0.049857   
...                                                         ...       ...   
University of Massachusetts Lowell/YouGov Kamala Harris    11.0 -0.517998   
Wick                                      Donald Trump     20.0  0.287643   
                                          Kamala Harris    20.0  0.002993   
YouGov                                    Donald Trump     23.0  0.838573   
                                          Kamala Harris    23.0  0.457240   

#### Time Series of Pollster House Effects in State

In [41]:
import altair as alt

# Define selection for highlighting specific pollster
highlight = alt.selection_multi(fields=['pollster'], bind='legend')

# Manually map shapes for the candidates ('circle' for Kamala Harris and 'square' for Donald Trump)
shape_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['circle', 'square'])

# Scatter plot for house effect data with shape highlight based on candidate_name (restricted to circle and square)
scatter = alt.Chart(he_agg).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('house_effect:Q', title='House Effect'),
    color=alt.condition(highlight, 'pollster:N', alt.value('lightgray'), legend=alt.Legend(title="Pollster")),  # Color for pollster with legend
    shape=alt.condition(
        highlight, 
        alt.Shape('candidate_name:N', scale=shape_scale),  # Shape encoding based on candidate
        alt.value('circle')  # Default shape for non-selected points
    ),
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3)),  # Reduce opacity for non-selected points
    tooltip=['pollster', 'candidate_name', 'end_date', 'house_effect']
).properties(
    width=800,
    height=600
).add_selection(
    highlight
)

# LOESS smoothing for each pollster (groupby pollster)
loess = alt.Chart(he_agg).transform_loess(
    'end_date', 'house_effect', groupby=['pollster']
).mark_line(size=3).encode(
    x='end_date:T',
    y='house_effect:Q',
    color=alt.condition(highlight, 'pollster:N', alt.value('lightgray')),  # Color highlighting for LOESS curve
    size=alt.condition(highlight, alt.value(3), alt.value(1)),  # Thicker line for selected pollsters
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3))  # Reduce opacity for non-selected lines
)

# Layering the scatter plot and LOESS smoothing curve
final_chart = alt.layer(scatter, loess).properties(
    title="House Effect Data with LOESS Smoothing (Highlight Pollster)"
)

# Display the final chart
final_chart.display()


  highlight = alt.selection_multi(fields=['pollster'], bind='legend')
  ).add_selection(


In [115]:
import altair as alt

# Group by 'end_date' and 'pollster' and calculate the mean of 'house_effect'
grouped_data = he_agg.groupby(['end_date', 'pollster']).agg(
    avg_house_effect=('house_effect', 'mean')
).reset_index()

# Define selection for pollster highlighting
highlight = alt.selection_multi(fields=['pollster'], bind='legend')

# Create a scatter plot for the average house effect, color-coded by 'pollster'
scatter = alt.Chart(grouped_data).mark_circle(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('avg_house_effect:Q', title='Average House Effect'),
    color=alt.condition(
        highlight,  # Highlight selected pollsters
        'pollster:N',  # Color based on pollster when selected
        alt.value('lightgray')  # Gray out when not selected
    ),
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3)),  # Opacity for non-selected points
    tooltip=['end_date:T', 'avg_house_effect:Q', 'pollster:N']
).properties(
    width=600,
    height=400,
    title="Average House Effect by End Date, Color-coded by Pollster"
).add_selection(
    highlight  # Add the selection to the chart
)

# LOESS smoothing for each pollster (groupby pollster)
loess = alt.Chart(grouped_data).transform_loess(
    'end_date', 'avg_house_effect', groupby=['pollster']
).mark_line().encode(
    x='end_date:T',
    y='avg_house_effect:Q',
    color=alt.condition(
        highlight,  # Highlight LOESS lines for selected pollsters
        'pollster:N',  # Color based on pollster when selected
        alt.value('lightgray')  # Gray out when not selected
    ),
    size=alt.condition(highlight, alt.value(2), alt.value(1)),  # Thicker line for selected pollsters
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3))  # Opacity for non-selected lines
)

# Layering the scatter plot and LOESS smoothing lines
final_chart = alt.layer(scatter, loess)

# Display the final chart
final_chart.display()


  highlight = alt.selection_multi(fields=['pollster'], bind='legend')
  ).add_selection(


#### Looking at the Distributions of Pollster House Effects in State

In [116]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Let's compute battleground state Michigan

In [117]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Michigan"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Michigan")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Michigan


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-11,16661,Kamala Harris,48.662302,0.05,19
1,2024-10-11,16651,Donald Trump,46.822191,0.05,19
2,2024-10-09,16661,Kamala Harris,48.708596,0.05,19
3,2024-10-09,16651,Donald Trump,46.799252,0.05,19
4,2024-10-08,16661,Kamala Harris,48.69091,0.05,19
5,2024-10-08,16651,Donald Trump,46.671841,0.05,19
6,2024-10-07,16661,Kamala Harris,48.736914,0.05,19
7,2024-10-07,16651,Donald Trump,46.282367,0.05,19
8,2024-10-04,16661,Kamala Harris,48.741036,0.05,19
9,2024-10-04,16651,Donald Trump,46.537835,0.05,19


In [118]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Michigan')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Michigan)"
)

# Display the final chart
final_chart.display()


Michigan
     end_date candidate_name  weighted_average_pct
63 2024-10-11  Kamala Harris             48.268332
62 2024-10-11   Donald Trump             47.409327
61 2024-10-09  Kamala Harris             48.351362
60 2024-10-09   Donald Trump             47.373728
59 2024-10-08  Kamala Harris             48.352114
58 2024-10-08   Donald Trump             47.283145
57 2024-10-07  Kamala Harris             48.317087
56 2024-10-07   Donald Trump             46.863011
55 2024-10-04  Kamala Harris             48.673651
54 2024-10-04   Donald Trump             46.829732


#### Kamala Harris and Trump Forecast in MI using Lowess curve fit data

In [119]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.10884521452762408
Final forecast for Kamala Harris on Nov 5, 2024: 47.65%


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Donald Trump: 0.08413072783759809
Final forecast for Donald Trump on Nov 5, 2024: 49.64%

Final Predictions for November 5, 2024:
Kamala Harris: 47.65%
Donald Trump: 49.64%


#### Boxplots of House Effects distributions by Pollster and Candidate

In [120]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Wisconsin

In [121]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Wisconsin"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Wisconsin")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Wisconsin


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-09,16661,Kamala Harris,49.2973,0.05,19
1,2024-10-09,16651,Donald Trump,47.342684,0.05,19
2,2024-10-08,16661,Kamala Harris,49.30825,0.05,19
3,2024-10-08,16651,Donald Trump,47.294993,0.05,19
4,2024-10-07,16661,Kamala Harris,49.52224,0.05,19
5,2024-10-07,16651,Donald Trump,47.192636,0.05,19
6,2024-10-02,16661,Kamala Harris,49.335265,0.05,19
7,2024-10-02,16651,Donald Trump,47.127874,0.05,19
8,2024-09-26,16661,Kamala Harris,49.311896,0.05,19
9,2024-09-26,16651,Donald Trump,47.153297,0.05,19


In [122]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Wisconsin')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Wisconsin)"
)

# Display the final chart
final_chart.display()


Wisconsin
     end_date candidate_name  weighted_average_pct
61 2024-10-09  Kamala Harris             48.642760
60 2024-10-09   Donald Trump             47.587944
59 2024-10-08  Kamala Harris             48.634857
58 2024-10-08   Donald Trump             47.501809
57 2024-10-07  Kamala Harris             49.366624
56 2024-10-07   Donald Trump             47.246931
55 2024-10-02  Kamala Harris             49.291256
54 2024-10-02   Donald Trump             47.446596
53 2024-09-26  Kamala Harris             49.295157
52 2024-09-26   Donald Trump             47.541161


#### Kamala Harris and Trump Forecast in WI using Lowess curve fit data

In [124]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.1269549460532577
Final forecast for Kamala Harris on Nov 5, 2024: 48.43%


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Donald Trump: 0.14622914818080915
Final forecast for Donald Trump on Nov 5, 2024: 47.56%

Final Predictions for November 5, 2024:
Kamala Harris: 48.43%
Donald Trump: 47.56%


#### Boxplots of House Effects distributions by Pollster and Candidate

In [125]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground North Carolina

In [126]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="North Carolina"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("North Carolina")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


North Carolina


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-08,16661,Kamala Harris,47.87658,0.05,19
1,2024-10-08,16651,Donald Trump,48.08238,0.05,19
2,2024-10-02,16661,Kamala Harris,47.789633,0.05,19
3,2024-10-02,16651,Donald Trump,48.01072,0.05,19
4,2024-09-30,16661,Kamala Harris,47.772055,0.05,19
5,2024-09-30,16651,Donald Trump,48.0164,0.05,19
6,2024-09-29,16661,Kamala Harris,47.771597,0.05,19
7,2024-09-29,16651,Donald Trump,47.97755,0.05,19
8,2024-09-28,16661,Kamala Harris,47.621571,0.05,19
9,2024-09-28,16651,Donald Trump,47.796243,0.05,19


In [127]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('North Carolina')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (North Carolina)"
)

# Display the final chart
final_chart.display()


North Carolina
     end_date candidate_name  weighted_average_pct
57 2024-10-08  Kamala Harris             47.887288
56 2024-10-08   Donald Trump             48.436533
55 2024-10-02  Kamala Harris             47.941329
54 2024-10-02   Donald Trump             48.343505
53 2024-09-30  Kamala Harris             47.929051
52 2024-09-30   Donald Trump             48.338675
51 2024-09-29  Kamala Harris             47.910605
50 2024-09-29   Donald Trump             48.304684
49 2024-09-28  Kamala Harris             47.709041
48 2024-09-28   Donald Trump             48.051470


In [128]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.3765765769950478
Final forecast for Kamala Harris on Nov 5, 2024: 48.32%


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Donald Trump: 0.069397419151641
Final forecast for Donald Trump on Nov 5, 2024: 48.87%

Final Predictions for November 5, 2024:
Kamala Harris: 48.32%
Donald Trump: 48.87%


#### Boxplots of House Effects distributions by Pollster and Candidate

In [129]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Georgia

In [130]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Georgia"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Georgia")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Georgia


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,47.364244,0.05,19
1,2024-10-10,16651,Donald Trump,48.705652,0.05,19
2,2024-10-09,16661,Kamala Harris,47.352226,0.05,19
3,2024-10-09,16651,Donald Trump,48.68845,0.05,19
4,2024-10-08,16661,Kamala Harris,47.357943,0.05,19
5,2024-10-08,16651,Donald Trump,48.648834,0.05,19
6,2024-10-02,16661,Kamala Harris,47.408426,0.05,19
7,2024-10-02,16651,Donald Trump,48.567148,0.05,19
8,2024-09-30,16661,Kamala Harris,47.315322,0.05,19
9,2024-09-30,16651,Donald Trump,48.532829,0.05,19


In [131]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Georgia')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Georgia)"
)

# Display the final chart
final_chart.display()


Georgia
     end_date candidate_name  weighted_average_pct
45 2024-10-10  Kamala Harris             47.528655
44 2024-10-10   Donald Trump             48.750841
43 2024-10-09  Kamala Harris             47.456305
42 2024-10-09   Donald Trump             48.763781
41 2024-10-08  Kamala Harris             47.489456
40 2024-10-08   Donald Trump             48.721862
39 2024-10-02  Kamala Harris             47.659477
38 2024-10-02   Donald Trump             48.742285
37 2024-09-30  Kamala Harris             47.513952
36 2024-09-30   Donald Trump             48.749361


In [132]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Kamala Harris: 0.20343359194991661
Final forecast for Kamala Harris on Nov 5, 2024: 47.44%


  return get_prediction_index(
  return get_prediction_index(


RMSE for Donald Trump: 0.12654367320855564
Final forecast for Donald Trump on Nov 5, 2024: 49.07%

Final Predictions for November 5, 2024:
Kamala Harris: 47.44%
Donald Trump: 49.07%


 #### Boxplots of House Effects distributions by Pollster and Candidate

In [133]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Florida

In [134]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Florida"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Florida")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Florida


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-07,16661,Kamala Harris,45.078443,0.05,19
1,2024-10-07,16651,Donald Trump,49.700308,0.05,19
2,2024-10-06,16661,Kamala Harris,45.64134,0.05,19
3,2024-10-06,16651,Donald Trump,49.136169,0.05,19
4,2024-10-04,16661,Kamala Harris,45.855782,0.05,19
5,2024-10-04,16651,Donald Trump,49.134564,0.05,19
6,2024-10-02,16661,Kamala Harris,45.924747,0.05,19
7,2024-10-02,16651,Donald Trump,49.182814,0.05,19
8,2024-09-27,16661,Kamala Harris,46.049623,0.05,19
9,2024-09-27,16651,Donald Trump,49.624802,0.05,19


In [135]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Florida')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Florida)"
)

# Display the final chart
final_chart.display()


Florida
     end_date candidate_name  weighted_average_pct
31 2024-10-07  Kamala Harris             44.511975
30 2024-10-07   Donald Trump             50.261300
29 2024-10-06  Kamala Harris             45.221142
28 2024-10-06   Donald Trump             49.039367
27 2024-10-04  Kamala Harris             45.726443
26 2024-10-04   Donald Trump             48.955154
25 2024-10-02  Kamala Harris             45.885350
24 2024-10-02   Donald Trump             48.869957
23 2024-09-27  Kamala Harris             45.851333
22 2024-09-27   Donald Trump             49.373806


In [136]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.8337841798995669
Final forecast for Kamala Harris on Nov 5, 2024: 44.31%


RMSE for Donald Trump: 0.6316003877505396
Final forecast for Donald Trump on Nov 5, 2024: 50.84%

Final Predictions for November 5, 2024:
Kamala Harris: 44.31%
Donald Trump: 50.84%


#### Boxplots of House Effects distributions by Pollster and Candidate

In [137]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Arizona

In [138]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Arizona"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Arizona")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Arizona


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,47.246322,0.05,19
1,2024-10-10,16651,Donald Trump,48.64234,0.05,19
2,2024-10-08,16661,Kamala Harris,47.107035,0.05,19
3,2024-10-08,16651,Donald Trump,48.323676,0.05,19
4,2024-10-02,16661,Kamala Harris,47.045373,0.05,19
5,2024-10-02,16651,Donald Trump,48.328567,0.05,19
6,2024-10-01,16661,Kamala Harris,47.049392,0.05,19
7,2024-10-01,16651,Donald Trump,48.249838,0.05,19
8,2024-09-30,16661,Kamala Harris,47.006898,0.05,19
9,2024-09-30,16651,Donald Trump,48.195565,0.05,19


In [139]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Arizona')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Arizona)"
)

# Display the final chart
final_chart.display()


Arizona
     end_date candidate_name  weighted_average_pct
49 2024-10-10  Kamala Harris             47.266643
48 2024-10-10   Donald Trump             49.098468
47 2024-10-08  Kamala Harris             47.203755
46 2024-10-08   Donald Trump             48.522890
45 2024-10-02  Kamala Harris             47.183817
44 2024-10-02   Donald Trump             48.600524
43 2024-10-01  Kamala Harris             47.165926
42 2024-10-01   Donald Trump             48.540367
41 2024-09-30  Kamala Harris             47.040026
40 2024-09-30   Donald Trump             48.524227


In [140]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.24388275845201937
Final forecast for Kamala Harris on Nov 5, 2024: 47.35%


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Donald Trump: 0.21647663053056826
Final forecast for Donald Trump on Nov 5, 2024: 49.09%

Final Predictions for November 5, 2024:
Kamala Harris: 47.35%
Donald Trump: 49.09%


#### Boxplots of House Effects distributions by Pollster and Candidate

In [141]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Nevada

In [143]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Nevada"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Nevada")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Nevada


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-08,16661,Kamala Harris,48.70579,0.05,19
1,2024-10-08,16651,Donald Trump,47.03691,0.05,19
2,2024-10-03,16661,Kamala Harris,48.286145,0.05,19
3,2024-10-03,16651,Donald Trump,46.977196,0.05,19
4,2024-10-02,16661,Kamala Harris,48.197511,0.05,19
5,2024-10-02,16651,Donald Trump,46.964725,0.05,19
6,2024-09-30,16661,Kamala Harris,48.053336,0.05,19
7,2024-09-30,16651,Donald Trump,46.844673,0.05,19
8,2024-09-25,16661,Kamala Harris,47.505665,0.05,19
9,2024-09-25,16651,Donald Trump,46.878253,0.05,19


In [144]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Nevada')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Nevada)"
)

# Display the final chart
final_chart.display()


Nevada
     end_date candidate_name  weighted_average_pct
35 2024-10-08  Kamala Harris             48.895325
34 2024-10-08   Donald Trump             47.509732
33 2024-10-03  Kamala Harris             48.929286
32 2024-10-03   Donald Trump             47.097834
31 2024-10-02  Kamala Harris             48.852967
30 2024-10-02   Donald Trump             47.082193
29 2024-09-30  Kamala Harris             48.636497
28 2024-09-30   Donald Trump             46.855351
27 2024-09-25  Kamala Harris             48.080426
26 2024-09-25   Donald Trump             46.823962


In [145]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df_aggregated is already defined
# Make sure 'end_date' is in datetime format
df_aggregated['end_date'] = pd.to_datetime(df_aggregated['end_date'])

# Define a new dataframe to store the LOESS smoothed data
loess_data = pd.DataFrame()

# Perform LOESS smoothing for each candidate
for candidate in df_aggregated['candidate_name'].unique():
    # Filter the data for each candidate
    candidate_df = df_aggregated[df_aggregated['candidate_name'] == candidate]
    
    # Perform LOESS smoothing (use frac to control the smoothing level)
    loess_smoothed = lowess(candidate_df['weighted_average_pct'], 
                            candidate_df['end_date'].apply(lambda x: x.timestamp()), 
                            frac=0.3)  # Adjust frac as needed
    
    # Create a dataframe from the smoothed data
    smoothed_df = pd.DataFrame({
        'end_date': pd.to_datetime(loess_smoothed[:, 0], unit='s'),
        'weighted_average_pct': loess_smoothed[:, 1],
        'candidate_name': candidate
    })
    
    # Append to the main loess_data dataframe
    loess_data = pd.concat([loess_data, smoothed_df], ignore_index=True)

# Now let's fit an ARIMA model using the LOESS smoothed data
candidate_forecasts = {}
# Store the final prediction for each candidate
final_predictions = {}
# Forecast up to November 5, 2024
forecast_end_date = pd.Timestamp('2024-11-05')

# Loop through each candidate and apply ARIMA
for candidate in loess_data['candidate_name'].unique():
    # Filter the LOESS smoothed data for this candidate
    candidate_loess_df = loess_data[loess_data['candidate_name'] == candidate].sort_values(by='end_date')
    
    # Set 'end_date' as the index for ARIMA (must be a time series index)
    candidate_loess_df.set_index('end_date', inplace=True)

    # Calculate the number of days to forecast (from the last date to Nov 5, 2024)
    last_date = candidate_loess_df.index[-1]
    days_to_forecast = (forecast_end_date - last_date).days
    
    # Fit ARIMA model (you can tune the order=(p, d, q))
    model = ARIMA(candidate_loess_df['weighted_average_pct'], order=(1, 1, 1))
    model_fit = model.fit()

    # In-sample forecast (fitted values) and out-of-sample forecast
    candidate_loess_df['fitted'] = model_fit.fittedvalues
    
    # Forecast up to November 5, 2024
    forecast = model_fit.forecast(steps=days_to_forecast)
    
    # Store the forecasted results for this candidate
    candidate_forecasts[candidate] = forecast
    
    # Generate forecast dates for visualization
    forecast_dates = pd.date_range(candidate_loess_df.index[-1], periods=days_to_forecast + 1, freq='D')[1:]
    
    forecast_df = pd.DataFrame({
        'end_date': forecast_dates,
        'forecast': forecast,
        'candidate_name': candidate
    })
    
    # Reset index for easier plotting
    candidate_loess_df = candidate_loess_df.reset_index()

    # Altair plot for the actual, smoothed, and forecast data
    loess_chart = alt.Chart(candidate_loess_df).mark_line(color='blue', size=3).encode(
        x='end_date:T',
        y='weighted_average_pct:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Date'),
                 alt.Tooltip('weighted_average_pct:Q', title='Smoothed Data')]
    ).properties(title=f'ARIMA Forecast on LOESS Smoothed Data for {candidate}')

    fitted_chart = alt.Chart(candidate_loess_df).mark_line(color='orange', size=2).encode(
        x='end_date:T',
        y='fitted:Q',
        tooltip=['fitted:Q']
    )

    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2).encode(
        x='end_date:T',
        y='forecast:Q',
        tooltip=[alt.Tooltip('end_date:T', title='Forecast Date'),
                 alt.Tooltip('forecast:Q', title='Forecasted Value')]
    )
    
    # Combine the charts and make the graph larger
    combined_chart = (loess_chart + fitted_chart + forecast_chart).properties(
        width=800,  # Set larger width
        height=400  # Set larger height
    )
    
    # Display the chart
    combined_chart.display()

    # Optionally, calculate the RMSE of the model
    rmse = np.sqrt(mean_squared_error(candidate_loess_df['weighted_average_pct'][1:], candidate_loess_df['fitted'][1:]))
    print(f"RMSE for {candidate}: {rmse}")
    # Extract the forecasted value for Nov 5, 2024
    nov_5_prediction = forecast_df[forecast_df['end_date'] == forecast_end_date]['forecast'].values[0]
    
    # Store the prediction in the final_predictions dictionary
    final_predictions[candidate] = nov_5_prediction

    # Print the final prediction for the candidate
    print(f"Final forecast for {candidate} on Nov 5, 2024: {nov_5_prediction:.2f}%")
    
# Optionally, print the final predictions for both candidates in a clear format
print("\nFinal Predictions for November 5, 2024:")
for candidate, prediction in final_predictions.items():
    print(f"{candidate}: {prediction:.2f}%")    


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Kamala Harris: 0.9119018837141989
Final forecast for Kamala Harris on Nov 5, 2024: 48.89%


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


RMSE for Donald Trump: 1.20427087892464
Final forecast for Donald Trump on Nov 5, 2024: 49.59%

Final Predictions for November 5, 2024:
Kamala Harris: 48.89%
Donald Trump: 49.59%


#### Boxplots of House Effects distributions by Pollster and Candidate

In [146]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Minnesota

In [147]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Minnesota"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Minnesota")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Minnesota


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-02,16661,Kamala Harris,49.521907,0.05,19
1,2024-10-02,16651,Donald Trump,43.566394,0.05,19
2,2024-09-26,16661,Kamala Harris,49.511616,0.05,19
3,2024-09-26,16651,Donald Trump,43.755208,0.05,19
4,2024-09-19,16661,Kamala Harris,49.441724,0.05,19
5,2024-09-19,16651,Donald Trump,43.622292,0.05,19
6,2024-09-18,16661,Kamala Harris,49.625529,0.05,19
7,2024-09-18,16651,Donald Trump,44.178446,0.05,19
8,2024-09-09,16661,Kamala Harris,49.567371,0.05,19
9,2024-09-09,16651,Donald Trump,43.194794,0.05,19


In [148]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Minnesota')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Minnesota)"
)

# Display the final chart
final_chart.display()


Minnesota
     end_date candidate_name  weighted_average_pct
21 2024-10-02  Kamala Harris             49.736945
20 2024-10-02   Donald Trump             43.355222
19 2024-09-26  Kamala Harris             49.509328
18 2024-09-26   Donald Trump             43.595756
17 2024-09-19  Kamala Harris             49.442725
16 2024-09-19   Donald Trump             43.680429
15 2024-09-18  Kamala Harris             49.482205
14 2024-09-18   Donald Trump             44.089791
12 2024-09-09   Donald Trump             44.011773
13 2024-09-09  Kamala Harris             49.825883


In [149]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Texas

In [150]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Texas"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Texas")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Texas


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,45.082197,0.05,19
1,2024-10-10,16651,Donald Trump,51.165415,0.05,19
2,2024-10-07,16661,Kamala Harris,45.067886,0.05,19
3,2024-10-07,16651,Donald Trump,51.032052,0.05,19
4,2024-10-04,16661,Kamala Harris,44.844049,0.05,19
5,2024-10-04,16651,Donald Trump,51.143918,0.05,19
6,2024-09-27,16661,Kamala Harris,44.64694,0.05,19
7,2024-09-27,16651,Donald Trump,51.031569,0.05,19
8,2024-09-24,16661,Kamala Harris,44.351701,0.05,19
9,2024-09-24,16651,Donald Trump,50.86565,0.05,19


In [151]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Texas')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Texas)"
)

# Display the final chart
final_chart.display()


Texas
     end_date candidate_name  weighted_average_pct
19 2024-10-10  Kamala Harris             44.736970
18 2024-10-10   Donald Trump             50.982819
17 2024-10-07  Kamala Harris             44.540655
16 2024-10-07   Donald Trump             50.832577
15 2024-10-04  Kamala Harris             44.827373
14 2024-10-04   Donald Trump             51.050338
13 2024-09-27  Kamala Harris             44.618021
12 2024-09-27   Donald Trump             51.298134
11 2024-09-24  Kamala Harris             44.479457
10 2024-09-24   Donald Trump             51.163818


In [152]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Virginia

In [153]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Virginia"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
                
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
    
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_2 = group[group['count'] == 2]
                    
                    if not question_ids_with_count_2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()
        
                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                #c_mean2data = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]
                #c_mean2 = np.average(c_mean2data['pct'], weights=c_mean2data['w_i'])
                #c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                ##c_mean_by_pollster = candidate_data.groupby('pollster_id')['pct'].mean()
                
                    # Create an empty dictionary to store the results
                pollster_means = {}
            
                # Group the data by 'pollster_id'
                grouped = candidate_data.groupby('pollster_id')
            
                # Loop through each group (i.e., each pollster)
                for pollster_id, group in grouped:
                    # Find the most recent 'end_date' for this pollster
                    last_end_date_index = group['days_past_index'].min()
                    
                    # Calculate the start date for the filtering (last_end_date - x_days)
                    start_date_index = last_end_date_index+avg_window
            
                    # Filter the group's data to only include rows between start_date and last_end_date
                    filtered_group = group[(group['days_past_index'] >= last_end_date_index) & (group['days_past_index'] <= start_date_index)]
                    
                    # Calculate the mean of 'pct' for this filtered group
                    pollster_mean = filtered_group['pct'].mean()
                    c_mean2data = candidate_data[(candidate_data['days_past_index'] >= last_end_date_index) & (candidate_data['days_past_index'] <= start_date_index)]
                    c_mean2 = c_mean2data['pct'].mean() #np.average(c_mean2data['pct'], weights=c_mean2data['w_i']) #c_mean2data['pct'].mean()
                    # Store the result in the dictionary
                    house_effect = pollster_mean - c_mean2
                    pollster_means[pollster_id] = house_effect
            
                # Convert the dictionary to a pandas Series for easier handling
                c_mean_by_pollster = pd.Series(pollster_means)

                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Rename the columns for clarity
                c_mean_by_pollster_df.columns = ['pollster_id', 'house_effect']
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['mean_pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                #c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect']  # Existing house effect data
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll

                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
                
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Virginia")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Virginia


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-04,16661,Kamala Harris,51.532954,0.05,19
1,2024-10-04,16651,Donald Trump,43.520947,0.05,19
2,2024-09-24,16661,Kamala Harris,51.018343,0.05,19
3,2024-09-24,16651,Donald Trump,42.702076,0.05,19
4,2024-09-18,16661,Kamala Harris,51.005104,0.05,19
5,2024-09-18,16651,Donald Trump,42.216649,0.05,19
6,2024-09-08,16661,Kamala Harris,,0.05,19
7,2024-09-08,16651,Donald Trump,,0.05,19
8,2024-08-16,16661,Kamala Harris,,0.05,19
9,2024-08-16,16651,Donald Trump,,0.05,19


In [154]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Texas')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Texas)"
)

# Display the final chart
final_chart.display()


Texas
    end_date candidate_name  weighted_average_pct
8 2024-10-04   Donald Trump             43.667059
9 2024-10-04  Kamala Harris             51.695239
6 2024-09-24   Donald Trump             43.069762
7 2024-09-24  Kamala Harris             50.967986
4 2024-09-18   Donald Trump             42.346203
5 2024-09-18  Kamala Harris             50.751938
2 2024-09-08   Donald Trump                   NaN
3 2024-09-08  Kamala Harris                   NaN
0 2024-08-16   Donald Trump                   NaN
1 2024-08-16  Kamala Harris                   NaN
