In [1]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np

In [2]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('president_polls.csv')

# Display the first few rows of the DataFrame to verify the import
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct
0,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0
1,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,REP,Trump,16651,Donald Trump,46.0
2,88674,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,60.0
3,88674,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,general,False,False,,False,REP,Trump,16651,Donald Trump,40.0
4,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,51.0


In [3]:
df_state_na = df[df['state'].isna()]

df_state_not_na = df[df['state'].notna()]

print(df_state_na.head())
print(df_state_not_na.columns)
print(df_state_not_na.head())

   poll_id  pollster_id      pollster sponsor_ids                sponsors  \
0    88672          770          TIPP         NaN                     NaN   
1    88672          770          TIPP         NaN                     NaN   
4    88670          568        YouGov         133                CBS News   
5    88670          568        YouGov         133                CBS News   
6    88647         1554  RMG Research        2178  Napolitan News Service   

    display_name  pollster_rating_id pollster_rating_name  numeric_grade  \
0  TIPP Insights                 144        TIPP Insights            1.8   
1  TIPP Insights                 144        TIPP Insights            1.8   
4         YouGov                 391               YouGov            3.0   
5         YouGov                 391               YouGov            3.0   
6   RMG Research                 555         RMG Research            2.3   

   pollscore  ...    stage  nationwide_batch ranked_choice_reallocated  \
0     

Let's filter out non scored national pollsters 

In [4]:
df_state_na_clean = df_state_na[df_state_na['numeric_grade'].notna()]
print(df_state_na_clean['numeric_grade'])
print(df_state_na_clean['numeric_grade'].max())

0        1.8
1        1.8
4        3.0
5        3.0
6        2.3
        ... 
15460    2.8
15461    2.8
15462    2.8
15463    2.8
15464    2.8
Name: numeric_grade, Length: 6922, dtype: float64
3.0


Let's create a weight for numerically graded pollsters

In [5]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()

# Now safely create the 'weight_score' column
df_state_na_clean.loc[:, 'weight_score'] = df_state_na_clean['numeric_grade'] / 3.0


Let's check weight_grade column

In [6]:
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score
0,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6
1,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,REP,Trump,16651,Donald Trump,46.0,0.6
4,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,DEM,Harris,16661,Kamala Harris,51.0,1.0
5,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,REP,Trump,16651,Donald Trump,48.0,1.0
6,88647,1554,RMG Research,2178.0,Napolitan News Service,RMG Research,555,RMG Research,2.3,-0.4,...,False,False,,False,DEM,Harris,16661,Kamala Harris,50.0,0.766667


Let's go ahead and examine unique methodologies

In [7]:
print(df_state_na_clean['methodology'].unique())

['Online Panel' 'IVR/Text' 'Live Phone' 'Probability Panel' nan
 'IVR/Online Panel/Text-to-Web' 'Live Phone/Online Panel/Text-to-Web'
 'Live Phone/Text-to-Web' 'Live Phone/Online Panel/App Panel'
 'Live Phone/Online Panel/Text' 'Live Phone/Probability Panel' 'IVR'
 'Online Panel/Text-to-Web' 'Text-to-Web/Online Ad' 'Online Ad'
 'Live Phone/Online Panel' 'Live Phone/Text-to-Web/Online Ad'
 'IVR/Text-to-Web' 'Live Phone/Text/Online Panel' 'IVR/Online Panel'
 'Text' 'Online Panel/Online Ad' 'IVR/Online Panel/Email'
 'IVR/Live Phone/Text/Online Panel/Email' 'Live Phone/Text/Online Ad'
 'Online Panel/Text-to-Web/Text' 'Live Phone/Text-to-Web/App Panel'
 'Online Panel/Probability Panel' 'App Panel'
 'IVR/Online Panel/Text-to-Web/Email']


mapping different methodologies to weight_mode

In [8]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.82,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_na_clean.loc[:,'weight_mode'] = df_state_na_clean['methodology'].map(mode_weights)

Let's check out the 'weight_mode' column

In [9]:
print(df_state_na_clean.head())

   poll_id  pollster_id      pollster sponsor_ids                sponsors  \
0    88672          770          TIPP         NaN                     NaN   
1    88672          770          TIPP         NaN                     NaN   
4    88670          568        YouGov         133                CBS News   
5    88670          568        YouGov         133                CBS News   
6    88647         1554  RMG Research        2178  Napolitan News Service   

    display_name  pollster_rating_id pollster_rating_name  numeric_grade  \
0  TIPP Insights                 144        TIPP Insights            1.8   
1  TIPP Insights                 144        TIPP Insights            1.8   
4         YouGov                 391               YouGov            3.0   
5         YouGov                 391               YouGov            3.0   
6   RMG Research                 555         RMG Research            2.3   

   pollscore  ... ranked_choice_reallocated  ranked_choice_round hypothetical  \

Let's create a weight for sample size, but first let's look for NaN in sample_size column

In [10]:
# Count the number of NaN values in the 'sample_size' column
nan_count = df['sample_size'].isna().sum()

print(f"Number of NaN values in 'sample_size': {nan_count}")

# Calculate the mean of the available (non-NaN) sample sizes
mean_sample_size = df['sample_size'].mean()

print(f"Mean of available sample sizes: {mean_sample_size}")


Number of NaN values in 'sample_size': 132
Mean of available sample sizes: 1608.8425171177046


In [11]:
import numpy as np

In [12]:
# Step 2: Create the 'weight_sample' column
df_state_na_clean['weight_sample'] = df_state_na_clean['sample_size'].apply(lambda x: np.sqrt(x) if not np.isnan(x) else np.sqrt(mean_sample_size))

# Display the first few rows to verify
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.81379
1,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.81379
4,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,51.0,1.0,0.5,52.076866
5,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,,False,REP,Trump,16651,Donald Trump,48.0,1.0,0.5,52.076866
6,88647,1554,RMG Research,2178.0,Napolitan News Service,RMG Research,555,RMG Research,2.3,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,50.0,0.766667,0.5,54.267854


Sort end_date values in descending order

In [13]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_na_clean['end_date'] = pd.to_datetime(df_state_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_na_clean_sorted = df_state_na_clean.sort_values(by='end_date',ascending=False)

In [14]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.81379
1,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.81379
4,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,51.0,1.0,0.5,52.076866
5,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,,False,REP,Trump,16651,Donald Trump,48.0,1.0,0.5,52.076866
6,88647,1554,RMG Research,2178.0,Napolitan News Service,RMG Research,555,RMG Research,2.3,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,50.0,0.766667,0.5,54.267854


Let's create a 'days_past_index' that can be used for weight_time_decay value for moving average

In [15]:
# Step 3: Get the first (top) date after sorting
first_date = df_state_na_clean_sorted['end_date'].iloc[0]

# Step 4: Compute the difference in days and create the 'days_past_index' column
df_state_na_clean_sorted['days_past_index'] = (first_date - df_state_na_clean_sorted['end_date']).dt.days

In [16]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
0,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.81379,0
1,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.81379,0
4,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,False,DEM,Harris,16661,Kamala Harris,51.0,1.0,0.5,52.076866,2
5,88670,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,False,REP,Trump,16651,Donald Trump,48.0,1.0,0.5,52.076866,2
6,88647,1554,RMG Research,2178.0,Napolitan News Service,RMG Research,555,RMG Research,2.3,-0.4,...,False,DEM,Harris,16661,Kamala Harris,50.0,0.766667,0.5,54.267854,3


In [17]:
df_state_na_clean_sorted.tail()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
15460,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Cruz,16641,Ted Cruz,24.0,0.933333,0.65,33.24154,1276
15461,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,41.0,0.933333,0.65,33.256578,1276
15462,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,DeSantis,16646,Ron DeSantis,25.0,0.933333,0.65,33.256578,1276
15463,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,44.0,0.933333,0.65,33.27161,1276
15464,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Haley,16640,Nikki Haley,19.0,0.933333,0.65,33.27161,1276


In [18]:
# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()

# Check for NaN values in 'weight_mode', 'weight_sample', and 'weight_score'
nan_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isna().sum()

# Step 2: Check for infinite values in the involved columns
inf_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isin([np.inf, -np.inf]).sum()
print(f"Number of infinite values:\n{inf_check}")


# Print the result to verify if there are any NaN values
print(nan_check)

# Step 1: Filter rows where 'weight_mode' is NaN
nan_weight_mode = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['weight_mode'].isna()]

# Step 2: Display the 'methodology' or other relevant columns to investigate the methodology used
# For example, we'll check 'pollster', 'sponsors', and 'methodology' (if available) along with 'weight_mode'
nan_weight_mode_info = nan_weight_mode[['pollster', 'sponsors', 'methodology', 'weight_mode']]

# Print the resulting DataFrame for verification
print(nan_weight_mode_info)

Number of infinite values:
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
Empty DataFrame
Columns: [pollster, sponsors, methodology, weight_mode]
Index: []


Let's compute a weight_i where weight_i is for a given 'end_date' the determined index weight which determines for a given end_date the weight assigned to a row used in computing the point average_i. Note: we will use the sorted dates to filter dates that only on the date or before, then we will compute the 'weight_time' using the formula: exp(-lambdat) where lambda = 1.0 and t = days elapsed since the beginning of the poll end date.
finally we can compute the weight_i = 'weight_mode''weight_sample'*weight_score'*weight_time'

In [19]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 1.0
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_lv = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['population']=='lv'].copy()

# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        #  We select H2H if avaible and only use non H2H if H2H is not available
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()
        
        # Step 7: Filter data for the specific candidate
        
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        #Start House effect data computation#
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < 40) & 
                                            (candidate_data['days_past_index'] > 1)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-13,16661,Kamala Harris,50.160074
1,2024-10-13,16651,Donald Trump,46.507374
2,2024-10-11,16661,Kamala Harris,50.611328
3,2024-10-11,16651,Donald Trump,46.732967
4,2024-10-10,16661,Kamala Harris,49.562413


Let's view the plot with weight average:

In [20]:
# Create the line chart with points
# Create a customized color encoding for Trump and Harris
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Create the line chart with points, assigning specific colors to each candidate
chart = alt.Chart(df_weighted_averages).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate_name:N', scale=color_scale),  # Custom color scale for candidates
    tooltip=['end_date', 'candidate_name', 'weighted_average_pct']  # Add tooltips to show details
).properties(
    title='Weighted Average Polling Results Over Time',
    width=600,
    height=400
)

chart.show()


In [21]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q']  # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

Let's look at the loess smoothed curve of this

In [22]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


Let's look at A/B graded pollsters

In [23]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 0.23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_ab_pollsters = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['numeric_grade'] >= 2.4].copy()
df_lv = df_ab_pollsters[df_ab_pollsters['population'] == 'lv'].copy()
# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        #  We select H2H if avaible and only use non H2H if H2H is not available
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy() 
        
        # Step 7: Filter data for the specific candidate
        
        c_mean = candidate_data[candidate_data['days_past_index']< 40]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 40]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )

        #Start House effect data computation#
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < 40) & 
                                            (candidate_data['days_past_index'] > 1)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        ## finish house effect weighting  ##   
        
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-11,16661,Kamala Harris,49.590426
1,2024-10-11,16651,Donald Trump,47.109859
2,2024-10-08,16661,Kamala Harris,49.221268
3,2024-10-08,16651,Donald Trump,47.103795
4,2024-10-07,16661,Kamala Harris,49.511517


In [24]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

In [25]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


### Moving on to Battleground state data

Let's clean 'numeric_grade' rows for df_state_not_na so that there aren't ungraded pollsters in our list

In [26]:
#df_state_not_na_clean = df_state_not_na[df_state_not_na['numeric_grade'].notna()]

In [27]:
#na_num = df_state_not_na_clean['numeric_grade'].isna().sum()
#print(na_num)

Let's normalize 'numeric_grade' to create a 'weight_score'

In [28]:
# Make a copy to avoid the warning
df_state_not_na = df_state_not_na.copy()

# Step 1: Replace NaN values in 'numeric_grade' with a low grade (e.g., 1.0)
df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)

df_state_not_na_clean = df_state_not_na.copy()
# Step 2: Safely create the 'weight_score' column by dividing 'numeric_grade' by 3.0
df_state_not_na_clean.loc[:, 'weight_score'] = df_state_not_na_clean['numeric_grade'] / 3.0

# Verify the updated DataFrame
print(df_state_not_na_clean[['numeric_grade', 'weight_score']].head())

    numeric_grade  weight_score
2             0.1      0.033333
3             0.1      0.033333
8             3.0      1.000000
9             3.0      1.000000
10            3.0      1.000000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)


In [29]:
na_num = df_state_not_na_clean['weight_score'].isna().sum()
print(na_num)

0


#### Creating 'weight_mode' weight for methodology

In [30]:
import numpy as np
# Make a copy to avoid the warning
df_state_not_na_clean = df_state_not_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    "Text-to-Web/Email": 0.75,
    "IVR/Live Phone/Text-to-Web": 0.76,
    "IVR/Live Phone/Online Panel": 0.78,
    "IVR/Live Phone/Online Panel/Text-to-Web": 0.77,
    "Live Phone/Text-to-Web/Email/Mail-to-Web": 0.76,
    "Live Phone/Text-to-Web/Email": 0.78,
    "Email/Online Ad": 0.73,
    "Online Panel/Email": 0.78,
    "Live Phone/Online Panel/Mail-to-Web": 0.78,
    "IVR/Text-to-Web/Email": 0.72,
    'Email':0.8,
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.85,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'Live Phone/Text-to-Web/Email/Mail-to-Web/Mail-to-Phone':0.76,
    'Live Phone/Email':0.82,
    'Live Phone/Online Panel/Text-to-Web/Text':0.8,
    'Live Phone/Text':0.83,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Live Phone/Text':0.78,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Mail-to-Web/Mail-to-Phone': 0.7,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Email/Text-to-Web':0.77,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Text':0.78,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text-to-Web':0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_not_na_clean.loc[:,'weight_mode'] = df_state_not_na_clean['methodology'].map(mode_weights)

In [31]:
num_na = df_state_not_na_clean['weight_mode'].isna().sum()
print(num_na)
df_ret = df_state_not_na_clean[df_state_not_na_clean['weight_mode'].isna()][['methodology']]
print(df_ret['methodology'].unique())

0
[]


#### Create a Weight_Sample weight for sample size

In [32]:
# Step 2: Create the 'weight_sample' column
df_state_not_na_clean['weight_sample'] = df_state_not_na_clean['sample_size'].apply(lambda x: np.sqrt(x/600) if not np.isnan(x) else np.sqrt(mean_sample_size/600))

# Display the first few rows to verify
df_state_not_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88674,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,60.0,0.033333,0.5,0.816497
3,88674,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,40.0,0.033333,0.5,0.816497
8,88664,1424,Siena/NYT,1875.0,The Philadelphia Inquirer,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,,False,DEM,Harris,16661,Kamala Harris,50.0,1.0,1.0,1.195129
9,88664,1424,Siena/NYT,1875.0,The Philadelphia Inquirer,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,,False,REP,Trump,16651,Donald Trump,47.0,1.0,1.0,1.195129
10,88664,1424,Siena/NYT,1875.0,The Philadelphia Inquirer,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,,False,DEM,Harris,16661,Kamala Harris,49.0,1.0,1.0,1.195129


In [33]:
num_na = df_state_not_na_clean['weight_sample'].isna().sum()
print(num_na)

0


####  Converting 'end_date' to datetime format and sorting dataframe by end_date

In [34]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_not_na_clean['end_date'] = pd.to_datetime(df_state_not_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_not_na_clean_sorted = df_state_not_na_clean.sort_values(by='end_date',ascending=False)

In [35]:
df_state_not_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88674,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,60.0,0.033333,0.5,0.816497
3,88674,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,40.0,0.033333,0.5,0.816497
23,88665,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,,False,REP,Trump,16651,Donald Trump,50.0,1.0,1.0,1.16046
37,88654,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,57.2,0.033333,0.5,0.816497
36,88654,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,42.8,0.033333,0.5,0.816497


#### Let's compute battleground state PA

In [36]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Pennsylvania"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'end_date': current_date,
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Pennsylvania")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Pennsylvania


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-10,16661,Kamala Harris,48.103139
1,2024-10-10,16651,Donald Trump,47.7283
2,2024-10-09,16661,Kamala Harris,48.114252
3,2024-10-09,16651,Donald Trump,47.633548
4,2024-10-08,16661,Kamala Harris,48.149845
5,2024-10-08,16651,Donald Trump,47.375297
6,2024-10-07,16661,Kamala Harris,47.6364
7,2024-10-07,16651,Donald Trump,47.245185
8,2024-10-02,16661,Kamala Harris,48.643393
9,2024-10-02,16651,Donald Trump,47.548393


In [37]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Pennsylvania)',
    width=600,
    height=400
)

chart.show()

In [38]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Pennsylvania',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Summary Stats of Pollster House Effects in State

In [39]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
house_effect_summary = house_effect_data.groupby(['pollster','candidate_name'])['house_effect'].describe()

# Display the summary statistics
print(house_effect_summary)

                                count      mean       std       min       25%  \
pollster        candidate_name                                                  
ActiVote        Donald Trump     34.0  0.507355  0.792029  0.000000  0.000000   
                Kamala Harris    34.0  1.291259  1.596836  0.000000  0.000000   
AtlasIntel      Donald Trump     16.0  1.495602  1.546544  0.000000  0.000000   
                Kamala Harris    16.0 -0.372490  0.413148 -0.951818 -0.654412   
Beacon/Shaw     Donald Trump     18.0  0.443917  0.446600  0.000000  0.000000   
...                               ...       ...       ...       ...       ...   
YouGov          Kamala Harris    70.0  0.364518  0.876033 -0.530769  0.000000   
Z to A Research Donald Trump     24.0 -0.149918  0.362414 -1.307143  0.000000   
                Kamala Harris    24.0 -0.292628  0.672367 -2.030769  0.000000   
co/efficient    Donald Trump     20.0  0.069850  0.272359 -0.166667  0.000000   
                Kamala Harri

#### Time Series of Pollster House Effects in State

In [40]:
# Define selection for highlighting specific pollster
highlight = alt.selection_multi(fields=['pollster'], bind='legend')

# Manually map shapes for the candidates ('circle' for Kamala Harris and 'square' for Donald Trump)
shape_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['circle', 'square'])

# Scatter plot for house effect data with shape highlight based on candidate_name (restricted to circle and square)
scatter = alt.Chart(house_effect_data).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('house_effect:Q', title='House Effect'),
    color=alt.condition(highlight, 'pollster:N', alt.value('lightgray'), legend=alt.Legend(title="Pollster")),  # Color for pollster with legend
    shape=alt.condition(
        highlight, 
        alt.Shape('candidate_name:N', scale=shape_scale),  # Shape encoding based on candidate
        alt.value('circle')  # Default shape for non-selected points
    ),
    tooltip=['pollster', 'candidate_name', 'end_date', 'house_effect']
).properties(
    width=800,
    height=600
).add_selection(
    highlight
)

# LOESS smoothing for each pollster (groupby pollster)
loess = alt.Chart(house_effect_data).transform_loess(
    'end_date', 'house_effect', groupby=['pollster']
).mark_line(size=3).encode(
    x='end_date:T',
    y='house_effect:Q',
    color=alt.condition(highlight, 'pollster:N', alt.value('lightgray')),  # Color highlighting for LOESS curve
    size=alt.condition(highlight, alt.value(3), alt.value(1))  # Thicker line for selected pollsters
)

# Layering the scatter plot and LOESS smoothing curve
final_chart = alt.layer(scatter, loess).properties(
    title="House Effect Data with LOESS Smoothing (Highlight Pollster)"
)

# Display the final chart
final_chart.display()

  highlight = alt.selection_multi(fields=['pollster'], bind='legend')
  ).add_selection(


In [41]:
# Group by 'end_date' and 'pollster' and calculate the mean of 'house_effect'
grouped_data = house_effect_data.groupby(['end_date', 'pollster']).agg(
    avg_house_effect=('house_effect', 'mean')
).reset_index()

# Define selection for pollster highlighting
highlight = alt.selection_multi(fields=['pollster'], bind='legend')

# Create a scatter plot for the average house effect, color-coded by 'pollster'
scatter = alt.Chart(grouped_data).mark_circle(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('avg_house_effect:Q', title='Average House Effect'),
    color=alt.condition(
        highlight,  # Highlight selected pollsters
        'pollster:N',  # Color based on pollster when selected
        alt.value('lightgray')  # Gray out when not selected
    ),
    tooltip=['end_date:T', 'avg_house_effect:Q', 'pollster:N']
).properties(
    width=600,
    height=400,
    title="Average House Effect by End Date, Color-coded by Pollster"
).add_selection(
    highlight  # Add the selection to the chart
)

# LOESS smoothing for each pollster (groupby pollster)
loess = alt.Chart(grouped_data).transform_loess(
    'end_date', 'avg_house_effect', groupby=['pollster']
).mark_line().encode(
    x='end_date:T',
    y='avg_house_effect:Q',
    color=alt.condition(
        highlight,  # Highlight LOESS lines for selected pollsters
        'pollster:N',  # Color based on pollster when selected
        alt.value('lightgray')  # Gray out when not selected
    ),
    size=alt.condition(highlight, alt.value(2), alt.value(1))  # Thicker line for selected pollsters
)

# Layering the scatter plot and LOESS smoothing lines
final_chart = alt.layer(scatter, loess)

# Display the final chart
final_chart.display()

  highlight = alt.selection_multi(fields=['pollster'], bind='legend')
  ).add_selection(


#### Looking at the Distributions of Pollster House Effects in State

In [42]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Let's compute battleground state Michigan

In [43]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Michigan"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)   
print("Michigan")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Michigan


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-09,16661,Kamala Harris,48.250764
1,2024-10-09,16651,Donald Trump,47.242871
2,2024-10-08,16661,Kamala Harris,47.769318
3,2024-10-08,16651,Donald Trump,46.98622
4,2024-10-07,16661,Kamala Harris,47.295378
5,2024-10-07,16651,Donald Trump,46.805408
6,2024-10-04,16661,Kamala Harris,47.500907
7,2024-10-04,16651,Donald Trump,46.860549
8,2024-10-02,16661,Kamala Harris,48.179582
9,2024-10-02,16651,Donald Trump,47.289569


In [44]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Michigan)',
    width=600,
    height=400
)

chart.show()

In [45]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Michigan',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Boxplots of House Effects distributions by Pollster and Candidate

In [46]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Wisconsin

In [47]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Wisconsin"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Wisconsin")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Wisconsin


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-09,16661,Kamala Harris,48.173713
1,2024-10-09,16651,Donald Trump,47.815124
2,2024-10-08,16661,Kamala Harris,47.870375
3,2024-10-08,16651,Donald Trump,47.55295
4,2024-10-07,16661,Kamala Harris,48.400144
5,2024-10-07,16651,Donald Trump,46.823125
6,2024-10-06,16661,Kamala Harris,48.625476
7,2024-10-06,16651,Donald Trump,46.728814
8,2024-10-02,16661,Kamala Harris,48.732676
9,2024-10-02,16651,Donald Trump,47.174431


In [48]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Wisconsin)',
    width=600,
    height=400
)

chart.show()

In [49]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Wisconsin',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Boxplots of House Effects distributions by Pollster and Candidate

In [50]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground North Carolina

In [51]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="North Carolina"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("North Carolina")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


North Carolina


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-09,16661,Kamala Harris,48.073081
1,2024-10-09,16651,Donald Trump,48.648958
2,2024-10-08,16661,Kamala Harris,48.189925
3,2024-10-08,16651,Donald Trump,48.6535
4,2024-10-06,16661,Kamala Harris,47.888604
5,2024-10-06,16651,Donald Trump,48.528796
6,2024-10-02,16661,Kamala Harris,48.304485
7,2024-10-02,16651,Donald Trump,48.440619
8,2024-09-30,16661,Kamala Harris,48.400404
9,2024-09-30,16651,Donald Trump,48.464351


In [52]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (North Carolina)',
    width=600,
    height=400
)

chart.show()

In [53]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for North Carolina',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Boxplots of House Effects distributions by Pollster and Candidate

In [54]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Georgia

In [55]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Georgia"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages) 
print("Georgia")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Georgia


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-10,16661,Kamala Harris,46.337292
1,2024-10-10,16651,Donald Trump,48.438845
2,2024-10-09,16661,Kamala Harris,46.42663
3,2024-10-09,16651,Donald Trump,48.18216
4,2024-10-08,16661,Kamala Harris,46.516401
5,2024-10-08,16651,Donald Trump,47.690892
6,2024-10-02,16661,Kamala Harris,47.479924
7,2024-10-02,16651,Donald Trump,48.512588
8,2024-09-30,16661,Kamala Harris,47.628123
9,2024-09-30,16651,Donald Trump,48.609356


In [56]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Georgia)',
    width=600,
    height=400
)

chart.show()

In [57]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Georgia',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Boxplots of House Effects distributions by Pollster and Candidate

In [58]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Florida

In [59]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 21
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Florida"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)   
print("Florida")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Florida


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-07,16661,Kamala Harris,44.846432
1,2024-10-07,16651,Donald Trump,50.587856
2,2024-10-06,16661,Kamala Harris,44.704463
3,2024-10-06,16651,Donald Trump,50.490712
4,2024-10-04,16661,Kamala Harris,45.565734
5,2024-10-04,16651,Donald Trump,49.273821
6,2024-10-02,16661,Kamala Harris,45.985179
7,2024-10-02,16651,Donald Trump,49.322103
8,2024-09-27,16661,Kamala Harris,46.017427
9,2024-09-27,16651,Donald Trump,49.517051


In [60]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Florida)',
    width=600,
    height=400
)

chart.show()

#### Boxplots of House Effects distributions by Pollster and Candidate

In [61]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Arizona

In [62]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Arizona"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Arizona")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Arizona


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-10,16661,Kamala Harris,47.178125
1,2024-10-10,16651,Donald Trump,49.0232
2,2024-10-09,16661,Kamala Harris,47.357499
3,2024-10-09,16651,Donald Trump,49.075002
4,2024-10-08,16661,Kamala Harris,47.310086
5,2024-10-08,16651,Donald Trump,48.883617
6,2024-10-07,16661,Kamala Harris,47.247711
7,2024-10-07,16651,Donald Trump,48.245368
8,2024-10-02,16661,Kamala Harris,47.529708
9,2024-10-02,16651,Donald Trump,48.340218


In [63]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Arizona)',
    width=600,
    height=400
)

chart.show()

In [64]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Arizona',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Boxplots of House Effects distributions by Pollster and Candidate

In [65]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Nevada

In [66]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 31
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Nevada"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Nevada")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Nevada


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-09,16661,Kamala Harris,48.326862
1,2024-10-09,16651,Donald Trump,47.251961
2,2024-10-08,16661,Kamala Harris,48.404333
3,2024-10-08,16651,Donald Trump,47.140208
4,2024-10-03,16661,Kamala Harris,48.367023
5,2024-10-03,16651,Donald Trump,47.173854
6,2024-10-02,16661,Kamala Harris,48.35217
7,2024-10-02,16651,Donald Trump,47.105443
8,2024-09-30,16661,Kamala Harris,48.382725
9,2024-09-30,16651,Donald Trump,47.108444


In [67]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Nevada)',
    width=600,
    height=400
)

chart.show()

In [68]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Nevada',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Boxplots of House Effects distributions by Pollster and Candidate

In [69]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Minnesota

In [70]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .15
avg_window = 20
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Minnesota"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-09,16661,Kamala Harris,51.107822
1,2024-10-09,16651,Donald Trump,44.391558
2,2024-10-02,16661,Kamala Harris,49.7577
3,2024-10-02,16651,Donald Trump,43.246685
4,2024-09-26,16661,Kamala Harris,49.612351


In [71]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Nevada)',
    width=600,
    height=400
)

chart.show()

In [72]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Minnesota',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


In [73]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Texas

In [74]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .15
avg_window = 20
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Texas"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-07,16661,Kamala Harris,45.236253
1,2024-10-07,16651,Donald Trump,51.312156
2,2024-10-06,16661,Kamala Harris,45.310371
3,2024-10-06,16651,Donald Trump,51.130032
4,2024-10-04,16661,Kamala Harris,45.258579


In [75]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Texas)',
    width=600,
    height=400
)

chart.show()

In [76]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Texas',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()
