In [1]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np

In [2]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('president_polls.csv')

# Display the first few rows of the DataFrame to verify the import
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct
0,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0
1,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,REP,Trump,16651,Donald Trump,46.0
2,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0
3,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,general,False,False,,False,REP,Trump,16651,Donald Trump,50.0
4,88697,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,62.9


In [3]:
df_state_na = df[df['state'].isna()]

df_state_not_na = df[df['state'].notna()]

print(df_state_na.head())
print(df_state_not_na.columns)
print(df_state_not_na.head())

   poll_id  pollster_id             pollster sponsor_ids      sponsors  \
0    88685          770                 TIPP         NaN           NaN   
1    88685          770                 TIPP         NaN           NaN   
6    88672          770                 TIPP         NaN           NaN   
7    88672          770                 TIPP         NaN           NaN   
8    88676         1797  HarrisX/Harris Poll         763  Harvard CAPS   

          display_name  pollster_rating_id         pollster_rating_name  \
0        TIPP Insights                 144                TIPP Insights   
1        TIPP Insights                 144                TIPP Insights   
6        TIPP Insights                 144                TIPP Insights   
7        TIPP Insights                 144                TIPP Insights   
8  HarrisX/Harris Poll                 133  Harris Insights & Analytics   

   numeric_grade  pollscore  ...    stage  nationwide_batch  \
0            1.8       -0.4  ...  general

Let's filter out non scored national pollsters 

In [4]:
df_state_na_clean = df_state_na[df_state_na['numeric_grade'].notna()]
print(df_state_na_clean['numeric_grade'])
print(df_state_na_clean['numeric_grade'].max())

0        1.8
1        1.8
6        1.8
7        1.8
8        1.5
        ... 
15518    2.8
15519    2.8
15520    2.8
15521    2.8
15522    2.8
Name: numeric_grade, Length: 6944, dtype: float64
3.0


Let's create a weight for numerically graded pollsters

In [5]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()

# Now safely create the 'weight_score' column
df_state_na_clean.loc[:, 'weight_score'] = df_state_na_clean['numeric_grade'] / 3.0


Let's check weight_grade column

In [6]:
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score
0,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6
1,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,REP,Trump,16651,Donald Trump,46.0,0.6
6,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6
7,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,False,,False,REP,Trump,16651,Donald Trump,46.0,0.6
8,88676,1797,HarrisX/Harris Poll,763.0,Harvard CAPS,HarrisX/Harris Poll,133,Harris Insights & Analytics,1.5,-0.1,...,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0,0.5


Let's go ahead and examine unique methodologies

In [7]:
print(df_state_na_clean['methodology'].unique())

['Online Panel' 'IVR/Text' 'Live Phone' 'Probability Panel' nan
 'IVR/Online Panel/Text-to-Web' 'Live Phone/Online Panel/Text-to-Web'
 'Live Phone/Text-to-Web' 'Live Phone/Online Panel/App Panel'
 'Live Phone/Online Panel/Text' 'Live Phone/Probability Panel' 'IVR'
 'Online Panel/Text-to-Web' 'Text-to-Web/Online Ad' 'Online Ad'
 'Live Phone/Online Panel' 'Live Phone/Text-to-Web/Online Ad'
 'IVR/Text-to-Web' 'Live Phone/Text/Online Panel' 'IVR/Online Panel'
 'Text' 'Online Panel/Online Ad' 'IVR/Online Panel/Email'
 'IVR/Live Phone/Text/Online Panel/Email' 'Live Phone/Text/Online Ad'
 'Online Panel/Text-to-Web/Text' 'Live Phone/Text-to-Web/App Panel'
 'Online Panel/Probability Panel' 'App Panel'
 'IVR/Online Panel/Text-to-Web/Email']


mapping different methodologies to weight_mode

In [8]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.82,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_na_clean.loc[:,'weight_mode'] = df_state_na_clean['methodology'].map(mode_weights)

Let's check out the 'weight_mode' column

In [9]:
print(df_state_na_clean.head())

   poll_id  pollster_id             pollster sponsor_ids      sponsors  \
0    88685          770                 TIPP         NaN           NaN   
1    88685          770                 TIPP         NaN           NaN   
6    88672          770                 TIPP         NaN           NaN   
7    88672          770                 TIPP         NaN           NaN   
8    88676         1797  HarrisX/Harris Poll         763  Harvard CAPS   

          display_name  pollster_rating_id         pollster_rating_name  \
0        TIPP Insights                 144                TIPP Insights   
1        TIPP Insights                 144                TIPP Insights   
6        TIPP Insights                 144                TIPP Insights   
7        TIPP Insights                 144                TIPP Insights   
8  HarrisX/Harris Poll                 133  Harris Insights & Analytics   

   numeric_grade  pollscore  ... ranked_choice_reallocated  \
0            1.8       -0.4  ...          

Let's create a weight for sample size, but first let's look for NaN in sample_size column

In [10]:
# Count the number of NaN values in the 'sample_size' column
nan_count = df['sample_size'].isna().sum()

print(f"Number of NaN values in 'sample_size': {nan_count}")

# Calculate the mean of the available (non-NaN) sample sizes
mean_sample_size = df['sample_size'].mean()

print(f"Mean of available sample sizes: {mean_sample_size}")


Number of NaN values in 'sample_size': 132
Mean of available sample sizes: 1615.1498083544468


In [11]:
import numpy as np

In [12]:
# Step 2: Create the 'weight_sample' column
df_state_na_clean['weight_sample'] = df_state_na_clean['sample_size'].apply(lambda x: np.sqrt(x) if not np.isnan(x) else np.sqrt(mean_sample_size))

# Display the first few rows to verify
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.612137
1,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.612137
6,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.81379
7,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.81379
8,88676,1797,HarrisX/Harris Poll,763.0,Harvard CAPS,HarrisX/Harris Poll,133,Harris Insights & Analytics,1.5,-0.1,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.5,0.5,50.950957


Sort end_date values in descending order

In [13]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_na_clean['end_date'] = pd.to_datetime(df_state_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_na_clean_sorted = df_state_na_clean.sort_values(by='end_date',ascending=False)

In [14]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.612137
1,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.612137
6,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.81379
11,88676,1797,HarrisX/Harris Poll,763.0,Harvard CAPS,HarrisX/Harris Poll,133,Harris Insights & Analytics,1.5,-0.1,...,,False,GRE,Stein,31116,Jill Stein,1.0,0.5,0.5,50.950957
15,88684,1189,Morning Consult,,,Morning Consult,218,Morning Consult,1.9,-0.3,...,,False,REP,Trump,16651,Donald Trump,46.0,0.633333,0.5,92.989247


Let's create a 'days_past_index' that can be used for weight_time_decay value for moving average

In [15]:
# Step 3: Get the first (top) date after sorting
first_date = df_state_na_clean_sorted['end_date'].iloc[0]

# Step 4: Compute the difference in days and create the 'days_past_index' column
df_state_na_clean_sorted['days_past_index'] = (first_date - df_state_na_clean_sorted['end_date']).dt.days

In [16]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
0,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.612137,0
1,88685,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,34.612137,0
6,88672,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,34.81379,1
11,88676,1797,HarrisX/Harris Poll,763.0,Harvard CAPS,HarrisX/Harris Poll,133,Harris Insights & Analytics,1.5,-0.1,...,False,GRE,Stein,31116,Jill Stein,1.0,0.5,0.5,50.950957,1
15,88684,1189,Morning Consult,,,Morning Consult,218,Morning Consult,1.9,-0.3,...,False,REP,Trump,16651,Donald Trump,46.0,0.633333,0.5,92.989247,1


In [17]:
df_state_na_clean_sorted.tail()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
15518,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Cruz,16641,Ted Cruz,24.0,0.933333,0.65,33.24154,1277
15519,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,41.0,0.933333,0.65,33.256578,1277
15520,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,DeSantis,16646,Ron DeSantis,25.0,0.933333,0.65,33.256578,1277
15521,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,44.0,0.933333,0.65,33.27161,1277
15522,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Haley,16640,Nikki Haley,19.0,0.933333,0.65,33.27161,1277


In [18]:
# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()

# Check for NaN values in 'weight_mode', 'weight_sample', and 'weight_score'
nan_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isna().sum()

# Step 2: Check for infinite values in the involved columns
inf_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isin([np.inf, -np.inf]).sum()
print(f"Number of infinite values:\n{inf_check}")


# Print the result to verify if there are any NaN values
print(nan_check)

# Step 1: Filter rows where 'weight_mode' is NaN
nan_weight_mode = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['weight_mode'].isna()]

# Step 2: Display the 'methodology' or other relevant columns to investigate the methodology used
# For example, we'll check 'pollster', 'sponsors', and 'methodology' (if available) along with 'weight_mode'
nan_weight_mode_info = nan_weight_mode[['pollster', 'sponsors', 'methodology', 'weight_mode']]

# Print the resulting DataFrame for verification
print(nan_weight_mode_info)

Number of infinite values:
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
Empty DataFrame
Columns: [pollster, sponsors, methodology, weight_mode]
Index: []


Let's compute a weight_i where weight_i is for a given 'end_date' the determined index weight which determines for a given end_date the weight assigned to a row used in computing the point average_i. Note: we will use the sorted dates to filter dates that only on the date or before, then we will compute the 'weight_time' using the formula: exp(-lambdat) where lambda = 1.0 and t = days elapsed since the beginning of the poll end date.
finally we can compute the weight_i = 'weight_mode''weight_sample'*weight_score'*weight_time'

In [19]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 1.0
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_lv = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['population']=='lv'].copy()

# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        #  We select H2H if avaible and only use non H2H if H2H is not available
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_gt2 = group[group['count'] > 2]
            
            if not question_ids_with_count_gt2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()
        
        # Step 7: Filter data for the specific candidate
        
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        #Start House effect data computation#
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < 40) & 
                                            (candidate_data['days_past_index'] > 1)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-14,16661,Kamala Harris,49.780542
1,2024-10-14,16651,Donald Trump,46.65153
2,2024-10-13,16661,Kamala Harris,49.707417
3,2024-10-13,16651,Donald Trump,46.682326
4,2024-10-12,16661,Kamala Harris,49.771156


Let's view the plot with weight average:

In [20]:
# Create the line chart with points
# Create a customized color encoding for Trump and Harris
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Create the line chart with points, assigning specific colors to each candidate
chart = alt.Chart(df_weighted_averages).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate_name:N', scale=color_scale),  # Custom color scale for candidates
    tooltip=['end_date', 'candidate_name', 'weighted_average_pct']  # Add tooltips to show details
).properties(
    title='Weighted Average Polling Results Over Time',
    width=600,
    height=400
)

chart.show()


In [21]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q']  # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

Let's look at the loess smoothed curve of this

In [22]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


Let's look at A/B graded pollsters

In [23]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 0.23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_ab_pollsters = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['numeric_grade'] >= 2.4].copy()
df_lv = df_ab_pollsters[df_ab_pollsters['population'] == 'lv'].copy()
# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        #  We select H2H if avaible and only use non H2H if H2H is not available
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_gt2 = group[group['count'] > 2]
            
            if not question_ids_with_count_gt2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()
        
        # Step 7: Filter data for the specific candidate
        
        c_mean = candidate_data[candidate_data['days_past_index']< 20]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 20]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )

        #Start House effect data computation#
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < 20) & 
                                            (candidate_data['days_past_index'] > 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        ## finish house effect weighting  ##   
        
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-11,16661,Kamala Harris,49.557708
1,2024-10-11,16651,Donald Trump,46.999021
2,2024-10-08,16661,Kamala Harris,49.384833
3,2024-10-08,16651,Donald Trump,46.940021
4,2024-10-07,16661,Kamala Harris,49.065623


In [24]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(        
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

In [25]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


### Moving on to Battleground state data

Let's clean 'numeric_grade' rows for df_state_not_na so that there aren't ungraded pollsters in our list

In [26]:
#df_state_not_na_clean = df_state_not_na[df_state_not_na['numeric_grade'].notna()]

In [27]:
#na_num = df_state_not_na_clean['numeric_grade'].isna().sum()
#print(na_num)

Let's normalize 'numeric_grade' to create a 'weight_score'

In [28]:
# Make a copy to avoid the warning
df_state_not_na = df_state_not_na.copy()

# Step 1: Replace NaN values in 'numeric_grade' with a low grade (e.g., 1.0)
df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)

df_state_not_na_clean = df_state_not_na.copy()
# Step 2: Safely create the 'weight_score' column by dividing 'numeric_grade' by 3.0
df_state_not_na_clean.loc[:, 'weight_score'] = df_state_not_na_clean['numeric_grade'] / 3.0

# Verify the updated DataFrame
print(df_state_not_na_clean[['numeric_grade', 'weight_score']].head())

    numeric_grade  weight_score
2             1.1      0.366667
3             1.1      0.366667
4             0.1      0.033333
5             0.1      0.033333
16            0.1      0.033333


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)


In [29]:
na_num = df_state_not_na_clean['weight_score'].isna().sum()
print(na_num)

0


#### Creating 'weight_mode' weight for methodology

In [30]:
import numpy as np
# Make a copy to avoid the warning
df_state_not_na_clean = df_state_not_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    "Text-to-Web/Email": 0.75,
    "IVR/Live Phone/Text-to-Web": 0.76,
    "IVR/Live Phone/Online Panel": 0.78,
    "IVR/Live Phone/Online Panel/Text-to-Web": 0.77,
    "Live Phone/Text-to-Web/Email/Mail-to-Web": 0.76,
    "Live Phone/Text-to-Web/Email": 0.78,
    "Email/Online Ad": 0.73,
    "Online Panel/Email": 0.78,
    "Live Phone/Online Panel/Mail-to-Web": 0.78,
    "IVR/Text-to-Web/Email": 0.72,
    'Email':0.8,
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.85,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'Live Phone/Text-to-Web/Email/Mail-to-Web/Mail-to-Phone':0.76,
    'Live Phone/Email':0.82,
    'Live Phone/Online Panel/Text-to-Web/Text':0.8,
    'Live Phone/Text':0.83,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Live Phone/Text':0.78,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Mail-to-Web/Mail-to-Phone': 0.7,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Email/Text-to-Web':0.77,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Text':0.78,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text-to-Web':0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_not_na_clean.loc[:,'weight_mode'] = df_state_not_na_clean['methodology'].map(mode_weights)

In [31]:
num_na = df_state_not_na_clean['weight_mode'].isna().sum()
print(num_na)
df_ret = df_state_not_na_clean[df_state_not_na_clean['weight_mode'].isna()][['methodology']]
print(df_ret['methodology'].unique())

0
[]


#### Create a Weight_Sample weight for sample size

In [32]:
# Step 2: Create the 'weight_sample' column
df_state_not_na_clean['weight_sample'] = df_state_not_na_clean['sample_size'].apply(lambda x: np.sqrt(x/600) if not np.isnan(x) else np.sqrt(mean_sample_size/600))

# Display the first few rows to verify
df_state_not_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.366667,0.7,1.156864
3,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,REP,Trump,16651,Donald Trump,50.0,0.366667,0.7,1.156864
4,88697,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,62.9,0.033333,0.5,0.816497
5,88697,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,37.1,0.033333,0.5,0.816497
16,88689,1890,SoCal Strategies,21522170.0,On Point Politics | Red Eagle Politics,SoCal Strategies,851,SoCal Research,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.033333,0.5,1.073934


In [33]:
num_na = df_state_not_na_clean['weight_sample'].isna().sum()
print(num_na)

0


####  Converting 'end_date' to datetime format and sorting dataframe by end_date

In [34]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_not_na_clean['end_date'] = pd.to_datetime(df_state_not_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_not_na_clean_sorted = df_state_not_na_clean.sort_values(by='end_date',ascending=False)

In [35]:
df_state_not_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,DEM,Harris,16661,Kamala Harris,49.0,0.366667,0.7,1.156864
5,88697,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,37.1,0.033333,0.5,0.816497
3,88691,1754,Patriot Polling,,,Patriot Polling,732,Patriot Polling,1.1,0.6,...,,False,REP,Trump,16651,Donald Trump,50.0,0.366667,0.7,1.156864
4,88697,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,62.9,0.033333,0.5,0.816497
17,88689,1890,SoCal Strategies,21522170.0,On Point Politics | Red Eagle Politics,SoCal Strategies,851,SoCal Research,0.1,,...,,False,REP,Trump,16651,Donald Trump,48.0,0.033333,0.5,1.073934


#### Let's compute battleground state PA

In [78]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]  # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Pennsylvania"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.6].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Pennsylvania")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Pennsylvania


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,48.536523,0.05,19
1,2024-10-10,16651,Donald Trump,47.906274,0.05,19
2,2024-10-09,16661,Kamala Harris,48.560624,0.05,19
3,2024-10-09,16651,Donald Trump,47.911585,0.05,19
4,2024-10-08,16661,Kamala Harris,48.631508,0.05,19
5,2024-10-08,16651,Donald Trump,47.840891,0.05,19
6,2024-10-07,16661,Kamala Harris,48.559853,0.05,19
7,2024-10-07,16651,Donald Trump,47.611771,0.05,19
8,2024-10-02,16661,Kamala Harris,48.806635,0.05,19
9,2024-10-02,16651,Donald Trump,47.403892,0.05,19


In [79]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index()

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Pennsylvania)"
)

# Display the final chart
final_chart.display()


#### Summary Stats of Pollster House Effects in State

In [80]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
house_effect_summary = he_agg.groupby(['pollster','candidate_name'])['house_effect'].describe()

# Display the summary statistics
print(house_effect_summary)

                                                          count      mean  \
pollster                                  candidate_name                    
AtlasIntel                                Donald Trump      7.0  2.148791   
                                          Kamala Harris     7.0 -0.450051   
Beacon/Shaw                               Donald Trump      8.0  0.607980   
                                          Kamala Harris     8.0 -0.518943   
CNN/SSRS                                  Donald Trump     20.0 -0.035135   
                                          Kamala Harris    20.0 -0.297948   
Cygnal                                    Donald Trump     25.0 -0.105592   
                                          Kamala Harris    25.0 -0.316903   
Emerson                                   Donald Trump     26.0  1.616722   
                                          Kamala Harris    26.0  0.518245   
GQR                                       Donald Trump     30.0  0.111951   

#### Time Series of Pollster House Effects in State

In [81]:
import altair as alt

# Define selection for highlighting specific pollster
highlight = alt.selection_multi(fields=['pollster'], bind='legend')

# Manually map shapes for the candidates ('circle' for Kamala Harris and 'square' for Donald Trump)
shape_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['circle', 'square'])

# Scatter plot for house effect data with shape highlight based on candidate_name (restricted to circle and square)
scatter = alt.Chart(he_agg).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('house_effect:Q', title='House Effect'),
    color=alt.condition(highlight, 'pollster:N', alt.value('lightgray'), legend=alt.Legend(title="Pollster")),  # Color for pollster with legend
    shape=alt.condition(
        highlight, 
        alt.Shape('candidate_name:N', scale=shape_scale),  # Shape encoding based on candidate
        alt.value('circle')  # Default shape for non-selected points
    ),
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3)),  # Reduce opacity for non-selected points
    tooltip=['pollster', 'candidate_name', 'end_date', 'house_effect']
).properties(
    width=800,
    height=600
).add_selection(
    highlight
)

# LOESS smoothing for each pollster (groupby pollster)
loess = alt.Chart(he_agg).transform_loess(
    'end_date', 'house_effect', groupby=['pollster']
).mark_line(size=3).encode(
    x='end_date:T',
    y='house_effect:Q',
    color=alt.condition(highlight, 'pollster:N', alt.value('lightgray')),  # Color highlighting for LOESS curve
    size=alt.condition(highlight, alt.value(3), alt.value(1)),  # Thicker line for selected pollsters
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3))  # Reduce opacity for non-selected lines
)

# Layering the scatter plot and LOESS smoothing curve
final_chart = alt.layer(scatter, loess).properties(
    title="House Effect Data with LOESS Smoothing (Highlight Pollster)"
)

# Display the final chart
final_chart.display()


  highlight = alt.selection_multi(fields=['pollster'], bind='legend')
  ).add_selection(


In [82]:
import altair as alt

# Group by 'end_date' and 'pollster' and calculate the mean of 'house_effect'
grouped_data = he_agg.groupby(['end_date', 'pollster']).agg(
    avg_house_effect=('house_effect', 'mean')
).reset_index()

# Define selection for pollster highlighting
highlight = alt.selection_multi(fields=['pollster'], bind='legend')

# Create a scatter plot for the average house effect, color-coded by 'pollster'
scatter = alt.Chart(grouped_data).mark_circle(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('avg_house_effect:Q', title='Average House Effect'),
    color=alt.condition(
        highlight,  # Highlight selected pollsters
        'pollster:N',  # Color based on pollster when selected
        alt.value('lightgray')  # Gray out when not selected
    ),
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3)),  # Opacity for non-selected points
    tooltip=['end_date:T', 'avg_house_effect:Q', 'pollster:N']
).properties(
    width=600,
    height=400,
    title="Average House Effect by End Date, Color-coded by Pollster"
).add_selection(
    highlight  # Add the selection to the chart
)

# LOESS smoothing for each pollster (groupby pollster)
loess = alt.Chart(grouped_data).transform_loess(
    'end_date', 'avg_house_effect', groupby=['pollster']
).mark_line().encode(
    x='end_date:T',
    y='avg_house_effect:Q',
    color=alt.condition(
        highlight,  # Highlight LOESS lines for selected pollsters
        'pollster:N',  # Color based on pollster when selected
        alt.value('lightgray')  # Gray out when not selected
    ),
    size=alt.condition(highlight, alt.value(2), alt.value(1)),  # Thicker line for selected pollsters
    opacity=alt.condition(highlight, alt.value(1), alt.value(0.3))  # Opacity for non-selected lines
)

# Layering the scatter plot and LOESS smoothing lines
final_chart = alt.layer(scatter, loess)

# Display the final chart
final_chart.display()


  highlight = alt.selection_multi(fields=['pollster'], bind='legend')
  ).add_selection(


#### Looking at the Distributions of Pollster House Effects in State

In [83]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Let's compute battleground state Michigan

In [90]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Michigan"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.3].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Michigan")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Michigan


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-09,16661,Kamala Harris,48.242627,0.05,19
1,2024-10-09,16651,Donald Trump,47.307905,0.05,19
2,2024-10-08,16661,Kamala Harris,48.330988,0.05,19
3,2024-10-08,16651,Donald Trump,47.220375,0.05,19
4,2024-10-07,16661,Kamala Harris,48.216415,0.05,19
5,2024-10-07,16651,Donald Trump,46.688567,0.05,19
6,2024-10-04,16661,Kamala Harris,48.622358,0.05,19
7,2024-10-04,16651,Donald Trump,46.589129,0.05,19
8,2024-10-02,16661,Kamala Harris,48.693259,0.05,19
9,2024-10-02,16651,Donald Trump,46.691034,0.05,19


In [91]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print('Michigan')
print(df_aggregated.head(10))
# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Michigan)"
)

# Display the final chart
final_chart.display()


Michigan
     end_date candidate_name  weighted_average_pct
61 2024-10-09  Kamala Harris             48.063170
60 2024-10-09   Donald Trump             47.551865
59 2024-10-08  Kamala Harris             48.157241
58 2024-10-08   Donald Trump             47.480030
57 2024-10-07  Kamala Harris             47.966381
56 2024-10-07   Donald Trump             46.970319
55 2024-10-04  Kamala Harris             48.340403
54 2024-10-04   Donald Trump             46.551482
53 2024-10-02  Kamala Harris             48.577301
52 2024-10-02   Donald Trump             47.037230


#### Boxplots of House Effects distributions by Pollster and Candidate

In [92]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Wisconsin

In [93]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Wisconsin"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Pennsylvania")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Pennsylvania


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-09,16661,Kamala Harris,49.062242,0.05,19
1,2024-10-09,16651,Donald Trump,47.787265,0.05,19
2,2024-10-08,16661,Kamala Harris,49.055935,0.05,19
3,2024-10-08,16651,Donald Trump,47.758475,0.05,19
4,2024-10-07,16661,Kamala Harris,49.028568,0.05,19
5,2024-10-07,16651,Donald Trump,47.53324,0.05,19
6,2024-10-02,16661,Kamala Harris,49.209613,0.05,19
7,2024-10-02,16651,Donald Trump,47.428474,0.05,19
8,2024-09-26,16661,Kamala Harris,49.280845,0.05,19
9,2024-09-26,16651,Donald Trump,47.431226,0.05,19


In [95]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print("Wisconsin")
print(df_aggregated.head(10))

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Wisconsin)"
)

# Display the final chart
final_chart.display()


Wisconsin
     end_date candidate_name  weighted_average_pct
61 2024-10-09  Kamala Harris             48.568627
60 2024-10-09   Donald Trump             47.887625
59 2024-10-08  Kamala Harris             48.562086
58 2024-10-08   Donald Trump             47.828807
57 2024-10-07  Kamala Harris             48.876610
56 2024-10-07   Donald Trump             47.536620
55 2024-10-02  Kamala Harris             49.120686
54 2024-10-02   Donald Trump             47.512097
53 2024-09-26  Kamala Harris             49.306256
52 2024-09-26   Donald Trump             47.642080


#### Boxplots of House Effects distributions by Pollster and Candidate

In [96]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground North Carolina

In [102]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="North Carolina"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("North Carolina")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


North Carolina


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-08,16661,Kamala Harris,48.209954,0.05,19
1,2024-10-08,16651,Donald Trump,48.685885,0.05,19
2,2024-10-02,16661,Kamala Harris,47.898189,0.05,19
3,2024-10-02,16651,Donald Trump,48.485176,0.05,19
4,2024-09-30,16661,Kamala Harris,47.917434,0.05,19
5,2024-09-30,16651,Donald Trump,48.54338,0.05,19
6,2024-09-29,16661,Kamala Harris,47.875594,0.05,19
7,2024-09-29,16651,Donald Trump,48.497825,0.05,19
8,2024-09-28,16661,Kamala Harris,47.878804,0.05,19
9,2024-09-28,16651,Donald Trump,48.405932,0.05,19


In [103]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print("North Carolina")
print(df_aggregated.head(10))

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (North Carolina)"
)

# Display the final chart
final_chart.display()


North Carolina
     end_date candidate_name  weighted_average_pct
57 2024-10-08  Kamala Harris             47.962935
56 2024-10-08   Donald Trump             48.694312
55 2024-10-02  Kamala Harris             48.127131
54 2024-10-02   Donald Trump             48.617566
53 2024-09-30  Kamala Harris             48.228481
52 2024-09-30   Donald Trump             48.699995
51 2024-09-29  Kamala Harris             48.054777
50 2024-09-29   Donald Trump             48.629799
49 2024-09-28  Kamala Harris             48.069563
48 2024-09-28   Donald Trump             48.534682


#### Boxplots of House Effects distributions by Pollster and Candidate

In [104]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Georgia

In [105]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Georgia"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Georgia")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Georgia


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,47.84232,0.05,19
1,2024-10-10,16651,Donald Trump,48.876712,0.05,19
2,2024-10-09,16661,Kamala Harris,47.618732,0.05,19
3,2024-10-09,16651,Donald Trump,48.803746,0.05,19
4,2024-10-08,16661,Kamala Harris,47.659686,0.05,19
5,2024-10-08,16651,Donald Trump,48.858757,0.05,19
6,2024-10-02,16661,Kamala Harris,47.601204,0.05,19
7,2024-10-02,16651,Donald Trump,48.6164,0.05,19
8,2024-09-30,16661,Kamala Harris,47.603122,0.05,19
9,2024-09-30,16651,Donald Trump,48.701483,0.05,19


In [106]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print("Georgia")
print(df_aggregated.head(10))

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (North Carolina)"
)

# Display the final chart
final_chart.display()


Georgia
     end_date candidate_name  weighted_average_pct
45 2024-10-10  Kamala Harris             47.870892
44 2024-10-10   Donald Trump             48.885911
43 2024-10-09  Kamala Harris             47.828596
42 2024-10-09   Donald Trump             48.742238
41 2024-10-08  Kamala Harris             47.933754
40 2024-10-08   Donald Trump             48.561089
39 2024-10-02  Kamala Harris             47.856193
38 2024-10-02   Donald Trump             48.798915
37 2024-09-30  Kamala Harris             47.830404
36 2024-09-30   Donald Trump             48.936774


#### Boxplots of House Effects distributions by Pollster and Candidate

In [107]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Florida

In [108]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Florida"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Florida")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Florida


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-07,16661,Kamala Harris,44.323807,0.05,19
1,2024-10-07,16651,Donald Trump,50.30315,0.05,19
2,2024-10-06,16661,Kamala Harris,44.251534,0.05,19
3,2024-10-06,16651,Donald Trump,50.120764,0.05,19
4,2024-10-04,16661,Kamala Harris,45.499811,0.05,19
5,2024-10-04,16651,Donald Trump,48.940639,0.05,19
6,2024-10-02,16661,Kamala Harris,45.995776,0.05,19
7,2024-10-02,16651,Donald Trump,48.898709,0.05,19
8,2024-09-27,16661,Kamala Harris,45.88446,0.05,19
9,2024-09-27,16651,Donald Trump,49.136027,0.05,19


In [109]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print("Florida")
print(df_aggregated.head(10))

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (North Carolina)"
)

# Display the final chart
final_chart.display()


Florida
     end_date candidate_name  weighted_average_pct
31 2024-10-07  Kamala Harris             43.796659
30 2024-10-07   Donald Trump             50.774010
29 2024-10-06  Kamala Harris             43.355465
28 2024-10-06   Donald Trump             50.128030
27 2024-10-04  Kamala Harris             45.516456
26 2024-10-04   Donald Trump             49.164203
25 2024-10-02  Kamala Harris             46.043621
24 2024-10-02   Donald Trump             48.768895
23 2024-09-27  Kamala Harris             45.920537
22 2024-09-27   Donald Trump             49.125235


#### Boxplots of House Effects distributions by Pollster and Candidate

In [110]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Arizona

In [111]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Arizona"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Arizona")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Arizona


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-10,16661,Kamala Harris,47.321721,0.05,19
1,2024-10-10,16651,Donald Trump,48.788983,0.05,19
2,2024-10-08,16661,Kamala Harris,47.156819,0.05,19
3,2024-10-08,16651,Donald Trump,48.773512,0.05,19
4,2024-10-02,16661,Kamala Harris,47.197815,0.05,19
5,2024-10-02,16651,Donald Trump,48.640533,0.05,19
6,2024-10-01,16661,Kamala Harris,47.268271,0.05,19
7,2024-10-01,16651,Donald Trump,48.586533,0.05,19
8,2024-09-30,16661,Kamala Harris,47.244623,0.05,19
9,2024-09-30,16651,Donald Trump,48.496847,0.05,19


In [112]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print("Arizona")
print(df_aggregated.head(10))

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Arizona)"
)

# Display the final chart
final_chart.display()


Arizona
     end_date candidate_name  weighted_average_pct
49 2024-10-10  Kamala Harris             47.129175
48 2024-10-10   Donald Trump             49.246225
47 2024-10-08  Kamala Harris             47.257239
46 2024-10-08   Donald Trump             49.027893
45 2024-10-02  Kamala Harris             47.337883
44 2024-10-02   Donald Trump             48.715798
43 2024-10-01  Kamala Harris             47.433436
42 2024-10-01   Donald Trump             48.650742
41 2024-09-30  Kamala Harris             47.410608
40 2024-09-30   Donald Trump             48.580199


#### Boxplots of House Effects distributions by Pollster and Candidate

In [113]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### Battleground Nevada

In [114]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .05
avg_window = 9
weighted_averages = []
# Initialize lambda and average_window settings
lambda_values = [.05,.15,.35]   # 5 equal subdivisions between 0 and 1
average_windows = [19, 9]

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Nevada"].copy()
df_mi = df_mi[df_mi['weight_score'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['end_date','pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...
for lambda_value in lambda_values:
    for avg_window in average_windows:
        # Iterate through each unique end date
        for current_date in df_mi['end_date'].unique():
            
            # Step 2: Filter the data for polls on or before the current end date
            current_data = df_mi[df_mi['end_date'] <= current_date].copy()
            
            # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
            current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
            
            # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
            current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])
        
            # Check for NaN entries in the 'weight_time' column
            #nan_weight_time = current_data['weight_time'].isna().sum()
            
            # Print the result
            #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")
        
            # Step 2: Check for infinite values in the involved columns
            #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
            #print(f"Number of infinite values:\n{inf_check}")
            
            # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
            for candidate in candidate_ids:
        
                # Step 7: Filter data for the specific candidate
                candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
                # Step 1: Group by 'question_id' and count occurrences
                #question_id_counts = candidate_data['question_id'].value_counts()
                
                # Step 2: Get the 'question_id' values that occur more than 2 times
                #exclude_question_ids = question_id_counts[question_id_counts > 2].index
                
                # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
                #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
                #  We select H2H if avaible and only use non H2H if H2H is not available
        
                # Step 1: Group by 'created_at' and 'question_id' and count occurrences
                grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
                
                # Step 2: Initialize an empty DataFrame to store results
                result_df = pd.DataFrame()
            
                # Step 3: Iterate through each 'created_at' date group
                for date, group in grouped.groupby('created_at'):
                    # Step 4: Filter for question_ids with count == 2
                    question_ids_with_count_gt2 = group[group['count'] > 2]
                    
                    if not question_ids_with_count_gt2.empty:
                        # If question_ids with count 2 exist, include only them
                        selected_question_ids = question_ids_with_count_2['question_id']
                    else:
                        # If no question_ids with count 2 exist, include all question_ids for that date
                        selected_question_ids = group['question_id']
                    
                    # Step 5: Filter the original DataFrame to include only selected question_ids for that date
                    filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
                    
                    # Step 6: Append the filtered DataFrame to the result
                    result_df = pd.concat([result_df, filtered_df])
                candidate_data = result_df.copy()

                c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters
        
                c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
                #print(candidate)
                #print(c_mean)
                #print(c_std)
                #print(candidate_data['pct'].iloc[0])
                #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
                candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
                #print(candidate_data['zscores'])
                #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
                candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
                #lets grab house effect data
                # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
                #Start House effect data computation#
                c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
                c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                                    (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
                
                
                # Convert to a DataFrame for easier manipulation
                c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
                # Check for NaN values in the entire DataFrame
                #nan_check = c_mean_by_pollster_df.isna().sum()
                #print(c_mean)
                
                # Display the result
                #print(nan_check)
                # Calculate the house effect for each pollster (difference from overall mean)
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
                # Fill NaN values with 0 in the 'house_effect' column
                #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
                
                c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
                #print(c_mean_by_pollster_df.head())
                # Merge the house effect back into the original data
                candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
                
                candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
                #print(candidate_data.head(10))
                # Apply the house effect to adjust the 'pct' values
                candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
                #start saving house effect data for boxplots
                new_data = pd.DataFrame({
                    'end_date': current_date,
                    'pollster': candidate_data['pollster'],  # Existing pollster data
                    'state': candidate_data['state'],  # Existing state data
                    'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
                    'house_effect': candidate_data['house_effect'],  # Existing house effect data
                    'lambda_value':lambda_value,
                    'avg_window':avg_window
                })
                
                # Append new data to the main DataFrame
                house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
                #end saving house effect data for box plots
                #print(candidate_data['adjusted_pct'].head(10))
                # Step 8: Compute the total weight 'w_i' for each poll
                #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
                candidate_data.loc[:, 'w_i'] = (
                    candidate_data['weight_mode'] * candidate_data['weight_sample'] *
                    candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
                )
                # check for NaN entries in the w_i column
                #nan_weight_i = candidate_data['w_i'].isna().sum()
                #print result
                #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
                
                # Step 3: Identify rows with NaN in w_i to examine their individual values
                #nan_rows = candidate_data[candidate_data['w_i'].isna()]
                #print("Rows producing NaN in 'w_i':")
                #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
                
                # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
                weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
                # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
                if not candidate_data.empty:
                    candidate_name = candidate_data['candidate_name'].iloc[0]
                    weighted_averages.append({
                        'end_date': current_date,
                        'candidate_id': candidate,
                        'candidate_name': candidate_name,
                        'weighted_average_pct': weighted_average_pct,
                        'lambda': lambda_value,
                        'average_windows': avg_window
                    })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
print("Nevada")
df_weighted_averages.head(10)       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Nevada


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct,lambda,average_windows
0,2024-10-08,16661,Kamala Harris,49.275431,0.05,19
1,2024-10-08,16651,Donald Trump,47.565913,0.05,19
2,2024-10-03,16661,Kamala Harris,48.810341,0.05,19
3,2024-10-03,16651,Donald Trump,47.243827,0.05,19
4,2024-10-02,16661,Kamala Harris,48.78976,0.05,19
5,2024-10-02,16651,Donald Trump,47.132469,0.05,19
6,2024-09-30,16661,Kamala Harris,48.769579,0.05,19
7,2024-09-30,16651,Donald Trump,47.110438,0.05,19
8,2024-09-25,16661,Kamala Harris,48.360815,0.05,19
9,2024-09-25,16651,Donald Trump,46.899599,0.05,19


In [115]:
import altair as alt

# Aggregating the average for groupby 'end_date' and 'candidate_name'
df_aggregated = df_weighted_averages.groupby(['end_date', 'candidate_name']).agg({
    'weighted_average_pct': 'mean'
}).reset_index().sort_values(by='end_date',ascending=False)
print("Nevada")
print(df_aggregated.head(10))

# Define the custom color scale for the candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'], range=['blue', 'red'])

# Scatter plot with distinct shapes for each candidate
scatter = alt.Chart(df_aggregated).mark_point(size=60).encode(
    x=alt.X('end_date:T', title='End Date'),
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)', scale=alt.Scale(domain=[41, 52])),  # Restricted vertical axis
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    shape=alt.Shape('candidate_name:N', title='Candidate'),  # Different shapes for each candidate
    tooltip=[alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('weighted_average_pct:Q', title='Weighted Average (%)')]
).properties(
    width=800,
    height=400
)

# LOESS smoothing for each candidate, applied to the aggregated data
loess = alt.Chart(df_aggregated).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name']
).mark_line(size=3).encode(
    x='end_date:T',
    y=alt.Y('weighted_average_pct:Q', title='Weighted Average (%)'),
    color=alt.Color('candidate_name:N', scale=color_scale, title='Candidate'),  # Custom color scale for candidates
    tooltip=[alt.Tooltip('candidate_name:N', title='Candidate'), alt.Tooltip('end_date:T', title='Date'), alt.Tooltip('weighted_average_pct:Q', title='Smoothed Average (%)')]
)

# Layer scatter plot and LOESS smoothing
final_chart = alt.layer(scatter, loess).properties(
    title="Aggregated Weighted Averages and LOESS Curves for Each Candidate (Nevada)"
)

# Display the final chart
final_chart.display()


Nevada
     end_date candidate_name  weighted_average_pct
35 2024-10-08  Kamala Harris             49.000290
34 2024-10-08   Donald Trump             47.923337
33 2024-10-03  Kamala Harris             49.229722
32 2024-10-03   Donald Trump             47.388267
31 2024-10-02  Kamala Harris             49.313638
30 2024-10-02   Donald Trump             47.285671
29 2024-09-30  Kamala Harris             49.334633
28 2024-09-30   Donald Trump             47.136633
27 2024-09-25  Kamala Harris             48.713633
26 2024-09-25   Donald Trump             46.805513


#### Boxplots of House Effects distributions by Pollster and Candidate

In [116]:
# Group by 'pollster' and calculate summary statistics for 'house_effect_data'
he_agg = house_effect_data.groupby(['end_date','pollster','candidate_name']).agg({
    'house_effect': 'mean'
}).reset_index()
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(he_agg).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Minnesota

In [67]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .15
avg_window = 20
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Minnesota"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_gt2 = group[group['count'] > 2]
            
            if not question_ids_with_count_gt2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-09,16661,Kamala Harris,51.107822
1,2024-10-09,16651,Donald Trump,44.391558
2,2024-10-02,16661,Kamala Harris,49.7577
3,2024-10-02,16651,Donald Trump,43.246685
4,2024-09-26,16661,Kamala Harris,49.612351


In [68]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Nevada)',
    width=600,
    height=400
)

chart.show()

In [69]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Minnesota',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


In [70]:
# Define a custom color scale for specific candidates
color_scale = alt.Scale(domain=['Kamala Harris', 'Donald Trump'],  # Candidates
                        range=['blue', 'red'])  # Corresponding colors

# Create a boxplot with Altair, faceted by both pollster and candidate_name
boxplot = alt.Chart(house_effect_data).mark_boxplot().encode(
    x='state:N',  # Categorical data for states on the x-axis
    y='house_effect:Q',  # Quantitative data for house_effect on the y-axis
    color=alt.Color('candidate_name:N', scale=color_scale) # Color by candidate_name
).facet(
    row='candidate_name:N',  # Facet by pollster
    column='pollster:N'  # Facet by candidate_name
).properties(
    title='House Effect Bias Distribution by State, Pollster, and Candidate'
).configure_facet(
    spacing=100  # Adjust space between the facets (columns and rows)
).resolve_scale(
    y='independent'  # Allow each facet to have its own independent y-scale
)

boxplot.show()

#### State of Texas

In [71]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .15
avg_window = 20
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Texas"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 0.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)
#Recording house effect data for visualization reuse
# Step 1: Initialize an empty DataFrame outside the loop
columns = ['pollster', 'state', 'candidate_name', 'house_effect']
house_effect_data = pd.DataFrame(columns=columns)
#end recording house effect data...

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_gt2 = group[group['count'] > 2]
            
            if not question_ids_with_count_gt2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] > 0)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'])#.apply(lambda z: z**2 if abs(z) > 2.5 else (z**1.75 if abs(z) > 2.0 else (z**1.5 if abs(z) > 1.5 else z))))
        #lets grab house effect data
        # Group the data by 'pollster' and calculate the mean of 'pct' for each pollstera
        #Start House effect data computation#
        c_mean2 = candidate_data[(candidate_data['days_past_index'] < avg_window) & (candidate_data['days_past_index'] >= 0)]['pct'].mean() 
        c_mean_by_pollster = candidate_data[(candidate_data['days_past_index'] < avg_window) & 
                                            (candidate_data['days_past_index'] >= 0)].groupby('pollster_id')['pct'].mean()
        
        
        # Convert to a DataFrame for easier manipulation
        c_mean_by_pollster_df = c_mean_by_pollster.reset_index()
        # Check for NaN values in the entire DataFrame
        #nan_check = c_mean_by_pollster_df.isna().sum()
        #print(c_mean)
        
        # Display the result
        #print(nan_check)
        # Calculate the house effect for each pollster (difference from overall mean)
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['pct'] - c_mean2
        # Fill NaN values with 0 in the 'house_effect' column
        #c_mean_by_pollster_df['house_effect'].fillna(0, inplace=True)
        
        c_mean_by_pollster_df['house_effect'] = c_mean_by_pollster_df['house_effect'].fillna(0)
        #print(c_mean_by_pollster_df.head())
        # Merge the house effect back into the original data
        candidate_data = candidate_data.merge(c_mean_by_pollster_df[['pollster_id', 'house_effect']], on='pollster_id', how='left')
        
        candidate_data['house_effect']=candidate_data['house_effect'].fillna(0)
        #print(candidate_data.head(10))
        # Apply the house effect to adjust the 'pct' values
        candidate_data['adjusted_pct'] = candidate_data['pct'] - candidate_data['house_effect']
        #start saving house effect data for boxplots
        new_data = pd.DataFrame({
            'pollster': candidate_data['pollster'],  # Existing pollster data
            'state': candidate_data['state'],  # Existing state data
            'candidate_name': candidate_data['candidate_name'],  # Adding candidate_name
            'house_effect': candidate_data['house_effect']  # Existing house effect data
        })
        
        # Append new data to the main DataFrame
        house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)
        #end saving house effect data for box plots
        #print(candidate_data['adjusted_pct'].head(10))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['adjusted_pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


  house_effect_data = pd.concat([house_effect_data, new_data], ignore_index=True)


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-10,16661,Kamala Harris,45.317636
1,2024-10-10,16651,Donald Trump,51.586898
2,2024-10-07,16661,Kamala Harris,45.236253
3,2024-10-07,16651,Donald Trump,51.312156
4,2024-10-06,16661,Kamala Harris,45.310371


In [72]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Texas)',
    width=600,
    height=400
)

chart.show()

In [73]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable) for Texas',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()
