In [249]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np

In [250]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('president_polls.csv')

# Display the first few rows of the DataFrame to verify the import
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct
0,88579,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,47.0
1,88579,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,general,False,False,,False,REP,Trump,16651,Donald Trump,49.0
2,88587,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.4
3,88587,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,general,False,False,,False,REP,Trump,16651,Donald Trump,50.6
4,88588,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.3


In [251]:
df_state_na = df[df['state'].isna()]

df_state_not_na = df[df['state'].notna()]

print(df_state_na.head())
print(df_state_not_na.columns)
print(df_state_not_na.head())

   poll_id  pollster_id  pollster sponsor_ids   sponsors display_name  \
2    88587         1741  ActiVote         NaN        NaN     ActiVote   
3    88587         1741  ActiVote         NaN        NaN     ActiVote   
6    88590          568    YouGov         352  Economist       YouGov   
7    88590          568    YouGov         352  Economist       YouGov   
8    88590          568    YouGov         352  Economist       YouGov   

   pollster_rating_id pollster_rating_name  numeric_grade  pollscore  ...  \
2                 721             ActiVote            NaN        NaN  ...   
3                 721             ActiVote            NaN        NaN  ...   
6                 391               YouGov            3.0       -1.1  ...   
7                 391               YouGov            3.0       -1.1  ...   
8                 391               YouGov            3.0       -1.1  ...   

     stage  nationwide_batch ranked_choice_reallocated ranked_choice_round  \
2  general          

Let's filter out non scored national pollsters 

In [252]:
df_state_na_clean = df_state_na[df_state_na['numeric_grade'].notna()]
print(df_state_na_clean['numeric_grade'])
print(df_state_na_clean['numeric_grade'].max())

6        3.0
7        3.0
8        3.0
9        3.0
10       3.0
        ... 
15176    2.8
15177    2.8
15178    2.8
15179    2.8
15180    2.8
Name: numeric_grade, Length: 6862, dtype: float64
3.0


Let's create a weight for numerically graded pollsters

In [253]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()

# Now safely create the 'weight_score' column
df_state_na_clean.loc[:, 'weight_score'] = df_state_na_clean['numeric_grade'] / 3.0


Let's check weight_grade column

In [254]:
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score
6,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,DEM,Harris,16661,Kamala Harris,47.0,1.0
7,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,REP,Trump,16651,Donald Trump,44.0,1.0
8,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,GRE,Stein,31116,Jill Stein,1.0,1.0
9,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,IND,West,31097,Cornel West,0.0,1.0
10,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0,1.0


Let's go ahead and examine unique methodologies

In [255]:
print(df_state_na_clean['methodology'].unique())

['Online Panel' 'Probability Panel' 'Live Phone' nan
 'IVR/Online Panel/Text-to-Web' 'Live Phone/Online Panel/Text-to-Web'
 'Live Phone/Text-to-Web' 'Live Phone/Online Panel/App Panel'
 'Live Phone/Online Panel/Text' 'Live Phone/Probability Panel' 'IVR'
 'Online Panel/Text-to-Web' 'Text-to-Web/Online Ad' 'Online Ad'
 'Live Phone/Online Panel' 'Live Phone/Text-to-Web/Online Ad'
 'IVR/Text-to-Web' 'Live Phone/Text/Online Panel' 'IVR/Online Panel'
 'Text' 'Online Panel/Online Ad' 'IVR/Online Panel/Email'
 'IVR/Live Phone/Text/Online Panel/Email' 'Live Phone/Text/Online Ad'
 'IVR/Text' 'Online Panel/Text-to-Web/Text'
 'Live Phone/Text-to-Web/App Panel' 'Online Panel/Probability Panel'
 'App Panel' 'IVR/Online Panel/Text-to-Web/Email']


mapping different methodologies to weight_mode

In [256]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.82,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_na_clean.loc[:,'weight_mode'] = df_state_na_clean['methodology'].map(mode_weights)

Let's check out the 'weight_mode' column

In [257]:
print(df_state_na_clean.head())

    poll_id  pollster_id pollster sponsor_ids   sponsors display_name  \
6     88590          568   YouGov         352  Economist       YouGov   
7     88590          568   YouGov         352  Economist       YouGov   
8     88590          568   YouGov         352  Economist       YouGov   
9     88590          568   YouGov         352  Economist       YouGov   
10    88590          568   YouGov         352  Economist       YouGov   

    pollster_rating_id pollster_rating_name  numeric_grade  pollscore  ...  \
6                  391               YouGov            3.0       -1.1  ...   
7                  391               YouGov            3.0       -1.1  ...   
8                  391               YouGov            3.0       -1.1  ...   
9                  391               YouGov            3.0       -1.1  ...   
10                 391               YouGov            3.0       -1.1  ...   

   ranked_choice_reallocated  ranked_choice_round hypothetical party  answer  \
6           

Let's create a weight for sample size, but first let's look for NaN in sample_size column

In [258]:
# Count the number of NaN values in the 'sample_size' column
nan_count = df['sample_size'].isna().sum()

print(f"Number of NaN values in 'sample_size': {nan_count}")

# Calculate the mean of the available (non-NaN) sample sizes
mean_sample_size = df['sample_size'].mean()

print(f"Mean of available sample sizes: {mean_sample_size}")


Number of NaN values in 'sample_size': 132
Mean of available sample sizes: 1617.3308750249153


In [259]:
import numpy as np

In [260]:
# Step 2: Create the 'weight_sample' column
df_state_na_clean['weight_sample'] = df_state_na_clean['sample_size'].apply(lambda x: np.sqrt(x) if not np.isnan(x) else np.sqrt(mean_sample_size))

# Display the first few rows to verify
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
6,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,47.0,1.0,0.5,37.603191
7,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,REP,Trump,16651,Donald Trump,44.0,1.0,0.5,37.603191
8,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,GRE,Stein,31116,Jill Stein,1.0,1.0,0.5,37.603191
9,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,IND,West,31097,Cornel West,0.0,1.0,0.5,37.603191
10,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,49.0,1.0,0.5,35.071356


Sort end_date values in descending order

In [261]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_na_clean['end_date'] = pd.to_datetime(df_state_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_na_clean_sorted = df_state_na_clean.sort_values(by='end_date',ascending=False)

In [262]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
6,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,DEM,Harris,16661,Kamala Harris,47.0,1.0,0.5,37.603191
25,88558,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,False,REP,Trump,16651,Donald Trump,43.0,0.933333,0.65,32.802439
7,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,,False,REP,Trump,16651,Donald Trump,44.0,1.0,0.5,37.603191
33,88558,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,False,REP,Trump,16651,Donald Trump,49.0,0.933333,0.65,32.802439
32,88558,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,False,DEM,Harris,16661,Kamala Harris,51.0,0.933333,0.65,32.802439


Let's create a 'days_past_index' that can be used for weight_time_decay value for moving average

In [263]:
# Step 3: Get the first (top) date after sorting
first_date = df_state_na_clean_sorted['end_date'].iloc[0]

# Step 4: Compute the difference in days and create the 'days_past_index' column
df_state_na_clean_sorted['days_past_index'] = (first_date - df_state_na_clean_sorted['end_date']).dt.days

In [264]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
6,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,DEM,Harris,16661,Kamala Harris,47.0,1.0,0.5,37.603191,0
25,88558,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,False,REP,Trump,16651,Donald Trump,43.0,0.933333,0.65,32.802439,0
7,88590,568,YouGov,352,Economist,YouGov,391,YouGov,3.0,-1.1,...,False,REP,Trump,16651,Donald Trump,44.0,1.0,0.5,37.603191,0
33,88558,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,False,REP,Trump,16651,Donald Trump,49.0,0.933333,0.65,32.802439,0
32,88558,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,False,DEM,Harris,16661,Kamala Harris,51.0,0.933333,0.65,32.802439,0


In [265]:
df_state_na_clean_sorted.tail()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
15176,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Cruz,16641,Ted Cruz,24.0,0.933333,0.65,33.24154,1270
15177,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,41.0,0.933333,0.65,33.256578,1270
15178,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,DeSantis,16646,Ron DeSantis,25.0,0.933333,0.65,33.256578,1270
15179,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,DEM,Biden,19368,Joe Biden,44.0,0.933333,0.65,33.27161,1270
15180,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,True,REP,Haley,16640,Nikki Haley,19.0,0.933333,0.65,33.27161,1270


In [266]:
# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()

# Check for NaN values in 'weight_mode', 'weight_sample', and 'weight_score'
nan_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isna().sum()

# Step 2: Check for infinite values in the involved columns
inf_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isin([np.inf, -np.inf]).sum()
print(f"Number of infinite values:\n{inf_check}")


# Print the result to verify if there are any NaN values
print(nan_check)

# Step 1: Filter rows where 'weight_mode' is NaN
nan_weight_mode = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['weight_mode'].isna()]

# Step 2: Display the 'methodology' or other relevant columns to investigate the methodology used
# For example, we'll check 'pollster', 'sponsors', and 'methodology' (if available) along with 'weight_mode'
nan_weight_mode_info = nan_weight_mode[['pollster', 'sponsors', 'methodology', 'weight_mode']]

# Print the resulting DataFrame for verification
print(nan_weight_mode_info)

Number of infinite values:
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
Empty DataFrame
Columns: [pollster, sponsors, methodology, weight_mode]
Index: []


Let's compute a weight_i where weight_i is for a given 'end_date' the determined index weight which determines for a given end_date the weight assigned to a row used in computing the point average_i. Note: we will use the sorted dates to filter dates that only on the date or before, then we will compute the 'weight_time' using the formula: exp(-lambdat) where lambda = 1.0 and t = days elapsed since the beginning of the poll end date.
finally we can compute the weight_i = 'weight_mode''weight_sample'*weight_score'*weight_time'

In [267]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 1.0
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_lv = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['population']=='lv'].copy()

# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-07,16661,Kamala Harris,49.142905
1,2024-10-07,16651,Donald Trump,45.584557
2,2024-10-06,16661,Kamala Harris,49.07447
3,2024-10-06,16651,Donald Trump,45.634819
4,2024-10-05,16661,Kamala Harris,49.937005


Let's view the plot with weight average:

In [268]:
# Create the line chart with points
# Create a customized color encoding for Trump and Harris
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Create the line chart with points, assigning specific colors to each candidate
chart = alt.Chart(df_weighted_averages).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate_name:N', scale=color_scale),  # Custom color scale for candidates
    tooltip=['end_date', 'candidate_name', 'weighted_average_pct']  # Add tooltips to show details
).properties(
    title='Weighted Average Polling Results Over Time',
    width=600,
    height=400
)

chart.show()


In [269]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q']  # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

Let's look at the loess smoothed curve of this

In [270]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


Let's look at A/B graded pollsters

In [271]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 0.23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_ab_pollsters = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['numeric_grade'] >= 2.4].copy()
df_lv = df_ab_pollsters[df_ab_pollsters['population'] == 'lv'].copy()
# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-07,16661,Kamala Harris,48.757456
1,2024-10-07,16651,Donald Trump,46.000878
2,2024-10-06,16661,Kamala Harris,48.688203
3,2024-10-06,16651,Donald Trump,46.017391
4,2024-10-04,16661,Kamala Harris,48.798699


In [272]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

In [273]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


### Moving on to Battleground state data

Let's clean 'numeric_grade' rows for df_state_not_na so that there aren't ungraded pollsters in our list

In [274]:
#df_state_not_na_clean = df_state_not_na[df_state_not_na['numeric_grade'].notna()]

In [275]:
na_num = df_state_not_na_clean['numeric_grade'].isna().sum()
print(na_num)

0


Let's normalize 'numeric_grade' to create a 'weight_score'

In [276]:
# Make a copy to avoid the warning
df_state_not_na = df_state_not_na.copy()

# Step 1: Replace NaN values in 'numeric_grade' with a low grade (e.g., 1.0)
df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)

df_state_not_na_clean = df_state_not_na.copy()
# Step 2: Safely create the 'weight_score' column by dividing 'numeric_grade' by 3.0
df_state_not_na_clean.loc[:, 'weight_score'] = df_state_not_na_clean['numeric_grade'] / 3.0

# Verify the updated DataFrame
print(df_state_not_na_clean[['numeric_grade', 'weight_score']].head())

    numeric_grade  weight_score
0             2.0      0.666667
1             2.0      0.666667
4             0.1      0.033333
5             0.1      0.033333
14            0.1      0.033333


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)


In [277]:
na_num = df_state_not_na_clean['weight_score'].isna().sum()
print(na_num)

0


#### Creating 'weight_mode' weight for methodology

In [278]:
import numpy as np
# Make a copy to avoid the warning
df_state_not_na_clean = df_state_not_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    "Text-to-Web/Email": 0.75,
    "IVR/Live Phone/Text-to-Web": 0.76,
    "IVR/Live Phone/Online Panel": 0.78,
    "IVR/Live Phone/Online Panel/Text-to-Web": 0.77,
    "Live Phone/Text-to-Web/Email/Mail-to-Web": 0.76,
    "Live Phone/Text-to-Web/Email": 0.78,
    "Email/Online Ad": 0.73,
    "Online Panel/Email": 0.78,
    "Live Phone/Online Panel/Mail-to-Web": 0.78,
    "IVR/Text-to-Web/Email": 0.72,
    'Email':0.8,
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.85,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'Live Phone/Text-to-Web/Email/Mail-to-Web/Mail-to-Phone':0.76,
    'Live Phone/Email':0.82,
    'Live Phone/Online Panel/Text-to-Web/Text':0.8,
    'Live Phone/Text':0.83,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Live Phone/Text':0.78,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Mail-to-Web/Mail-to-Phone': 0.7,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Email/Text-to-Web':0.77,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Text':0.78,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text-to-Web':0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_not_na_clean.loc[:,'weight_mode'] = df_state_not_na_clean['methodology'].map(mode_weights)

In [279]:
num_na = df_state_not_na_clean['weight_mode'].isna().sum()
print(num_na)
df_ret = df_state_not_na_clean[df_state_not_na_clean['weight_mode'].isna()][['methodology']]
print(df_ret['methodology'].unique())

0
[]


#### Create a Weight_Sample weight for sample size

In [280]:
# Step 2: Create the 'weight_sample' column
df_state_not_na_clean['weight_sample'] = df_state_not_na_clean['sample_size'].apply(lambda x: np.sqrt(x/600) if not np.isnan(x) else np.sqrt(mean_sample_size/600))

# Display the first few rows to verify
df_state_not_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88579,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,DEM,Harris,16661,Kamala Harris,47.0,0.666667,0.4,1.154701
1,88579,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,REP,Trump,16651,Donald Trump,49.0,0.666667,0.4,1.154701
4,88588,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,49.3,0.033333,0.5,0.816497
5,88588,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,50.7,0.033333,0.5,0.816497
14,88552,1890,SoCal Strategies,21522170.0,On Point Politics | Red Eagle Politics,SoCal Strategies,851,SoCal Research,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,48.7,0.033333,0.5,1.106797


In [281]:
num_na = df_state_not_na_clean['weight_sample'].isna().sum()
print(num_na)

0


####  Converting 'end_date' to datetime format and sorting dataframe by end_date

In [282]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_not_na_clean['end_date'] = pd.to_datetime(df_state_not_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_not_na_clean_sorted = df_state_not_na_clean.sort_values(by='end_date',ascending=False)

In [283]:
df_state_not_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88579,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,DEM,Harris,16661,Kamala Harris,47.0,0.666667,0.4,1.154701
4,88588,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,DEM,Harris,16661,Kamala Harris,49.3,0.033333,0.5,0.816497
5,88588,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,,False,REP,Trump,16651,Donald Trump,50.7,0.033333,0.5,0.816497
1,88579,235,InsiderAdvantage,,,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,,False,REP,Trump,16651,Donald Trump,49.0,0.666667,0.4,1.154701
50,88593,396,Quinnipiac,,,Quinnipiac University,267,Quinnipiac University,2.8,-0.5,...,,False,GRE,Stein,31116,Jill Stein,1.0,0.933333,1.0,1.337286


#### Let's compute battleground state PA

In [376]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Pennsylvania"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**2 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-08,16661,Kamala Harris,48.818413
1,2024-10-08,16651,Donald Trump,47.136318
2,2024-10-07,16661,Kamala Harris,48.856769
3,2024-10-07,16651,Donald Trump,47.078376
4,2024-10-02,16661,Kamala Harris,48.600595


In [377]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Pennsylvania)',
    width=600,
    height=400
)

chart.show()

In [378]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Let's compute battleground state Michigan

In [382]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Michigan"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-07,16661,Kamala Harris,47.598113
1,2024-10-07,16651,Donald Trump,47.138952
2,2024-10-04,16661,Kamala Harris,47.886203
3,2024-10-04,16651,Donald Trump,46.630802
4,2024-10-02,16661,Kamala Harris,48.029212


In [383]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Michigan)',
    width=600,
    height=400
)

chart.show()

In [384]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


####  Battleground Wisconsin

In [385]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Wisconsin"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-07,16661,Kamala Harris,48.3877
1,2024-10-07,16651,Donald Trump,47.61974
2,2024-10-06,16661,Kamala Harris,48.903384
3,2024-10-06,16651,Donald Trump,47.010825
4,2024-10-02,16661,Kamala Harris,49.012025


In [386]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Wisconsin)',
    width=600,
    height=400
)

chart.show()

In [387]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground North Carolina

In [388]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="North Carolina"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-06,16661,Kamala Harris,47.864871
1,2024-10-06,16651,Donald Trump,48.671837
2,2024-10-02,16661,Kamala Harris,47.884394
3,2024-10-02,16651,Donald Trump,48.700589
4,2024-09-30,16661,Kamala Harris,47.931635


In [389]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (North Carolina)',
    width=600,
    height=400
)

chart.show()

In [390]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground Georgia

In [391]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Georgia"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-02,16661,Kamala Harris,47.329204
1,2024-10-02,16651,Donald Trump,48.575419
2,2024-09-30,16661,Kamala Harris,47.585389
3,2024-09-30,16651,Donald Trump,48.759181
4,2024-09-29,16661,Kamala Harris,47.524006


In [392]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Georgia)',
    width=600,
    height=400
)

chart.show()

In [393]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground Florida

In [394]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Florida"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-06,16661,Kamala Harris,45.53615
1,2024-10-06,16651,Donald Trump,50.272593
2,2024-10-02,16661,Kamala Harris,45.513317
3,2024-10-02,16651,Donald Trump,49.498484
4,2024-09-27,16661,Kamala Harris,46.031393


In [395]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Florida)',
    width=600,
    height=400
)

chart.show()

#### Battleground Arizona

In [396]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Arizona"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-08,16661,Kamala Harris,47.57871
1,2024-10-08,16651,Donald Trump,48.757065
2,2024-10-07,16661,Kamala Harris,47.539626
3,2024-10-07,16651,Donald Trump,48.763386
4,2024-10-02,16661,Kamala Harris,47.549381


In [397]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Arizona)',
    width=600,
    height=400
)

chart.show()

In [398]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground Nevada

In [399]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Nevada"].copy()
#df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data
#print(df_mi)

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        # Step 1: Group by 'question_id' and count occurrences
        #question_id_counts = candidate_data['question_id'].value_counts()
        
        # Step 2: Get the 'question_id' values that occur more than 2 times
        #exclude_question_ids = question_id_counts[question_id_counts > 2].index
        
        # Step 3: Filter the DataFrame to exclude 'question_id' values with more than 2 occurrences
        #candidate_data = candidate_data[~candidate_data['question_id'].isin(exclude_question_ids)].copy()
        #  We select H2H if avaible and only use non H2H if H2H is not available

        # Step 1: Group by 'created_at' and 'question_id' and count occurrences
        grouped = candidate_data.groupby(['created_at', 'question_id']).size().reset_index(name='count')
        
        # Step 2: Initialize an empty DataFrame to store results
        result_df = pd.DataFrame()
    
        # Step 3: Iterate through each 'created_at' date group
        for date, group in grouped.groupby('created_at'):
            # Step 4: Filter for question_ids with count == 2
            question_ids_with_count_2 = group[group['count'] == 2]
            
            if not question_ids_with_count_2.empty:
                # If question_ids with count 2 exist, include only them
                selected_question_ids = question_ids_with_count_2['question_id']
            else:
                # If no question_ids with count 2 exist, include all question_ids for that date
                selected_question_ids = group['question_id']
            
            # Step 5: Filter the original DataFrame to include only selected question_ids for that date
            filtered_df = candidate_data[(candidate_data['created_at'] == date) & (candidate_data['question_id'].isin(selected_question_ids))]
            
            # Step 6: Append the filtered DataFrame to the result
            result_df = pd.concat([result_df, filtered_df])
        candidate_data = result_df.copy()

        c_mean = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].mean() #gather mean for past 30 days of likely voters

        c_std = candidate_data[(candidate_data['days_past_index'] < 40) & (candidate_data['days_past_index'] > 1)]['pct'].std() #gather standard deviation for past 30 days of likely voters
        #print(candidate)
        #print(c_mean)
        #print(c_std)
        #print(candidate_data['pct'].iloc[0])
        #print(abs((candidate_data['pct'].iloc[0]-c_mean)/c_std))
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        #print(candidate_data['zscores'])
        #df['z_score'].apply(lambda z: z ** 2 if abs(z) > 1 else z)
        candidate_data['weight_outlier'] = np.exp(-1*candidate_data['zscores'].apply(lambda z: z**3 if abs(z) > 2 else z))
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()       


Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-03,16661,Kamala Harris,48.701325
1,2024-10-03,16651,Donald Trump,47.571742
2,2024-10-02,16661,Kamala Harris,48.613213
3,2024-10-02,16651,Donald Trump,47.388175
4,2024-09-30,16661,Kamala Harris,48.847923


In [400]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Nevada)',
    width=600,
    height=400
)

chart.show()

In [401]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()
