In [10]:
import pandas as pd
import altair as alt
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np

In [3]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('president_polls.csv')

# Display the first few rows of the DataFrame to verify the import
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,election_date,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct
0,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,11/5/24,general,False,False,,DEM,Harris,16661,Kamala Harris,49.0
1,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,11/5/24,general,False,False,,REP,Trump,16651,Donald Trump,45.0
2,88525,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,11/5/24,general,False,False,,DEM,Harris,16661,Kamala Harris,47.0
3,88525,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,11/5/24,general,False,False,,REP,Trump,16651,Donald Trump,53.0
4,88524,1741,ActiVote,,,ActiVote,721,ActiVote,,,...,11/5/24,general,False,False,,DEM,Harris,16661,Kamala Harris,50.8


In [4]:
df_state_na = df[df['state'].isna()]

df_state_not_na = df[df['state'].notna()]

print(df_state_na.head())
print(df_state_not_na.columns)
print(df_state_not_na.head())

    poll_id  pollster_id      pollster sponsor_ids           sponsors  \
0     88528         1478  Research Co.         NaN                NaN   
1     88528         1478  Research Co.         NaN                NaN   
8     88523          770          TIPP        1742  Issues & Insights   
9     88523          770          TIPP        1742  Issues & Insights   
10    88523          770          TIPP        1742  Issues & Insights   

     display_name  pollster_rating_id pollster_rating_name  numeric_grade  \
0    Research Co.                 449         Research Co.            2.4   
1    Research Co.                 449         Research Co.            2.4   
8   TIPP Insights                 144        TIPP Insights            1.8   
9   TIPP Insights                 144        TIPP Insights            1.8   
10  TIPP Insights                 144        TIPP Insights            1.8   

    pollscore  ... election_date    stage nationwide_batch  \
0        -0.5  ...       11/5/24  ge

Let's filter out non scored national pollsters 

In [5]:
df_state_na_clean = df_state_na[df_state_na['numeric_grade'].notna()]
print(df_state_na_clean['numeric_grade'])
print(df_state_na_clean['numeric_grade'].max())

0        2.4
1        2.4
8        1.8
9        1.8
10       1.8
        ... 
14965    2.8
14966    2.8
14967    2.8
14968    2.8
14969    2.8
Name: numeric_grade, Length: 6797, dtype: float64
3.0


Let's create a weight for numerically graded pollsters

In [6]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()

# Now safely create the 'weight_score' column
df_state_na_clean.loc[:, 'weight_score'] = df_state_na_clean['numeric_grade'] / 3.0


Let's check weight_grade column

In [7]:
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score
0,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,general,False,False,,DEM,Harris,16661,Kamala Harris,49.0,0.8
1,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,general,False,False,,REP,Trump,16651,Donald Trump,45.0,0.8
8,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,DEM,Harris,16661,Kamala Harris,49.0,0.6
9,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,REP,Trump,16651,Donald Trump,46.0,0.6
10,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,DEM,Harris,16661,Kamala Harris,48.0,0.6


Let's go ahead and examine unique methodologies

In [8]:
print(df_state_na_clean['methodology'].unique())

['Online Panel' 'IVR/Online Panel/Text-to-Web'
 'Live Phone/Online Panel/Text-to-Web' nan 'Probability Panel'
 'Live Phone' 'Live Phone/Text-to-Web' 'Live Phone/Online Panel/App Panel'
 'Live Phone/Online Panel/Text' 'Live Phone/Probability Panel' 'IVR'
 'Online Panel/Text-to-Web' 'Text-to-Web/Online Ad' 'Online Ad'
 'Live Phone/Online Panel' 'Live Phone/Text-to-Web/Online Ad'
 'IVR/Text-to-Web' 'Live Phone/Text/Online Panel' 'IVR/Online Panel'
 'Text' 'Online Panel/Online Ad' 'IVR/Online Panel/Email'
 'IVR/Live Phone/Text/Online Panel/Email' 'Live Phone/Text/Online Ad'
 'IVR/Text' 'Online Panel/Text-to-Web/Text'
 'Live Phone/Text-to-Web/App Panel' 'Online Panel/Probability Panel'
 'App Panel' 'IVR/Online Panel/Text-to-Web/Email']


mapping different methodologies to weight_mode

In [11]:
# Make a copy to avoid the warning
df_state_na_clean = df_state_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.82,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_na_clean.loc[:,'weight_mode'] = df_state_na_clean['methodology'].map(mode_weights)

Let's check out the 'weight_mode' column

In [12]:
print(df_state_na_clean.head())

    poll_id  pollster_id      pollster sponsor_ids           sponsors  \
0     88528         1478  Research Co.         NaN                NaN   
1     88528         1478  Research Co.         NaN                NaN   
8     88523          770          TIPP        1742  Issues & Insights   
9     88523          770          TIPP        1742  Issues & Insights   
10    88523          770          TIPP        1742  Issues & Insights   

     display_name  pollster_rating_id pollster_rating_name  numeric_grade  \
0    Research Co.                 449         Research Co.            2.4   
1    Research Co.                 449         Research Co.            2.4   
8   TIPP Insights                 144        TIPP Insights            1.8   
9   TIPP Insights                 144        TIPP Insights            1.8   
10  TIPP Insights                 144        TIPP Insights            1.8   

    pollscore  ... nationwide_batch  ranked_choice_reallocated  \
0        -0.5  ...            Fa

Let's create a weight for sample size, but first let's look for NaN in sample_size column

In [13]:
# Count the number of NaN values in the 'sample_size' column
nan_count = df['sample_size'].isna().sum()

print(f"Number of NaN values in 'sample_size': {nan_count}")

# Calculate the mean of the available (non-NaN) sample sizes
mean_sample_size = df['sample_size'].mean()

print(f"Mean of available sample sizes: {mean_sample_size}")


Number of NaN values in 'sample_size': 132
Mean of available sample sizes: 1618.3398247978437


In [14]:
import numpy as np

In [15]:
# Step 2: Create the 'weight_sample' column
df_state_na_clean['weight_sample'] = df_state_na_clean['sample_size'].apply(lambda x: np.sqrt(x) if not np.isnan(x) else np.sqrt(mean_sample_size))

# Display the first few rows to verify
df_state_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,False,,DEM,Harris,16661,Kamala Harris,49.0,0.8,0.5,31.638584
1,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,False,,REP,Trump,16651,Donald Trump,45.0,0.8,0.5,31.638584
8,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,31.575307
9,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,31.575307
10,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,,DEM,Harris,16661,Kamala Harris,48.0,0.6,0.5,31.575307


Sort end_date values in descending order

In [16]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_na_clean['end_date'] = pd.to_datetime(df_state_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_na_clean_sorted = df_state_na_clean.sort_values(by='end_date',ascending=False)

In [17]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
0,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,False,,DEM,Harris,16661,Kamala Harris,49.0,0.8,0.5,31.638584
1,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,False,,REP,Trump,16651,Donald Trump,45.0,0.8,0.5,31.638584
8,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,31.575307
22,88527,568,YouGov,1213.0,Yahoo News,YouGov,391,YouGov,3.0,-1.1,...,False,,IND,West,31097,Cornel West,1.0,1.0,0.5,32.124757
9,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,False,,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,31.575307


Let's create a 'days_past_index' that can be used for weight_time_decay value for moving average

In [18]:
# Step 3: Get the first (top) date after sorting
first_date = df_state_na_clean_sorted['end_date'].iloc[0]

# Step 4: Compute the difference in days and create the 'days_past_index' column
df_state_na_clean_sorted['days_past_index'] = (first_date - df_state_na_clean_sorted['end_date']).dt.days

In [19]:
df_state_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
0,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,,DEM,Harris,16661,Kamala Harris,49.0,0.8,0.5,31.638584,0
1,88528,1478,Research Co.,,,Research Co.,449,Research Co.,2.4,-0.5,...,,REP,Trump,16651,Donald Trump,45.0,0.8,0.5,31.638584,0
8,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,DEM,Harris,16661,Kamala Harris,49.0,0.6,0.5,31.575307,2
22,88527,568,YouGov,1213.0,Yahoo News,YouGov,391,YouGov,3.0,-1.1,...,,IND,West,31097,Cornel West,1.0,1.0,0.5,32.124757,2
9,88523,770,TIPP,1742.0,Issues & Insights,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,,REP,Trump,16651,Donald Trump,46.0,0.6,0.5,31.575307,2


In [20]:
df_state_na_clean_sorted.tail()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample,days_past_index
14965,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,REP,Cruz,16641,Ted Cruz,24.0,0.933333,0.65,33.24154,1269
14966,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,DEM,Biden,19368,Joe Biden,41.0,0.933333,0.65,33.256578,1269
14967,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,REP,DeSantis,16646,Ron DeSantis,25.0,0.933333,0.65,33.256578,1269
14968,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,DEM,Biden,19368,Joe Biden,44.0,0.933333,0.65,33.27161,1269
14969,74812,241,Ipsos,71,Reuters,Ipsos,154,Ipsos,2.8,-0.9,...,,REP,Haley,16640,Nikki Haley,19.0,0.933333,0.65,33.27161,1269


In [21]:
# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()

# Check for NaN values in 'weight_mode', 'weight_sample', and 'weight_score'
nan_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isna().sum()

# Step 2: Check for infinite values in the involved columns
inf_check = df_state_na_clean_sorted_cutoff[['weight_mode', 'weight_sample', 'weight_score']].isin([np.inf, -np.inf]).sum()
print(f"Number of infinite values:\n{inf_check}")


# Print the result to verify if there are any NaN values
print(nan_check)

# Step 1: Filter rows where 'weight_mode' is NaN
nan_weight_mode = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['weight_mode'].isna()]

# Step 2: Display the 'methodology' or other relevant columns to investigate the methodology used
# For example, we'll check 'pollster', 'sponsors', and 'methodology' (if available) along with 'weight_mode'
nan_weight_mode_info = nan_weight_mode[['pollster', 'sponsors', 'methodology', 'weight_mode']]

# Print the resulting DataFrame for verification
print(nan_weight_mode_info)

Number of infinite values:
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
weight_mode      0
weight_sample    0
weight_score     0
dtype: int64
Empty DataFrame
Columns: [pollster, sponsors, methodology, weight_mode]
Index: []


Let's compute a weight_i where weight_i is for a given 'end_date' the determined index weight which determines for a given end_date the weight assigned to a row used in computing the point average_i. Note: we will use the sorted dates to filter dates that only on the date or before, then we will compute the 'weight_time' using the formula: exp(-lambdat) where lambda = 1.0 and t = days elapsed since the beginning of the poll end date.
finally we can compute the weight_i = 'weight_mode''weight_sample'*weight_score'*weight_time'

In [22]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 1.0
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_lv = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['population']=='lv'].copy()

# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-06,16661,Kamala Harris,48.876474
1,2024-10-06,16651,Donald Trump,45.652614
2,2024-10-04,16661,Kamala Harris,48.626943
3,2024-10-04,16651,Donald Trump,46.430593
4,2024-10-03,16661,Kamala Harris,49.172692


Let's view the plot with weight average:

In [23]:
# Create the line chart with points
# Create a customized color encoding for Trump and Harris
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Create the line chart with points, assigning specific colors to each candidate
chart = alt.Chart(df_weighted_averages).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate_name:N', scale=color_scale),  # Custom color scale for candidates
    tooltip=['end_date', 'candidate_name', 'weighted_average_pct']  # Add tooltips to show details
).properties(
    title='Weighted Average Polling Results Over Time',
    width=600,
    height=400
)

chart.show()


In [24]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q']  # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

Let's look at the loess smoothed curve of this

In [25]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


Let's look at A/B graded pollsters

In [26]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 0.23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-07-25')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_na_clean_sorted_cutoff = df_state_na_clean_sorted[df_state_na_clean_sorted['end_date'] >= cutoff_date].copy()
df_ab_pollsters = df_state_na_clean_sorted_cutoff[df_state_na_clean_sorted_cutoff['numeric_grade'] >= 2.4].copy()
df_lv = df_ab_pollsters[df_ab_pollsters['population'] == 'lv'].copy()
# Iterate through each unique end date
for current_date in df_lv['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_lv[df_lv['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])              
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['weight_score'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-06,16661,Kamala Harris,48.962442
1,2024-10-06,16651,Donald Trump,46.77591
2,2024-10-04,16661,Kamala Harris,48.837904
3,2024-10-04,16651,Donald Trump,47.022421
4,2024-10-03,16661,Kamala Harris,49.336147


In [27]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time',
    width=600,
    height=400
)

chart.show()

In [28]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


### Moving on to Battleground state data

Let's clean 'numeric_grade' rows for df_state_not_na so that there aren't ungraded pollsters in our list

In [29]:
#df_state_not_na_clean = df_state_not_na[df_state_not_na['numeric_grade'].notna()]

In [30]:
na_num = df_state_not_na_clean['numeric_grade'].isna().sum()
print(na_num)

NameError: name 'df_state_not_na_clean' is not defined

Let's normalize 'numeric_grade' to create a 'weight_score'

In [31]:
# Make a copy to avoid the warning
df_state_not_na = df_state_not_na.copy()

# Step 1: Replace NaN values in 'numeric_grade' with a low grade (e.g., 1.0)
df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)

df_state_not_na_clean = df_state_not_na.copy()
# Step 2: Safely create the 'weight_score' column by dividing 'numeric_grade' by 3.0
df_state_not_na_clean.loc[:, 'weight_score'] = df_state_not_na_clean['numeric_grade'] / 3.0

# Verify the updated DataFrame
print(df_state_not_na_clean[['numeric_grade', 'weight_score']].head())

   numeric_grade  weight_score
2            0.1      0.033333
3            0.1      0.033333
4            0.1      0.033333
5            0.1      0.033333
6            0.1      0.033333


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_state_not_na['numeric_grade'].fillna(0.1, inplace=True)


In [32]:
na_num = df_state_not_na_clean['weight_score'].isna().sum()
print(na_num)

0


#### Creating 'weight_mode' weight for methodology

In [33]:
import numpy as np
# Make a copy to avoid the warning
df_state_not_na_clean = df_state_not_na_clean.copy()
# Mapping the weights to modes based on the table above
mode_weights = {
    "Text-to-Web/Email": 0.75,
    "IVR/Live Phone/Text-to-Web": 0.76,
    "IVR/Live Phone/Online Panel": 0.78,
    "IVR/Live Phone/Online Panel/Text-to-Web": 0.77,
    "Live Phone/Text-to-Web/Email/Mail-to-Web": 0.76,
    "Live Phone/Text-to-Web/Email": 0.78,
    "Email/Online Ad": 0.73,
    "Online Panel/Email": 0.78,
    "Live Phone/Online Panel/Mail-to-Web": 0.78,
    "IVR/Text-to-Web/Email": 0.72,
    'Email':0.8,
    'Live Phone': 1.00,
    'Live Phone/Probability Panel': 0.95,
    'Live Phone/Online Panel/Text-to-Web': 0.90,
    'Live Phone/Online Panel/Text': 0.90,
    'Live Phone/Text-to-Web/App Panel': 0.85,
    'Live Phone/Text-to-Web/Online Ad': 0.85,
    'Live Phone/Text-to-Web': 0.85,
    'Live Phone/Text/Online Panel': 0.90,
    'Live Phone/Online Panel': 0.85,
    'Live Phone/Online Panel/App Panel': 0.85,
    'Live Phone/Text-to-Web/Email/Mail-to-Web/Mail-to-Phone':0.76,
    'Live Phone/Email':0.82,
    'Live Phone/Online Panel/Text-to-Web/Text':0.8,
    'Live Phone/Text':0.83,
    'IVR/Live Phone/Text/Online Panel/Email': 0.80,
    'Live Phone/Text/Online Ad': 0.80,
    'IVR/Live Phone/Text':0.78,
    'IVR/Online Panel/Email': 0.77,
    'IVR/Online Panel/Text-to-Web/Email': 0.75,
    'IVR/Online Panel/Text-to-Web': 0.75,
    'IVR/Online Panel': 0.70,
    'IVR': 0.70,
    'Mail-to-Web/Mail-to-Phone': 0.7,
    'Online Panel/Probability Panel': 0.65,
    'Probability Panel': 0.65,
    'Online Panel/Email/Text-to-Web':0.77,
    'Online Panel/Text-to-Web': 0.60,
    'Online Panel/Text':0.78,
    'Online Panel/Online Ad': 0.55,
    'Online Panel': 0.50,
    'Online Ad': 0.50,
    'App Panel': 0.50,
    'Online Panel/Text-to-Web/Text': 0.50,
    'IVR/Text-to-Web': 0.50,
    'Text-to-Web/Online Ad': 0.45,
    'Text-to-Web':0.45,
    'Text': 0.40,
    'IVR/Text': 0.40,
    'nan' : 0.50,
     np.nan: 0.50  # Handling missing or unknown values
}

# Apply the mapping to create a new column 'weight_mode'
df_state_not_na_clean.loc[:,'weight_mode'] = df_state_not_na_clean['methodology'].map(mode_weights)

In [34]:
num_na = df_state_not_na_clean['weight_mode'].isna().sum()
print(num_na)
df_ret = df_state_not_na_clean[df_state_not_na_clean['weight_mode'].isna()][['methodology']]
print(df_ret['methodology'].unique())

0
[]


#### Create a Weight_Sample weight for sample size

In [35]:
# Step 2: Create the 'weight_sample' column
df_state_not_na_clean['weight_sample'] = df_state_not_na_clean['sample_size'].apply(lambda x: np.sqrt(x/600) if not np.isnan(x) else np.sqrt(mean_sample_size/600))

# Display the first few rows to verify
df_state_not_na_clean.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88525,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,DEM,Harris,16661,Kamala Harris,47.0,0.033333,0.5,0.816497
3,88525,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,REP,Trump,16651,Donald Trump,53.0,0.033333,0.5,0.816497
4,88524,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,DEM,Harris,16661,Kamala Harris,50.8,0.033333,0.5,0.816497
5,88524,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,REP,Trump,16651,Donald Trump,49.2,0.033333,0.5,0.816497
6,88518,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,DEM,Harris,16661,Kamala Harris,58.8,0.033333,0.5,0.816497


In [36]:
num_na = df_state_not_na_clean['weight_sample'].isna().sum()
print(num_na)

0


####  Converting 'end_date' to datetime format and sorting dataframe by end_date

In [37]:
# Convert 'end_date' to datetime format with specified format for single/double digits in month/day
df_state_not_na_clean['end_date'] = pd.to_datetime(df_state_not_na_clean['end_date'], format='%m/%d/%y', errors='coerce')

# Sort the DataFrame by 'end_date'
df_state_not_na_clean_sorted = df_state_not_na_clean.sort_values(by='end_date',ascending=False)

In [38]:
df_state_not_na_clean_sorted.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct,weight_score,weight_mode,weight_sample
2,88525,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,DEM,Harris,16661,Kamala Harris,47.0,0.033333,0.5,0.816497
4,88524,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,DEM,Harris,16661,Kamala Harris,50.8,0.033333,0.5,0.816497
5,88524,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,REP,Trump,16651,Donald Trump,49.2,0.033333,0.5,0.816497
3,88525,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,REP,Trump,16651,Donald Trump,53.0,0.033333,0.5,0.816497
6,88518,1741,ActiVote,,,ActiVote,721,ActiVote,0.1,,...,False,,DEM,Harris,16661,Kamala Harris,58.8,0.033333,0.5,0.816497


#### Let's compute battleground state PA

In [87]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = 0.23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
#print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_pa = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Pennsylvania"].copy()
df_pa = df_pa[df_pa['numeric_grade'] >= 2.4].copy() #numeric grade for those rated F or above
df_pa = df_pa[df_pa['population']=='lv'].copy() # retrieve likely voter result set
#print(df_pa.head())

# Iterate through each unique end date
for current_date in df_pa['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_pa[df_pa['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        #outlier detection starts here.  This weights against too much noisy change to disrupt burst outlier pollling
        c_mean = candidate_data[candidate_data['days_past_index']< 14]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 14]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        # outlier detection ends
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])        
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-09-28,16661,Kamala Harris,48.839718
1,2024-09-28,16651,Donald Trump,48.271791
2,2024-09-25,16661,Kamala Harris,48.625535
3,2024-09-25,16651,Donald Trump,48.0062
4,2024-09-24,16661,Kamala Harris,48.896028


In [88]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Pennsylvania)',
    width=600,
    height=400
)

chart.show()

In [89]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Let's compute battleground state Michigan

In [94]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()


df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Michigan"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy() #using data from F rated pollsters and above
df_mi = df_mi[df_mi['population'] == 'lv'].copy() #gather likely voter data

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)  
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean() #gather mean for past 30 days of likely voters
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()  #gather standard deviation for past 30 days of likely voters
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.5*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-09-30,16661,Kamala Harris,47.780916
1,2024-09-30,16651,Donald Trump,47.096524
2,2024-09-26,16661,Kamala Harris,47.939012
3,2024-09-26,16651,Donald Trump,46.76797
4,2024-09-25,16661,Kamala Harris,48.524346


In [95]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Michigan)',
    width=600,
    height=400
)

chart.show()

In [96]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


####  Battleground Wisconsin

In [99]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Wisconsin"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy()
df_mi = df_mi[df_mi['population'] == 'lv'].copy()
print(df_mi.head())

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean()
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



['Florida' 'North Carolina' 'Illinois' 'Virginia' 'Nevada' 'Minnesota'
 'Pennsylvania' 'Michigan' 'Wisconsin' 'New Jersey' 'Arizona'
 'New Hampshire' 'Georgia' 'Nebraska' 'North Dakota' 'Indiana' 'New York'
 'Ohio' 'South Carolina' 'Alaska' 'Iowa' 'Texas' 'Nebraska CD-2'
 'California' 'Maryland' 'Delaware' 'Montana' 'Rhode Island' 'Connecticut'
 'New Mexico' 'Massachusetts' 'Colorado' 'Maine CD-2' 'Maine' 'Maine CD-1'
 'Missouri' 'Washington' 'Arkansas' 'Oklahoma' 'Tennessee' 'West Virginia'
 'Vermont']
     poll_id  pollster_id   pollster sponsor_ids sponsors  \
288    88406         1424  Siena/NYT         NaN      NaN   
287    88406         1424  Siena/NYT         NaN      NaN   
286    88406         1424  Siena/NYT         NaN      NaN   
290    88406         1424  Siena/NYT         NaN      NaN   
289    88406         1424  Siena/NYT         NaN      NaN   

                         display_name  pollster_rating_id  \
288  The New York Times/Siena College                 448   
28

Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-09-26,16661,Kamala Harris,48.984833
1,2024-09-26,16651,Donald Trump,47.193531
2,2024-09-25,16661,Kamala Harris,48.858304
3,2024-09-25,16651,Donald Trump,48.471012
4,2024-09-20,16661,Kamala Harris,49.58313


In [100]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Wisconsin)',
    width=600,
    height=400
)

chart.show()

In [101]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground North Carolina

In [105]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
#print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="North Carolina"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 2.4].copy()
df_mi = df_mi[df_mi['population']=='lv'].copy()
#print(df_mi.head())

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean()
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-09-29,16661,Kamala Harris,48.174288
1,2024-09-29,16651,Donald Trump,48.811552
2,2024-09-28,16661,Kamala Harris,48.637729
3,2024-09-28,16651,Donald Trump,48.563405
4,2024-09-26,16661,Kamala Harris,48.473694


In [103]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (North Carolina)',
    width=600,
    height=400
)

chart.show()

In [104]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground Georgia

In [127]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .23
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Georgia"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 1.5].copy()
df_mi = df_mi[df_mi['population'] == 'lv'].copy()
print(df_mi.head())

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 30]['pct'].mean()
        c_std =  candidate_data[candidate_data['days_past_index']< 30]['pct'].std()
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.5*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



['Florida' 'North Carolina' 'Illinois' 'Virginia' 'Nevada' 'Minnesota'
 'Pennsylvania' 'Michigan' 'Wisconsin' 'New Jersey' 'Arizona'
 'New Hampshire' 'Georgia' 'Nebraska' 'North Dakota' 'Indiana' 'New York'
 'Ohio' 'South Carolina' 'Alaska' 'Iowa' 'Texas' 'Nebraska CD-2'
 'California' 'Maryland' 'Delaware' 'Montana' 'Rhode Island' 'Connecticut'
 'New Mexico' 'Massachusetts' 'Colorado' 'Maine CD-2' 'Maine' 'Maine CD-1'
 'Missouri' 'Washington' 'Arkansas' 'Oklahoma' 'Tennessee' 'West Virginia'
 'Vermont']
     poll_id  pollster_id                      pollster sponsor_ids  \
57     88532         1562  Redfield & Wilton Strategies        1473   
58     88532         1562  Redfield & Wilton Strategies        1473   
59     88532         1562  Redfield & Wilton Strategies        1473   
56     88532         1562  Redfield & Wilton Strategies        1473   
122    88453          235              InsiderAdvantage         NaN   

          sponsors                  display_name  pollster_ratin

Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-02,16661,Kamala Harris,47.210849
1,2024-10-02,16651,Donald Trump,48.822675
2,2024-09-30,16661,Kamala Harris,47.430742
3,2024-09-30,16651,Donald Trump,49.086084
4,2024-09-29,16661,Kamala Harris,47.269829


In [128]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Georgia)',
    width=600,
    height=400
)

chart.show()

In [129]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground Florida

In [130]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
#print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Florida"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 1.0].copy()
#df_mi = df_mi[df_mi['population'] == 'lv'].copy()
#print(df_mi.head())

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data['pct'].mean()
        c_std =  candidate_data['pct'].std()
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        #print(candidate_data['weight_outlier'])
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-02,16661,Kamala Harris,45.310126
1,2024-10-02,16651,Donald Trump,49.282985
2,2024-09-27,16661,Kamala Harris,45.749591
3,2024-09-27,16651,Donald Trump,49.595694
4,2024-09-26,16661,Kamala Harris,45.366379


In [131]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Florida)',
    width=600,
    height=400
)

chart.show()

#### Battleground Arizona

In [56]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Arizona"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 1.5].copy()
df_mi = df_mi[df_mi['population'] == 'lv'].copy()
print(df_mi.head())

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 50]['pct'].mean()
        c_std =  candidate_data[candidate_data['days_past_index']< 50]['pct'].std()
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



['Florida' 'North Carolina' 'Illinois' 'Virginia' 'Nevada' 'Minnesota'
 'Pennsylvania' 'Michigan' 'Wisconsin' 'New Jersey' 'Arizona'
 'New Hampshire' 'Georgia' 'Nebraska' 'North Dakota' 'Indiana' 'New York'
 'Ohio' 'South Carolina' 'Alaska' 'Iowa' 'Texas' 'Nebraska CD-2'
 'California' 'Maryland' 'Delaware' 'Montana' 'Rhode Island' 'Connecticut'
 'New Mexico' 'Massachusetts' 'Colorado' 'Maine CD-2' 'Maine' 'Maine CD-1'
 'Missouri' 'Washington' 'Arkansas' 'Oklahoma' 'Tennessee' 'West Virginia'
 'Vermont']
    poll_id  pollster_id                      pollster sponsor_ids  \
50    88530         1562  Redfield & Wilton Strategies        1473   
46    88526         1554                  RMG Research        2178   
47    88526         1554                  RMG Research        2178   
48    88530         1562  Redfield & Wilton Strategies        1473   
49    88530         1562  Redfield & Wilton Strategies        1473   

                  sponsors                  display_name  pollster_rat

Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-02,16661,Kamala Harris,47.368758
1,2024-10-02,16651,Donald Trump,48.732373
2,2024-09-30,16661,Kamala Harris,47.592439
3,2024-09-30,16651,Donald Trump,48.707994
4,2024-09-29,16661,Kamala Harris,47.572912


In [57]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Arizona)',
    width=600,
    height=400
)

chart.show()

In [58]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()


#### Battleground Nevada

In [59]:
# Initialize lambda and create an empty DataFrame to store the results
lambda_value = .13
weighted_averages = []

# Define the two candidate IDs of interest
candidate_ids = [16661, 16651]

# Define the cut-off date for polls (July 25, 2024)
cutoff_date = pd.to_datetime('2024-08-10')

# Step 2: Filter to exclude polls with an 'end_date' before July 25, 2024
df_state_not_na_clean_sorted_cutoff = df_state_not_na_clean_sorted[df_state_not_na_clean_sorted['end_date'] >= cutoff_date].copy()
print(df_state_not_na_clean_sorted_cutoff['state'].unique())

df_mi = df_state_not_na_clean_sorted_cutoff[df_state_not_na_clean_sorted_cutoff['state']=="Nevada"].copy()
df_mi = df_mi[df_mi['numeric_grade'] >= 1.7].copy()
df_mi = df_mi[df_mi['population']=='lv'].copy()
print(df_mi.head())

# Iterate through each unique end date
for current_date in df_mi['end_date'].unique():
    
    # Step 2: Filter the data for polls on or before the current end date
    current_data = df_mi[df_mi['end_date'] <= current_date].copy()
    
    # Step 3: Compute 'days_past_index' as the difference between current_date and each poll's end_date
    current_data['days_past_index'] = (current_date - current_data['end_date']).dt.days
    
    # Step 4: Compute 'weight_time' using the formula exp(-lambda * t)
    current_data['weight_time'] = np.exp(-lambda_value * current_data['days_past_index'])

    # Check for NaN entries in the 'weight_time' column
    #nan_weight_time = current_data['weight_time'].isna().sum()
    
    # Print the result
    #print(f"Number of NaN values in 'weight_time': {nan_weight_time}")

    # Step 2: Check for infinite values in the involved columns
    #inf_check = current_data[['weight_time']].isin([np.inf, -np.inf]).sum()
    #print(f"Number of infinite values:\n{inf_check}")
    
    # Step 6: Iterate through each unique candidate to compute the weighted average for that candidate
    for candidate in candidate_ids:
        
        # Step 7: Filter data for the specific candidate
        candidate_data = current_data[current_data['candidate_id'] == candidate].copy()
        c_mean = candidate_data[candidate_data['days_past_index']< 50]['pct'].mean()
        c_std =  candidate_data[candidate_data['days_past_index']< 50]['pct'].std()
        candidate_data['zscores'] = abs((candidate_data['pct']-c_mean)/c_std)
        candidate_data['weight_outlier'] = np.exp(-1.0*candidate_data['zscores'])
        # Step 8: Compute the total weight 'w_i' for each poll
        #candidate_data['weight_mode'] = 1.0  # Assuming 'weight_mode' is a fixed value of 1.0, adjust if needed
        candidate_data.loc[:, 'w_i'] = (
            candidate_data['weight_mode'] * candidate_data['weight_sample'] *
            candidate_data['numeric_grade'] * candidate_data['weight_time'] * candidate_data['weight_outlier']
        )
        # check for NaN entries in the w_i column
        #nan_weight_i = candidate_data['w_i'].isna().sum()
        #print result
        #print(f"Number of NaN value in 'w_i': {nan_weight_i}")
        
        # Step 3: Identify rows with NaN in w_i to examine their individual values
        #nan_rows = candidate_data[candidate_data['w_i'].isna()]
        #print("Rows producing NaN in 'w_i':")
        #print(nan_rows[['methodology','weight_mode', 'weight_sample', 'weight_score', 'weight_time', 'w_i']])
        
        # Step 9: Compute the weighted average of the poll points ('pct') using the computed weights
        weighted_average_pct = np.average(candidate_data['pct'], weights=candidate_data['w_i'])
        
        # Step 10: Store the current date, candidate_id, candidate_name, and the weighted average in a new DataFrame
        if not candidate_data.empty:
            candidate_name = candidate_data['candidate_name'].iloc[0]
            weighted_averages.append({
                'end_date': current_date,
                'candidate_id': candidate,
                'candidate_name': candidate_name,
                'weighted_average_pct': weighted_average_pct
            })

# Convert the results into a new DataFrame
df_weighted_averages = pd.DataFrame(weighted_averages)    
df_weighted_averages.head()



['Florida' 'North Carolina' 'Illinois' 'Virginia' 'Nevada' 'Minnesota'
 'Pennsylvania' 'Michigan' 'Wisconsin' 'New Jersey' 'Arizona'
 'New Hampshire' 'Georgia' 'Nebraska' 'North Dakota' 'Indiana' 'New York'
 'Ohio' 'South Carolina' 'Alaska' 'Iowa' 'Texas' 'Nebraska CD-2'
 'California' 'Maryland' 'Delaware' 'Montana' 'Rhode Island' 'Connecticut'
 'New Mexico' 'Massachusetts' 'Colorado' 'Maine CD-2' 'Maine' 'Maine CD-1'
 'Missouri' 'Washington' 'Arkansas' 'Oklahoma' 'Tennessee' 'West Virginia'
 'Vermont']
     poll_id  pollster_id                      pollster sponsor_ids  \
69     88535         1562  Redfield & Wilton Strategies        1473   
71     88535         1562  Redfield & Wilton Strategies        1473   
70     88535         1562  Redfield & Wilton Strategies        1473   
68     88535         1562  Redfield & Wilton Strategies        1473   
119    88451          235              InsiderAdvantage         NaN   

          sponsors                  display_name  pollster_ratin

Unnamed: 0,end_date,candidate_id,candidate_name,weighted_average_pct
0,2024-10-02,16661,Kamala Harris,48.703397
1,2024-10-02,16651,Donald Trump,47.414009
2,2024-09-30,16661,Kamala Harris,48.916646
3,2024-09-30,16651,Donald Trump,47.557987
4,2024-09-25,16661,Kamala Harris,49.057264


In [60]:
# Pivot the data so that we have both candidates' weighted averages in the same row for each end_date
df_pivot = df_weighted_averages.pivot(index='end_date', columns='candidate_name', values='weighted_average_pct').reset_index()

# Create the line chart with points, showing both candidates' weighted percentages in the tooltip
chart = alt.Chart(df_pivot).transform_fold(
    ['Donald Trump', 'Kamala Harris'],  # Specify the candidates to include
    as_=['candidate', 'weighted_average_pct']  # Assign new names for the folded fields
).mark_line(point=True).encode(
    x='end_date:T',  # Temporal encoding for end_date
    y='weighted_average_pct:Q',  # Quantitative encoding for weighted averages
    color=alt.Color('candidate:N', scale=alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])),  # Custom colors
    

tooltip=['end_date:T', 'Kamala Harris:Q','Donald Trump:Q'] 

 # Tooltip to display both candidates' percentages
).properties(
    title='Weighted Average Polling Results for Trump and Harris Over Time (Nevada)',
    width=600,
    height=400
)

chart.show()

In [61]:



# Define the color scale for the two candidates
color_scale = alt.Scale(domain=['Donald Trump', 'Kamala Harris'], range=['red', 'blue'])

# Step 1: Create a selection brush for the horizontal axis (end_date)
brush = alt.selection_interval(encodings=['x'])  # Brush selection on the x-axis

# Step 2: Create the base chart with points and LOESS smoothing (Chart 1)
base = alt.Chart(df_weighted_averages).mark_point().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale)
)

loess = base.transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line()

chart1 = (base + loess).add_params(
    brush  # Add the brush selection to Chart 1
).properties(
    title='Weighted Average Polling Results with LOESS Smoothing (Selectable)',
    width=600,
    height=400
)

# Step 3: Create the side-by-side chart that will show the selected region with tooltip

# Apply the same selection filter to both points and LOESS in chart2
selected_points = base.encode(
    opacity=alt.condition(brush, alt.value(1), alt.value(0)),  # Highlight only selected points
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']  # Add tooltip for points
).transform_filter(
    brush  # Filter by brush selection
)

# Apply LOESS smoothing for the selected range
selected_loess = alt.Chart(df_weighted_averages).transform_loess(
    'end_date', 'weighted_average_pct', groupby=['candidate_name'], bandwidth=0.3
).mark_line().encode(
    x='end_date:T',
    y='weighted_average_pct:Q',
    color=alt.Color('candidate_name:N', scale=color_scale),
    tooltip=['end_date:T', 'candidate_name:N', 'weighted_average_pct:Q']
).transform_filter(
    brush  # Filter the LOESS line by the brush selection
)

# Combine selected points and LOESS for the second chart
chart2 = (selected_points + selected_loess).properties(
    title='Selected Region with LOESS Smoothing',
    width=600,
    height=400
)

# Combine the two charts side by side
final_chart = alt.hconcat(chart1, chart2)

# Display the final chart
final_chart.show()
