In [None]:
# Preprocess the data for the linear regression model
def regression_preprocessing(df, period_length: int = 5, min_observations: int = 500) -> dict:
    """
    Segment a results DataFrame into fixed-length year periods for regression.

    Every row is assigned to a left-closed period
    ``[start, start + period_length)`` anchored at the earliest year present in
    the data, so a label such as "2000-2004" covers exactly the years 2000
    through 2004 inclusive. A short data-quality summary is printed for each
    period that contains at least one row.

    Note:
        A categorical ``period`` column is added to ``df`` in place. Rows whose
        ``year`` is missing get a missing period and are not reported.

    Args:
        df (pd.DataFrame): Input DataFrame containing results. Must provide a
            numeric, integer-valued ``year`` column and a ``movie_id`` column.
        period_length (int): Number of years covered by each period.
            Defaults to 5.
        min_observations (int): A period is kept only when it has strictly
            more rows than this value. Defaults to 500.

    Returns:
        dict: Maps each period label (e.g. "2000-2004") to a copy of the rows
        belonging to that period, for periods that pass the size threshold.
        Empty when ``df`` has no usable ``year`` values.

    Raises:
        ValueError: If ``period_length`` is less than 1.
    """
    if period_length < 1:
        raise ValueError("period_length must be a positive integer")

    valid_years = df['year'].dropna()
    if valid_years.empty:
        # Nothing to bin: still attach the column so the frame has the same
        # shape of output as in the normal case.
        df['period'] = pd.Categorical([None] * len(df))
        return {}

    # Cast to plain ints so range() also works when the column is float-typed
    # (e.g. because it contains NaN).
    first_year = int(valid_years.min())
    last_year = int(valid_years.max())

    # The final edge must lie strictly above the latest year; with left-closed
    # bins a year sitting exactly on the last edge would otherwise be dropped.
    # This also guarantees at least two edges when all rows share one year.
    edges = list(range(first_year, last_year + period_length + 1, period_length))
    labels = [f"{start}-{start + period_length - 1}" for start in edges[:-1]]

    # right=False makes each bin [start, start + period_length), which is what
    # the "start-end" labels describe and keeps the earliest year in a bin.
    df['period'] = pd.cut(df['year'], bins=edges, labels=labels, right=False)

    # Create dictionary to store period-specific dataframes
    period_dfs = {}

    # dropna() avoids a spurious "Period nan" entry for rows without a year.
    for period in df['period'].dropna().unique():
        period_df = df[df['period'] == period].copy()

        # Basic data quality checks
        print(f"\nPeriod {period}:")
        print(f"Number of observations: {len(period_df)}")
        print(f"Number of unique movies: {period_df['movie_id'].nunique()}")

        # Store the period DataFrame only if it has enough observations for
        # a meaningful analysis.
        if len(period_df) > min_observations:
            period_dfs[period] = period_df
        else:
            print(f"Warning: Period {period} has insufficient data and will be excluded")

    return period_dfs
